SQL’s “Group By” the MarkLogic Way

July 21, 2015 Data & AI, MarkLogic

Coming from an RDBMS background, one of the features I looked for in MarkLogic is the “group by” statement to run aggregate functions based on particular fields. Let’s see how this is done in MarkLogic using both XQuery and server-side JavaScript (SJS).

First, let’s create some person data using this sample data structure:

Person
first_nameJohn
last_nameDoe
age18
genderM
height180.2
civil_statusSingle

We can represent that in MarkLogic as either XML or JSON:

<person>
  <name>
    <first_name>john</first_name>
    <last_name>doe</last_name>
  </name>
  <age>18</age>
  <gender>M</gender>
  <height>180.2</height>
  <civil_status>Single</civil_status>
</person>
{
  "person" : {
    "name" : {
      "first_name" : "john",
      "last_name" : "doe"
    },
    "age" : 18,
    "gender" : "M",
    "height" : 180.2,
    "civil_status": "Single"
  }
}

We create the data in our database using the following:

xdmp:document-insert(
  '/person/doe/john.xml',
  <person>
    <name>
      <first_name>john</first_name>
      <last_name>doe</last_name>
    </name>
    <age>18</age>
    <gender>M</gender>
    <height>180.2</height>
    <civil_status>Single</civil_status>
  </person>
),
xdmp:document-insert(
  '/person/doe/jane.xml',
  <person>
    <name>
      <first_name>jane</first_name>
      <last_name>doe</last_name>
    </name>
    <age>19</age>
    <gender>F</gender>
    <height>172.3</height>
    <civil_status>Single</civil_status>
  </person>
),
xdmp:document-insert(
  '/person/xi/chan.json',
  xdmp:unquote(
    '{
      "person" : {
        "name" : {
          "first_name" : "chan",
          "last_name" : "xi"
        },
        "age" : 17,
        "gender" : "F",
        "height" : 155.1,
        "civil_status": "Single"
      }
    }')
)
declareUpdate();

xdmp.documentInsert(
  '/person/roe/john.xml',
  xdmp.unquote(
    '<person>' +
    '  <name>' +
    '    <first_name>john</first_name>' +
    '    <last_name>roe</last_name>' +
    '  </name>' +
    '  <age>20</age>' +
    '  <gender>M</gender>' +
    '  <height>170.2</height>' +
    '  <civil_status>Married</civil_status>' +
    '</person>')
);

xdmp.documentInsert(
  '/person/roe/jane.json',
  {
    "person" : {
      "name" : {
        "first_name" : "jane",
        "last_name" : "roe"
      },
      "age" : 21,
      "gender" : "F",
      "height" : 162.3,
      "civil_status": "Married"
    }
  }
);

To get a basic gender count like ‘male: 22, female: 31’, in an RDBMS we do this:

select gender, count(1) from person group by gender

Here’s how to achieve this in MarkLogic; note that these snippets work, but aren’t optimized.

for $gender in fn:distinct-values(/person/gender)
let $genderQuery := 
  cts:or-query((
    cts:element-value-query(xs:QName('gender'), $gender),
    cts:json-property-value-query('gender', $gender)
  ))
return $gender || ":" || fn:count(cts:search(fn:doc(),$genderQuery))
var result = {};
var gender;
for (var doc of cts.search(cts.directoryQuery(["/person/"], "infinity"))) {
  if (doc instanceof XMLNode) {
    gender = doc.root.xpath("/person/gender/fn:string()");
  } else {
    gender = doc.root.person.gender;
  }
  result[gender] = (result[gender] ? result[gender] : 0) + 1;
}
result;

fn:distinct-values() and fn:count() always load documents off disk to do their work, which isn’t scalable. Performance gets slower the bigger the data set. To solve this, we can use MarkLogic extension functions that can run out of pure indexes.

We add an ‘element range index’ for ‘gender’:

  1. Go to admin console (localhost:8001)
  2. Go to Configure > Databases > myDB > Element Range Indexes
  3. Add an entry for gender as follows:
    • Scalar type: string
    • localname: gender
    • Collation: you could use the collation builder, just take note of the resulting value and pass that as part of the ‘options’ of ‘cts:element-reference’

With this element range index, we can now use cts:values() and xdmp:estimate() to get a big speed boost.

for $gender in cts:values(cts:element-reference(xs:QName('gender')))
return $gender || ": " || cts:frequency($gender)
var result = {};
for (var gender of cts.values(cts.elementReference(xs.QName('gender')))){
  result[gender] = cts.frequency(gender);
}
result

This code is index optimized, runs in parallel on large clusters, and is amazingly fast. It’s simple to get sub-second answers even against massive data sets. It greatly outperforms typical RDBMS systems.

Next, let’s limit our search to a particular height range.

select gender, count(1) from person where age between 18 and 40 group by gender

In MarkLogic, we represent constraints as ‘queries’, which can be built up from primitives. We first define an age query constraint as shown below. For each unique ‘gender’ value in the database, we apply the ‘$age-query’ to limit the results to those genders appearing in a specific age range. To count the matching documents, we combine the existing ‘$gender-query’ with ‘$age-query’.

let $age-query := cts:and-query((
    cts:element-range-query(xs:QName('age'), ">=", 18),
    cts:element-range-query(xs:QName('age'), "<=", 40)
  ))
for $gender in cts:values(cts:element-reference(xs:QName('gender')), (), (), $age-query)
let $query := cts:and-query((
  $age-query,
  cts:element-value-query(xs:QName('gender'), $gender)
))
return $gender || ": " || xdmp:estimate(cts:search(fn:doc(), $query))
    cts.elementRangeQuery(xs.QName('age'), ">=", 18),
    cts.elementRangeQuery(xs.QName('age'), "<=", 40)
  ]);
var result = {};
for (var gender of cts.values(cts.elementReference(xs.QName('gender')), null, null, query)){
  var query = cts.andQuery([
    ageQuery,
    cts.elementValueQuery(xs.QName('gender'), gender)
  ]);
  result[gender] = cts.estimate(query);
}
result

Let’s get fancier and use two different fields for the “group by”:

select last_name, gender, count(1) from person group by last_name, gender

Similar to gender, we add an element range index for ‘last_name’:

  1. Go to admin console (localhost:8001)
  2. Go to Configure > Databases > myDB > Element Range Indexes
  3. Add an entry for last_name as follows:
    • Scalar type: string
    • localname: last_name

Now our code will look like this:

for $lastName in cts:values(cts:element-reference(xs:QName( 'last_name'))) 
let $lastNameQuery := cts:element-value-query(xs:QName( 'last_name'), $lastName)
for $gender in cts:values(cts:element-reference(xs:QName( 'gender')), (), (), $lastNameQuery)
let $query := cts:and-query((
    $lastNameQuery,
    cts:element-value-query(xs:QName( 'gender'), $gender)
  ))
return $lastName || ":" || $gender || ": " || xdmp:estimate(cts:search(fn:doc(), $query))
var result = {};
for (var lastName of cts.values(cts.elementReference(xs.QName('last_name')))){
  var lastNameQuery = cts.elementValueQuery(xs.QName('last_name'), lastName);
  for (var gender of cts.values(cts.elementReference(xs.QName('gender')), null, null, lastNameQuery)){
    var query = cts.andQuery([
        lastNameQuery,
        cts.elementValueQuery(xs.QName('gender'), gender)
      ]);
    if (result[lastName] == undefined){
      result[lastName] = {};
    }
    result[lastName][gender] = cts.estimate(query);
  }
}
result

That works, but there’s a better way. It’s more elegant and faster to use co-occurences:

for $pair in cts:element-value-co-occurrences(xs:QName( 'last_name'), xs:QName( 'gender'),
    ("item-frequency", "frequency-order"))
return $pair/*:value[1] || " - " || $pair/*:value[2] || " - " || cts:frequency($pair)
var result = {};
for (var pair of cts.elementValueCoOccurrences(xs.QName('last_name'),
    xs.QName('gender'),
    ["item-frequency", "frequency-order"])) {
  if (result[pair[0]] == undefined){
    result[pair[0]] = {};
  }
  result[pair[0]][pair[1]] = cts.frequency(pair);
}
result

It’s also possible to do “n-way” co-occurences. Let’s bring civil_status into the mix as well.

  1. Go to admin console (localhost:8001)
  2. Go to Configure > Databases > myDB > Element Range Indexes
  3. Add an entry for civil_status as follows:
    • Scalar type: string
    • localname: civil_status
    • range value positions: true
  4. Update indexes for ‘gender’ and ‘age’ to set ‘range value positions’ to true.
  5. Use ‘cts:value-tuples’ which support n-size list of indexes.

Now our code looks like this:

for $tuple in cts:value-tuples(
    (
      cts:element-reference(xs:QName( 'civil_status')),
      cts:element-reference(xs:QName( 'last_name')),
      cts:element-reference(xs:QName( 'gender'))
    ),
    ("item-frequency", "frequency-order")
  )
return $tuple[1] || " - " || $tuple[2] || " - " || $tuple[3] || " - " || cts:frequency($tuple)
var result = [];
for (var tuple of cts.valueTuples(
    [
      cts.elementReference(xs.QName('civil_status')),
      cts.elementReference(xs.QName('last_name')),
      cts.elementReference(xs.QName('gender'))
    ],
    ["item-frequency", "frequency-order"]
)){
  var record = {};
  record['week'] = tuple[0];
  record['gender'] = tuple[1];
  record['count'] = cts.frequency(tuple);
  result.push(record);
}
result

We can even rewrite our initial age/gender query using value-tuples:

let $ageQuery := cts:and-query((
    cts:element-range-query(xs:QName('age'), ">=", 18),
    cts:element-range-query(xs:QName('age'), "<=", 40)
  ))
for $tuple in cts:value-tuples(
  cts:element-reference(xs:QName( 'gender')),
  ("item-frequency", "frequency-order"),
  $ageQuery)
return $tuple[1] || " - " || cts:frequency($tuple)
var result = [];
var ageQuery = cts.andQuery([
    cts.elementRangeQuery(xs.QName('age'), ">=", 18),
    cts.elementRangeQuery(xs.QName('age'), "<=", 40)
  ]);
for (var tuple of cts.valueTuples(
    cts.elementReference(xs.QName('gender')),
    ["item-frequency", "frequency-order"],
    ageQuery
)){
  var record = {};
  record['gender'] = tuple[0];
  record['count'] = cts.frequency(tuple);
  result.push(record);
}
result

As a final thought, we don’t just have to count results. There’s a long list of aggregate functions available. For example, this query shows the average height grouped by age and gender:

for $tuple in cts:value-tuples(
  (
    cts:element-reference(xs:QName( 'age')),
    cts:element-reference(xs:QName( 'gender'))
  ),
  ("item-frequency", "frequency-order"))
let $query  := cts:and-query((
  cts:element-value-query(xs:QName( 'age'), xs:string($tuple[1])),
  cts:element-value-query(xs:QName( 'gender'), xs:string($tuple[2]))
))
return $tuple[1] || " - " || $tuple[2] || " - " || cts:avg-aggregate(cts:element-reference(xs:QName( 'height')), (), $query)
var result = [];
for (var tuple of cts.valueTuples(
    [
      cts.elementReference(xs.QName('age')),
      cts.elementReference(xs.QName('gender'))
    ],
    ["item-frequency", "frequency-order"]
)){
var query = cts.andQuery([
    cts.elementValueQuery(xs.QName('age'), xs.string(tuple[0])),
    cts.elementValueQuery(xs.QName('gender'), xs.string(tuple[1]))
  ]);
  var record = {};
  record['age'] = tuple[0];
  record['gender'] = tuple[1];
  record['averageHeight'] = cts.avgAggregate(cts.elementReference(xs.QName('height')), [], query);
  result.push(record);
}
result

Hope this helps you as you do grouped aggregates in MarkLogic!

Gabo Manuel