Coming from an RDBMS background, one of the features I looked for in MarkLogic is the “group by” statement to run aggregate functions based on particular fields. Let’s see how this is done in MarkLogic using both XQuery and server-side JavaScript (SJS).
First, let’s create some person data using this sample data structure:
Person | |
first_name | John |
last_name | Doe |
age | 18 |
gender | M |
height | 180.2 |
civil_status | Single |
We can represent that in MarkLogic as either XML or JSON:
<person> <name> <first_name>john</first_name> <last_name>doe</last_name> </name> <age>18</age> <gender>M</gender> <height>180.2</height> <civil_status>Single</civil_status> </person>
{ "person" : { "name" : { "first_name" : "john", "last_name" : "doe" }, "age" : 18, "gender" : "M", "height" : 180.2, "civil_status": "Single" } }
We create the data in our database using the following:
xdmp:document-insert( '/person/doe/john.xml', <person> <name> <first_name>john</first_name> <last_name>doe</last_name> </name> <age>18</age> <gender>M</gender> <height>180.2</height> <civil_status>Single</civil_status> </person> ), xdmp:document-insert( '/person/doe/jane.xml', <person> <name> <first_name>jane</first_name> <last_name>doe</last_name> </name> <age>19</age> <gender>F</gender> <height>172.3</height> <civil_status>Single</civil_status> </person> ), xdmp:document-insert( '/person/xi/chan.json', xdmp:unquote( '{ "person" : { "name" : { "first_name" : "chan", "last_name" : "xi" }, "age" : 17, "gender" : "F", "height" : 155.1, "civil_status": "Single" } }') )
declareUpdate(); xdmp.documentInsert( '/person/roe/john.xml', xdmp.unquote( '<person>' + ' <name>' + ' <first_name>john</first_name>' + ' <last_name>roe</last_name>' + ' </name>' + ' <age>20</age>' + ' <gender>M</gender>' + ' <height>170.2</height>' + ' <civil_status>Married</civil_status>' + '</person>') ); xdmp.documentInsert( '/person/roe/jane.json', { "person" : { "name" : { "first_name" : "jane", "last_name" : "roe" }, "age" : 21, "gender" : "F", "height" : 162.3, "civil_status": "Married" } } );
To get a basic gender count like ‘male: 22, female: 31’, in an RDBMS we do this:
select gender, count(1) from person group by gender
Here’s how to achieve this in MarkLogic; note that these snippets work, but aren’t optimized.
for $gender in fn:distinct-values(/person/gender) let $genderQuery := cts:or-query(( cts:element-value-query(xs:QName('gender'), $gender), cts:json-property-value-query('gender', $gender) )) return $gender || ":" || fn:count(cts:search(fn:doc(),$genderQuery))
var result = {}; var gender; for (var doc of cts.search(cts.directoryQuery(["/person/"], "infinity"))) { if (doc instanceof XMLNode) { gender = doc.root.xpath("/person/gender/fn:string()"); } else { gender = doc.root.person.gender; } result[gender] = (result[gender] ? result[gender] : 0) + 1; } result;
fn:distinct-values()
and fn:count()
always load documents off disk to do their work, which isn’t scalable. Performance gets slower the bigger the data set. To solve this, we can use MarkLogic extension functions that can run out of pure indexes.
We add an ‘element range index’ for ‘gender’:
With this element range index, we can now use cts:values()
and xdmp:estimate()
to get a big speed boost.
for $gender in cts:values(cts:element-reference(xs:QName('gender'))) return $gender || ": " || cts:frequency($gender)
var result = {}; for (var gender of cts.values(cts.elementReference(xs.QName('gender')))){ result[gender] = cts.frequency(gender); } result
This code is index optimized, runs in parallel on large clusters, and is amazingly fast. It’s simple to get sub-second answers even against massive data sets. It greatly outperforms typical RDBMS systems.
Next, let’s limit our search to a particular height range.
select gender, count(1) from person where age between 18 and 40 group by gender
In MarkLogic, we represent constraints as ‘queries’, which can be built up from primitives. We first define an age query constraint as shown below. For each unique ‘gender’ value in the database, we apply the ‘$age-query’ to limit the results to those genders appearing in a specific age range. To count the matching documents, we combine the existing ‘$gender-query’ with ‘$age-query’.
let $age-query := cts:and-query(( cts:element-range-query(xs:QName('age'), ">=", 18), cts:element-range-query(xs:QName('age'), "<=", 40) )) for $gender in cts:values(cts:element-reference(xs:QName('gender')), (), (), $age-query) let $query := cts:and-query(( $age-query, cts:element-value-query(xs:QName('gender'), $gender) )) return $gender || ": " || xdmp:estimate(cts:search(fn:doc(), $query))
cts.elementRangeQuery(xs.QName('age'), ">=", 18), cts.elementRangeQuery(xs.QName('age'), "<=", 40) ]); var result = {}; for (var gender of cts.values(cts.elementReference(xs.QName('gender')), null, null, query)){ var query = cts.andQuery([ ageQuery, cts.elementValueQuery(xs.QName('gender'), gender) ]); result[gender] = cts.estimate(query); } result
Let’s get fancier and use two different fields for the “group by”:
select last_name, gender, count(1) from person group by last_name, gender
Similar to gender, we add an element range index for ‘last_name’:
Now our code will look like this:
for $lastName in cts:values(cts:element-reference(xs:QName( 'last_name'))) let $lastNameQuery := cts:element-value-query(xs:QName( 'last_name'), $lastName) for $gender in cts:values(cts:element-reference(xs:QName( 'gender')), (), (), $lastNameQuery) let $query := cts:and-query(( $lastNameQuery, cts:element-value-query(xs:QName( 'gender'), $gender) )) return $lastName || ":" || $gender || ": " || xdmp:estimate(cts:search(fn:doc(), $query))
var result = {}; for (var lastName of cts.values(cts.elementReference(xs.QName('last_name')))){ var lastNameQuery = cts.elementValueQuery(xs.QName('last_name'), lastName); for (var gender of cts.values(cts.elementReference(xs.QName('gender')), null, null, lastNameQuery)){ var query = cts.andQuery([ lastNameQuery, cts.elementValueQuery(xs.QName('gender'), gender) ]); if (result[lastName] == undefined){ result[lastName] = {}; } result[lastName][gender] = cts.estimate(query); } } result
That works, but there’s a better way. It’s more elegant and faster to use co-occurences:
for $pair in cts:element-value-co-occurrences(xs:QName( 'last_name'), xs:QName( 'gender'), ("item-frequency", "frequency-order")) return $pair/*:value[1] || " - " || $pair/*:value[2] || " - " || cts:frequency($pair)
var result = {}; for (var pair of cts.elementValueCoOccurrences(xs.QName('last_name'), xs.QName('gender'), ["item-frequency", "frequency-order"])) { if (result[pair[0]] == undefined){ result[pair[0]] = {}; } result[pair[0]][pair[1]] = cts.frequency(pair); } result
It’s also possible to do “n-way” co-occurences. Let’s bring civil_status into the mix as well.
Now our code looks like this:
for $tuple in cts:value-tuples( ( cts:element-reference(xs:QName( 'civil_status')), cts:element-reference(xs:QName( 'last_name')), cts:element-reference(xs:QName( 'gender')) ), ("item-frequency", "frequency-order") ) return $tuple[1] || " - " || $tuple[2] || " - " || $tuple[3] || " - " || cts:frequency($tuple)
var result = []; for (var tuple of cts.valueTuples( [ cts.elementReference(xs.QName('civil_status')), cts.elementReference(xs.QName('last_name')), cts.elementReference(xs.QName('gender')) ], ["item-frequency", "frequency-order"] )){ var record = {}; record['week'] = tuple[0]; record['gender'] = tuple[1]; record['count'] = cts.frequency(tuple); result.push(record); } result
We can even rewrite our initial age/gender query using value-tuples:
let $ageQuery := cts:and-query(( cts:element-range-query(xs:QName('age'), ">=", 18), cts:element-range-query(xs:QName('age'), "<=", 40) )) for $tuple in cts:value-tuples( cts:element-reference(xs:QName( 'gender')), ("item-frequency", "frequency-order"), $ageQuery) return $tuple[1] || " - " || cts:frequency($tuple)
var result = []; var ageQuery = cts.andQuery([ cts.elementRangeQuery(xs.QName('age'), ">=", 18), cts.elementRangeQuery(xs.QName('age'), "<=", 40) ]); for (var tuple of cts.valueTuples( cts.elementReference(xs.QName('gender')), ["item-frequency", "frequency-order"], ageQuery )){ var record = {}; record['gender'] = tuple[0]; record['count'] = cts.frequency(tuple); result.push(record); } result
As a final thought, we don’t just have to count results. There’s a long list of aggregate functions available. For example, this query shows the average height grouped by age and gender:
for $tuple in cts:value-tuples( ( cts:element-reference(xs:QName( 'age')), cts:element-reference(xs:QName( 'gender')) ), ("item-frequency", "frequency-order")) let $query := cts:and-query(( cts:element-value-query(xs:QName( 'age'), xs:string($tuple[1])), cts:element-value-query(xs:QName( 'gender'), xs:string($tuple[2])) )) return $tuple[1] || " - " || $tuple[2] || " - " || cts:avg-aggregate(cts:element-reference(xs:QName( 'height')), (), $query)
var result = []; for (var tuple of cts.valueTuples( [ cts.elementReference(xs.QName('age')), cts.elementReference(xs.QName('gender')) ], ["item-frequency", "frequency-order"] )){ var query = cts.andQuery([ cts.elementValueQuery(xs.QName('age'), xs.string(tuple[0])), cts.elementValueQuery(xs.QName('gender'), xs.string(tuple[1])) ]); var record = {}; record['age'] = tuple[0]; record['gender'] = tuple[1]; record['averageHeight'] = cts.avgAggregate(cts.elementReference(xs.QName('height')), [], query); result.push(record); } result
Hope this helps you as you do grouped aggregates in MarkLogic!
Subscribe to get all the news, info and tutorials you need to build better business apps and sites