From a21d11ddea8f3e0e089a60cff60d219d20fc772d Mon Sep 17 00:00:00 2001
From: Lloyd Tabb
Date: Mon, 16 Dec 2024 15:36:53 -0800
Subject: [PATCH] added some basic hyperloglog functions

---
Summary of this patch (text in this note area is ignored by `git am`):

  * Adds five function blueprints to the Trino dialect:
      hll_accumulate  aggregate, implemented with APPROX_SET; two overloads,
                      `default` (value) and `with_percent` (value, accuracy)
      hll_combine     aggregate, implemented with MERGE
      hll_estimate    scalar, implemented with CARDINALITY, returns a number
      hll_export      scalar, CAST(value AS VARBINARY)
      hll_import      scalar, CAST(value AS HyperLogLog)
    All sketch-valued parameters and results are declared as 'string' in
    the blueprints.
  * Registers the five blueprints in TRINO_DIALECT_FUNCTIONS.
  * Adds three tests (basic, combine, import/export) that query
    malloytest.state_facts.

NOTE(review): this copy of the patch was recovered from a version whose
line breaks and indentation had been collapsed. Line structure was restored
so that every hunk matches its @@ line counts; indentation was
reconstructed assuming a 2-space Prettier layout. Confirm context-line
whitespace against the target tree before applying, or apply with
`git apply --ignore-whitespace`.

 .../src/dialect/trino/dialect_functions.ts    | 69 +++++++++++++++++++
 .../presto-trino/presto-trino.spec.ts         | 27 ++++++++
 2 files changed, 96 insertions(+)

diff --git a/packages/malloy/src/dialect/trino/dialect_functions.ts b/packages/malloy/src/dialect/trino/dialect_functions.ts
index cf66e1b7f..0ce3f7ce5 100644
--- a/packages/malloy/src/dialect/trino/dialect_functions.ts
+++ b/packages/malloy/src/dialect/trino/dialect_functions.ts
@@ -77,6 +77,70 @@ const count_approx: DefinitionBlueprint = {
   isSymmetric: true,
 };
 
+const hll_accumulate: OverloadedDefinitionBlueprint = {
+  default: {
+    generic: [
+      'T',
+      ['string', 'number', 'date', 'timestamp', 'boolean', 'json'],
+    ],
+    takes: {'value': {dimension: {generic: 'T'}}},
+    returns: {measure: 'string'},
+    isSymmetric: true,
+    impl: {
+      function: 'APPROX_SET',
+    },
+  },
+  with_percent: {
+    generic: [
+      'T',
+      ['string', 'number', 'date', 'timestamp', 'boolean', 'json'],
+    ],
+    takes: {'value': {dimension: {generic: 'T'}}, 'accuracy': 'number'},
+    returns: {measure: 'string'},
+    isSymmetric: true,
+    impl: {
+      function: 'APPROX_SET',
+    },
+  },
+};
+
+const hll_combine: DefinitionBlueprint = {
+  takes: {
+    'value': 'string',
+  },
+  returns: {measure: 'string'},
+  impl: {function: 'MERGE'},
+  isSymmetric: true,
+};
+
+const hll_estimate: DefinitionBlueprint = {
+  takes: {
+    'value': 'string',
+  },
+  returns: {dimension: 'number'},
+  impl: {function: 'CARDINALITY'},
+};
+
+const hll_export: DefinitionBlueprint = {
+  takes: {
+    'value': 'string',
+  },
+  returns: {dimension: 'string'},
+  impl: {
+    sql: 'CAST(${value} AS VARBINARY)',
+  },
+};
+
+const hll_import: DefinitionBlueprint = {
+  takes: {
+    'value': 'string',
+  },
+  returns: {dimension: 'string'},
+  impl: {
+    sql: 'CAST(${value} AS HyperLogLog)',
+  },
+};
+
 const max_by: DefinitionBlueprint = {
   generic: ['T', ['string', 'number', 'date', 'timestamp', 'boolean', 'json']],
   takes: {
@@ -294,6 +358,8 @@ export const TRINO_DIALECT_FUNCTIONS: DefinitionBlueprintMap = {
   bool_or,
   corr,
   count_approx,
+  hll_accumulate,
+  hll_combine,
   max_by,
   min_by,
   string_agg,
@@ -306,6 +372,9 @@ export const TRINO_DIALECT_FUNCTIONS: DefinitionBlueprintMap = {
   date_format,
   date_parse,
   from_unixtime,
+  hll_estimate,
+  hll_export,
+  hll_import,
   json_extract_scalar,
   regexp_like,
   regexp_replace,
diff --git a/test/src/databases/presto-trino/presto-trino.spec.ts b/test/src/databases/presto-trino/presto-trino.spec.ts
index edd759151..b8c9979fa 100644
--- a/test/src/databases/presto-trino/presto-trino.spec.ts
+++ b/test/src/databases/presto-trino/presto-trino.spec.ts
@@ -285,6 +285,33 @@ describe.each(runtimes.runtimeList)(
       ]);
     });
 
+    it(`hyperloglog basic - ${databaseName}`, async () => {
+      await expect(`run: ${databaseName}.table('malloytest.state_facts') -> {
+        aggregate:
+          m1 is floor(hll_estimate(hll_accumulate(state))/10)
+      }`).malloyResultMatches(runtime, {m1: 5});
+    });
+
+    it(`hyperloglog combine - ${databaseName}`, async () => {
+      await expect(`run: ${databaseName}.table('malloytest.state_facts') -> {
+        group_by: state
+        aggregate: names_hll is hll_accumulate(popular_name)
+      } -> {
+        aggregate: name_count is hll_estimate(hll_combine(names_hll))
+      }
+      `).malloyResultMatches(runtime, {name_count: 6});
+    });
+
+    it(`hyperloglog import/export - ${databaseName}`, async () => {
+      await expect(`run: ${databaseName}.table('malloytest.state_facts') -> {
+        group_by: state
+        aggregate: names_hll is hll_export(hll_accumulate(popular_name))
+      } -> {
+        aggregate: name_count is hll_estimate(hll_combine(hll_import(names_hll)))
+      }
+      `).malloyResultMatches(runtime, {name_count: 6});
+    });
+
     it(`runs the url_extract functions - ${databaseName}`, async () => {
       await expect(`
         run: ${databaseName}.sql(