From 50b3db537ed84a708bbd20024c18f937eb49e3a3 Mon Sep 17 00:00:00 2001 From: Will Scullin Date: Mon, 11 Mar 2024 17:59:11 -0700 Subject: [PATCH] Motherduck wasm (#1640) Add support for motherduck in wasm build. Add tests for node motherduck. --- .github/workflows/db-motherduck.yaml | 29 +++++++ jest.config.js | 13 ++- package-lock.json | 37 +++++--- packages/malloy-db-duckdb/package.json | 7 +- .../malloy-db-duckdb/src/duckdb_common.ts | 3 + .../malloy-db-duckdb/src/duckdb_connection.ts | 2 - .../src/duckdb_wasm_connection.ts | 45 ++++++++-- .../src/duckdb_wasm_connection_browser.ts | 85 ++++++++++++++++++- .../src/duckdb_wasm_connection_node.ts | 8 +- test/src/databases/all/nomodel.spec.ts | 2 + test/src/runtimes.ts | 23 ++--- 11 files changed, 207 insertions(+), 47 deletions(-) create mode 100644 .github/workflows/db-motherduck.yaml diff --git a/.github/workflows/db-motherduck.yaml b/.github/workflows/db-motherduck.yaml new file mode 100644 index 000000000..ad03b737d --- /dev/null +++ b/.github/workflows/db-motherduck.yaml @@ -0,0 +1,29 @@ +name: MotherDuck + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + + strategy: + matrix: + node-version: [18.x] + + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Use Node.js ${{ matrix.node-version }} + uses: actions/setup-node@v4 + with: + node-version: ${{ matrix.node-version }} + - name: npm install, build, and test + run: | + npm ci --loglevel error + npm run build + npm run test-silent + env: + CI: true + MALLOY_DATABASES: motherduck + MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }} diff --git a/jest.config.js b/jest.config.js index 2c457c474..1219f13c5 100644 --- a/jest.config.js +++ b/jest.config.js @@ -23,14 +23,21 @@ process.env.TZ = 'America/Los_Angeles'; +const transformIgnoreModules = [ + 'lit-html', + 'lit-element', + 'lit', + '@lit', + '@lit-labs', + '@motherduck/wasm-client', +].join('|'); + module.exports = { moduleFileExtensions: ['js', 'jsx', 'ts', 'tsx'], 
setupFilesAfterEnv: ['jest-expect-message'], testMatch: ['**/?(*.)spec.(ts|js)?(x)'], testPathIgnorePatterns: ['/node_modules/', '/dist/', '/out/'], - transformIgnorePatterns: [ - 'node_modules/(?!(lit-html|lit-element|lit|@lit|@lit-labs)/)', - ], + transformIgnorePatterns: [`node_modules/(?!(${transformIgnoreModules})/)`], transform: { '^.+\\.(ts|tsx)$': ['ts-jest', {tsconfig: '/tsconfig.json'}], '^.+\\.(js|jsx)$': [ diff --git a/package-lock.json b/package-lock.json index e51ff1904..9620cfe62 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3434,6 +3434,14 @@ "node": ">=10.0.0" } }, + "node_modules/@duckdb/duckdb-wasm": { + "version": "1.28.1-dev106.0", + "resolved": "https://registry.npmjs.org/@duckdb/duckdb-wasm/-/duckdb-wasm-1.28.1-dev106.0.tgz", + "integrity": "sha512-HcA9q/Yq1t8nAIg2rl8DmOTjKy1tAHSdBGHlCcWAm5StsfAjcm+f0STBEH3hmWPk0qEtOJF30OR+GfeyUOP+hA==", + "dependencies": { + "apache-arrow": "^14.0.1" + } + }, "node_modules/@emotion/use-insertion-effect-with-fallbacks": { "version": "1.0.1", "dev": true, @@ -5308,14 +5316,6 @@ "resolved": "packages/malloy-db-snowflake", "link": true }, - "node_modules/@malloydata/duckdb-wasm": { - "version": "0.0.6", - "resolved": "https://registry.npmjs.org/@malloydata/duckdb-wasm/-/duckdb-wasm-0.0.6.tgz", - "integrity": "sha512-jy+gIP8ITUnDsF8HhI+WMFQRzkGRx/vLxmWA/dA+DI7dbk9MlCnCR/U0XoOHOIyRlpCIVCHfb9L1EO0YGJHooA==", - "dependencies": { - "apache-arrow": "^13.0.0" - } - }, "node_modules/@malloydata/eslint-plugin-lint": { "resolved": "packages/malloy-lint", "link": true @@ -5424,6 +5424,14 @@ "react": ">=16" } }, + "node_modules/@motherduck/wasm-client": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/@motherduck/wasm-client/-/wasm-client-0.4.0.tgz", + "integrity": "sha512-NNzs/DUQOpVZPCthbO3ka8PCdfBQN7cJXpNr+7kGbnJRqDaA/4Tpc99M5yl8wOKrYGwCZO36hIwe1+e+nRs8zQ==", + "peerDependencies": { + "apache-arrow": "^14.0.x" + } + }, "node_modules/@ndelangen/get-tarball": { "version": "3.0.9", "dev": true, 
@@ -10991,9 +10999,9 @@ } }, "node_modules/apache-arrow": { - "version": "13.0.0", - "resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-13.0.0.tgz", - "integrity": "sha512-3gvCX0GDawWz6KFNC28p65U+zGh/LZ6ZNKWNu74N6CQlKzxeoWHpi4CgEQsgRSEMuyrIIXi1Ea2syja7dwcHvw==", + "version": "14.0.2", + "resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-14.0.2.tgz", + "integrity": "sha512-EBO2xJN36/XoY81nhLcwCJgFwkboDZeyNQ+OPsG7bCoQjc2BT0aTyH/MR6SrL+LirSNz+cYqjGRlupMMlP1aEg==", "dependencies": { "@types/command-line-args": "5.2.0", "@types/command-line-usage": "5.0.2", @@ -28122,9 +28130,10 @@ "version": "0.0.130", "license": "MIT", "dependencies": { - "@malloydata/duckdb-wasm": "0.0.6", - "@malloydata/malloy": "^0.0.130", - "apache-arrow": "^13.0.0", + "@duckdb/duckdb-wasm": "1.28.1-dev106.0", + "@malloydata/malloy": "0.0.130", + "@motherduck/wasm-client": "^0.4.0", + "apache-arrow": "^14.0.0", "duckdb": "0.9.2", "web-worker": "^1.2.0" }, diff --git a/packages/malloy-db-duckdb/package.json b/packages/malloy-db-duckdb/package.json index 7c9468247..0922eb019 100644 --- a/packages/malloy-db-duckdb/package.json +++ b/packages/malloy-db-duckdb/package.json @@ -40,9 +40,10 @@ "prepublishOnly": "npm run build" }, "dependencies": { - "@malloydata/duckdb-wasm": "0.0.6", - "@malloydata/malloy": "^0.0.130", - "apache-arrow": "^13.0.0", + "@duckdb/duckdb-wasm": "1.28.1-dev106.0", + "@malloydata/malloy": "0.0.130", + "@motherduck/wasm-client": "^0.4.0", + "apache-arrow": "^14.0.0", "duckdb": "0.9.2", "web-worker": "^1.2.0" } diff --git a/packages/malloy-db-duckdb/src/duckdb_common.ts b/packages/malloy-db-duckdb/src/duckdb_common.ts index fe0814aa7..e45fd0344 100644 --- a/packages/malloy-db-duckdb/src/duckdb_common.ts +++ b/packages/malloy-db-duckdb/src/duckdb_common.ts @@ -54,6 +54,9 @@ const unquoteName = (name: string) => { export abstract class DuckDBCommon implements TestableConnection, PersistSQLResults, StreamingConnection { + protected isMotherDuck = 
false; + protected motherDuckToken: string | undefined; + private readonly dialect = new DuckDBDialect(); static DEFAULT_QUERY_OPTIONS: DuckDBQueryOptions = { rowLimit: 10, diff --git a/packages/malloy-db-duckdb/src/duckdb_connection.ts b/packages/malloy-db-duckdb/src/duckdb_connection.ts index 252102f72..e867b6d5b 100644 --- a/packages/malloy-db-duckdb/src/duckdb_connection.ts +++ b/packages/malloy-db-duckdb/src/duckdb_connection.ts @@ -56,8 +56,6 @@ export class DuckDBConnection extends DuckDBCommon { private additionalExtensions: string[] = []; private databasePath = ':memory:'; private workingDirectory = '.'; - private isMotherDuck = false; - private motherDuckToken: string | undefined; private readOnly = false; connecting: Promise; diff --git a/packages/malloy-db-duckdb/src/duckdb_wasm_connection.ts b/packages/malloy-db-duckdb/src/duckdb_wasm_connection.ts index 5f2c21e0b..6cf9ca650 100644 --- a/packages/malloy-db-duckdb/src/duckdb_wasm_connection.ts +++ b/packages/malloy-db-duckdb/src/duckdb_wasm_connection.ts @@ -21,7 +21,7 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -import * as duckdb from '@malloydata/duckdb-wasm'; +import * as duckdb from '@duckdb/duckdb-wasm'; import Worker from 'web-worker'; import { FetchSchemaOptions, @@ -38,6 +38,8 @@ import {DuckDBCommon} from './duckdb_common'; const TABLE_MATCH = /FROM\s*('([^']*)'|"([^"]*)")/gi; const TABLE_FUNCTION_MATCH = /FROM\s+[a-z0-9_]+\(('([^']*)'|"([^"]*)")/gi; +const FILE_EXTS = ['.csv', '.tsv', '.parquet'] as const; + /** * Arrow's toJSON() doesn't really do what I'd expect, since * it still includes Arrow objects like DecimalBigNums and Vectors, @@ -47,7 +49,7 @@ const TABLE_FUNCTION_MATCH = /FROM\s+[a-z0-9_]+\(('([^']*)'|"([^"]*)")/gi; * @return Vanilla Javascript value */ // eslint-disable-next-line @typescript-eslint/no-explicit-any -const unwrapArrow = (value: unknown): any => { +export const unwrapArrow = (value: unknown): any => { if (value === null) { return value; } else if (value instanceof Vector) { @@ -67,6 +69,12 @@ const unwrapArrow = (value: unknown): any => { return Number(obj[Symbol.toPrimitive]()); } else if (Array.isArray(value)) { return value.map(unwrapArrow); + } else if (obj['microseconds'] !== undefined && obj['timezone'] === null) { + // Convert epoch µs to ms + return Number(obj['microseconds']) / 1000; + } else if (typeof obj['days'] === 'number') { + // Convert epoch day to Date + return new Date(obj['days'] * 8.64e7); } else { // eslint-disable-next-line @typescript-eslint/no-explicit-any const result: Record = {}; @@ -88,7 +96,7 @@ const unwrapArrow = (value: unknown): any => { * For some reason a custom replacer only sees DecimalBigNums as * strings, as well.
*/ -const unwrapRow = (row: StructRow): QueryDataRow => { +export const unwrapRow = (row: StructRow): QueryDataRow => { return unwrapArrow(row.toJSON()); }; @@ -106,10 +114,13 @@ type RemoteFileCallback = ( ) => Promise; export interface DuckDBWasmOptions extends ConnectionConfig { + additionalExtensions?: string[]; databasePath?: string; + motherDuckToken?: string | undefined; workingDirectory?: string; } export abstract class DuckDBWASMConnection extends DuckDBCommon { + private additionalExtensions: string[] = []; public readonly name: string; private databasePath: string | null = null; protected workingDirectory = '/'; @@ -158,11 +169,21 @@ export abstract class DuckDBWASMConnection extends DuckDBCommon { if (typeof arg.workingDirectory === 'string') { this.workingDirectory = arg.workingDirectory; } + if (typeof arg.motherDuckToken === 'string') { + this.motherDuckToken = arg.motherDuckToken; + } + if (Array.isArray(arg.additionalExtensions)) { + this.additionalExtensions = arg.additionalExtensions; + } } + this.isMotherDuck = + this.databasePath?.startsWith('md:') || + this.databasePath?.startsWith('motherduck:') || + false; this.connecting = this.init(); } - private async init(): Promise { + protected async init(): Promise { // Select a bundle based on browser checks const bundle = await duckdb.selectBundle(this.getBundles()); @@ -219,10 +240,11 @@ export abstract class DuckDBWASMConnection extends DuckDBCommon { `SET FILE_SEARCH_PATH='${this.workingDirectory}'` ); } - // Not quite ready for prime time - // for (const ext of ['json', 'httpfs', 'icu']) { - // await this.loadExtension(ext); - // } + const extensions = ['json', 'icu', ...this.additionalExtensions]; + + for (const ext of extensions) { + await this.loadExtension(ext); + } const setupCmds = ["SET TimeZone='UTC'"]; for (const cmd of setupCmds) { try { @@ -327,6 +349,13 @@ export abstract class DuckDBWASMConnection extends DuckDBCommon { await this.setup(); for (const tablePath of tables) { + if ( +
this.isMotherDuck && + !tablePath.includes('/') && + !FILE_EXTS.some(ext => tablePath.endsWith(ext)) + ) { + continue; + } // http and s3 urls are handled by duckdb-wasm if (tablePath.match(/^https?:\/\//)) { continue; diff --git a/packages/malloy-db-duckdb/src/duckdb_wasm_connection_browser.ts b/packages/malloy-db-duckdb/src/duckdb_wasm_connection_browser.ts index c2420ab7c..5cf3c8907 100644 --- a/packages/malloy-db-duckdb/src/duckdb_wasm_connection_browser.ts +++ b/packages/malloy-db-duckdb/src/duckdb_wasm_connection_browser.ts @@ -21,14 +21,95 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -import * as duckdb from '@malloydata/duckdb-wasm'; -import {DuckDBWASMConnection as DuckDBWASMConnectionBase} from './duckdb_wasm_connection'; +import * as duckdb from '@duckdb/duckdb-wasm'; +import { + DuckDBWASMConnection as DuckDBWASMConnectionBase, + unwrapArrow, +} from './duckdb_wasm_connection'; +import {MDConnection} from '@motherduck/wasm-client'; +import {QueryDataRow} from '@malloydata/malloy'; export class DuckDBWASMConnection extends DuckDBWASMConnectionBase { + protected _mdConnection: MDConnection | null = null; + getBundles(): duckdb.DuckDBBundles { return duckdb.getJsDelivrBundles(); } + override async init(): Promise { + if (this.isMotherDuck) { + if (!this.motherDuckToken) { + throw new Error('Please set your MotherDuck token'); + } + const mdConnection = MDConnection.create({ + mdToken: this.motherDuckToken, + }); + await mdConnection.isInitialized(); + this._mdConnection = mdConnection; + console.info('MotherDuck initialized'); + } else { + await super.init(); + } + } + + override async setup(): Promise { + if (this.isMotherDuck) { + const doSetup = async () => { + const setupCmds = ["SET TimeZone='UTC'"]; + for (const cmd of setupCmds) { + try { + await this.runDuckDBQuery(cmd); + } catch (error) { + // eslint-disable-next-line no-console + console.error(`duckdb setup ${cmd} => ${error}`); + } + } + }; + await this.connecting; + if
(!this.isSetup) { + this.isSetup = doSetup(); + } + await this.isSetup; + } else { + await super.setup(); + } + } + + protected override async runDuckDBQuery( + sql: string, + abortSignal?: AbortSignal + ): Promise<{rows: QueryDataRow[]; totalRows: number}> { + if (this.isMotherDuck) { + if (this._mdConnection) { + const connection = this._mdConnection; + let queryId: string | undefined = undefined; + const cancel = () => { + if (queryId) { + connection.cancelQuery(queryId, 'Cancelled'); + } + }; + abortSignal?.addEventListener('abort', cancel); + queryId = connection.enqueueQuery(sql); + if (queryId) { + const result = await connection.evaluateQueuedQuery(queryId); + if (result?.data) { + const rows = unwrapArrow(result.data.toRows()); + const totalRows = result.data.rowCount; + return { + rows, + totalRows, + }; + } + throw new Error('No data'); + } + throw new Error('Failed to enqueue query'); + } + throw new Error('MotherDuck not initialized'); + } else { + return super.runDuckDBQuery(sql, abortSignal); + } + } + async createHash(sqlCommand: string): Promise { const msgUint8 = new TextEncoder().encode(sqlCommand); const hashBuffer = await crypto.subtle.digest('SHA-256', msgUint8); diff --git a/packages/malloy-db-duckdb/src/duckdb_wasm_connection_node.ts b/packages/malloy-db-duckdb/src/duckdb_wasm_connection_node.ts index 13d1597a4..0089f676d 100644 --- a/packages/malloy-db-duckdb/src/duckdb_wasm_connection_node.ts +++ b/packages/malloy-db-duckdb/src/duckdb_wasm_connection_node.ts @@ -22,18 +22,18 @@ */ import crypto from 'crypto'; -import {DuckDBBundles} from '@malloydata/duckdb-wasm'; +import {DuckDBBundles} from '@duckdb/duckdb-wasm'; import {DuckDBWASMConnection as DuckDBWASMConnectionBase} from './duckdb_wasm_connection'; export class DuckDBWASMConnection extends DuckDBWASMConnectionBase { getBundles(): DuckDBBundles { - const resolvePath = require.resolve('@malloydata/duckdb-wasm'); + const resolvePath = require.resolve('@duckdb/duckdb-wasm'); if 
(!resolvePath) { - throw new Error('Unable to resolve @malloydata/duckdb-wasm path'); + throw new Error('Unable to resolve @duckdb/duckdb-wasm path'); } const distMatch = resolvePath.match(/^.*\/dist\//); if (!distMatch) { - throw new Error('Unable to resolve @malloydata/duckdb-wasm dist path'); + throw new Error('Unable to resolve @duckdb/duckdb-wasm dist path'); } const dist = distMatch[0]; diff --git a/test/src/databases/all/nomodel.spec.ts b/test/src/databases/all/nomodel.spec.ts index df19cbd99..0503eaa81 100644 --- a/test/src/databases/all/nomodel.spec.ts +++ b/test/src/databases/all/nomodel.spec.ts @@ -46,6 +46,8 @@ function getSplitFunction(db: string) { `string_to_array(${column}, '${splitChar}')`, 'duckdb_wasm': (column: string, splitChar: string) => `string_to_array(${column}, '${splitChar}')`, + 'motherduck': (column: string, splitChar: string) => + `string_to_array(${column}, '${splitChar}')`, 'snowflake': (column: string, splitChar: string) => `split(${column}, '${splitChar}')`, }[db]; diff --git a/test/src/runtimes.ts b/test/src/runtimes.ts index 9f5fc0836..caa33af09 100644 --- a/test/src/runtimes.ts +++ b/test/src/runtimes.ts @@ -89,10 +89,6 @@ export class PostgresTestConnection extends PooledPostgresConnection { export class DuckDBTestConnection extends DuckDBConnection { // we probably need a better way to do this. - constructor(name: string) { - super(name, 'test/data/duckdb/duckdb_test.db'); - } - public async runSQL( sqlCommand: string, options?: RunSQLOptions @@ -110,10 +106,6 @@ export class DuckDBTestConnection extends DuckDBConnection { export class DuckDBWASMTestConnection extends DuckDBWASMConnection { // we probably need a better way to do this. 
- constructor(name: string) { - super(name, 'test/data/duckdb/duckdb_test.db'); - } - public async runSQL( sqlCommand: string, options?: RunSQLOptions @@ -135,7 +127,7 @@ export function rows(qr: Result): QueryDataRow[] { } export function runtimeFor(dbName: string): SingleConnectionRuntime { - let connection; + let connection: Connection; try { switch (dbName) { case 'bigquery': @@ -149,10 +141,19 @@ export function runtimeFor(dbName: string): SingleConnectionRuntime { connection = new PostgresTestConnection(dbName); break; case 'duckdb': - connection = new DuckDBTestConnection(dbName); + connection = new DuckDBTestConnection( + dbName, + 'test/data/duckdb/duckdb_test.db' + ); break; case 'duckdb_wasm': - connection = new DuckDBWASMTestConnection(dbName); + connection = new DuckDBWASMTestConnection( + dbName, + 'test/data/duckdb/duckdb_test.db' + ); + break; + case 'motherduck': + connection = new DuckDBTestConnection(dbName, 'md:my_db'); break; case 'snowflake': {