From 54e6a3a51c88e37120bc16d0069109a550cd8b33 Mon Sep 17 00:00:00 2001 From: Michael Toy <66150587+mtoy-googly-moogly@users.noreply.github.com> Date: Thu, 16 Jan 2025 12:27:35 -0800 Subject: [PATCH] Stop using regex based schema parser for table schema on tresto (#2098) * move to non-regex schema parser * fix trino version of schema parser * fix parser for spaces in names * parse trailing (num) on timezones * correctly parse decimal types * fix parsing for timestamp() again with test this time * add paranoia about case --- package.json | 2 + packages/malloy-db-trino/src/index.ts | 1 - .../src/trino_connection.spec.ts | 64 ++++---- .../malloy-db-trino/src/trino_connection.ts | 146 +++++------------- 4 files changed, 78 insertions(+), 135 deletions(-) diff --git a/package.json b/package.json index cc6a150bb..78591789b 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,8 @@ "test-postgres": "MALLOY_DATABASE=postgres jest --runInBand", "test-duckdb": "JEST_SILENT_REPORTER_SHOW_PATHS=true MALLOY_DATABASE=duckdb jest --runInBand --reporters jest-silent-reporter", "ci-snowflake": "JEST_SILENT_REPORTER_SHOW_PATHS=true jest --config jest.snowflake.config.ts --reporters jest-silent-reporter --reporters summary", + "ci-trino": "MALLOY_DATABASE=trino JEST_SILENT_REPORTER_SHOW_PATHS=true jest --config jest.trino.config.ts --reporters jest-silent-reporter --reporters summary", + "ci-presto": "MALLOY_DATABASE=presto JEST_SILENT_REPORTER_SHOW_PATHS=true jest --config jest.presto.config.ts --reporters jest-silent-reporter --reporters summary", "test-silent": "JEST_SILENT_REPORTER_SHOW_PATHS=true jest --runInBand --reporters jest-silent-reporter --no-color", "test-deps": "npm run build && npx jest -t dependencies", "third-party-licenses": "ts-node scripts/third_party_licenses", diff --git a/packages/malloy-db-trino/src/index.ts b/packages/malloy-db-trino/src/index.ts index f8e6cf23d..bfa4d9240 100644 --- a/packages/malloy-db-trino/src/index.ts +++ b/packages/malloy-db-trino/src/index.ts @@ -24,7 +24,6 @@ export type {BaseRunner} from './trino_connection'; export { PrestoConnection, - PrestoExplainParser, TrinoConnection, TrinoPrestoConnection, } from './trino_connection'; diff --git a/packages/malloy-db-trino/src/trino_connection.spec.ts b/packages/malloy-db-trino/src/trino_connection.spec.ts index 1cfac227a..ac45bca9f 100644 --- a/packages/malloy-db-trino/src/trino_connection.spec.ts +++ b/packages/malloy-db-trino/src/trino_connection.spec.ts @@ -63,39 +63,58 @@ describe('Trino connection', () => { describe('schema parser', () => { it('parses arrays', () => { - expect(connection.malloyTypeFromTrinoType('test', ARRAY_SCHEMA)).toEqual({ + expect(connection.malloyTypeFromTrinoType(ARRAY_SCHEMA)).toEqual({ type: 'array', elementTypeDef: intType, }); }); it('parses inline', () => { - expect(connection.malloyTypeFromTrinoType('test', INLINE_SCHEMA)).toEqual( - { - 'type': 'record', - 'fields': recordSchema, - } - ); + expect(connection.malloyTypeFromTrinoType(INLINE_SCHEMA)).toEqual({ + 'type': 'record', + 'fields': recordSchema, + }); }); it('parses nested', () => { - expect(connection.malloyTypeFromTrinoType('test', NESTED_SCHEMA)).toEqual( - { - 'type': 'array', - 'elementTypeDef': {type: 'record_element'}, - 'fields': recordSchema, - } - ); + expect(connection.malloyTypeFromTrinoType(NESTED_SCHEMA)).toEqual({ + 'type': 'array', + 'elementTypeDef': {type: 'record_element'}, + 'fields': recordSchema, + }); }); it('parses a simple type', () => { - expect(connection.malloyTypeFromTrinoType('test', 'varchar(60)')).toEqual( + expect(connection.malloyTypeFromTrinoType('varchar(60)')).toEqual( stringType ); }); + it('parses a decimal integer type', () => { + expect(connection.malloyTypeFromTrinoType('decimal(10)')).toEqual({ + type: 'number', + numberType: 'integer', + }); + }); + + it('parses a decimal float type', () => { + expect(connection.malloyTypeFromTrinoType('decimal(10,10)')).toEqual({ + type: 'number', + numberType: 'float', + }); + }); + + it('parses row with timestamp(3)', () => { + expect( + connection.malloyTypeFromTrinoType('row(la_time timestamp(3))') + ).toEqual({ + type: 'record', + fields: [{name: 'la_time', type: 'timestamp'}], + }); + }); + it('parses deep nesting', () => { - expect(connection.malloyTypeFromTrinoType('test', DEEP_SCHEMA)).toEqual({ + expect(connection.malloyTypeFromTrinoType(DEEP_SCHEMA)).toEqual({ 'type': 'array', 'elementTypeDef': {type: 'record_element'}, 'fields': [ @@ -114,17 +133,4 @@ describe('Trino connection', () => { }); }); }); - - describe('splitColumns', () => { - it('handles internal rows', () => { - const nested = - 'popular_name varchar, airport_count double, by_state array(row(state varchar, airport_count double))'; - - expect(connection.splitColumns(nested)).toEqual([ - 'popular_name varchar', - 'airport_count double', - 'by_state array(row(state varchar, airport_count double))', - ]); - }); - }); }); diff --git a/packages/malloy-db-trino/src/trino_connection.ts b/packages/malloy-db-trino/src/trino_connection.ts index 3087e60cd..64e535049 100644 --- a/packages/malloy-db-trino/src/trino_connection.ts +++ b/packages/malloy-db-trino/src/trino_connection.ts @@ -39,10 +39,8 @@ import { AtomicTypeDef, mkFieldDef, isScalarArray, - RepeatedRecordTypeDef, RecordTypeDef, Dialect, - ArrayTypeDef, FieldDef, TinyParser, isRepeatedRecord, @@ -261,7 +259,7 @@ export abstract class TrinoPrestoConnection const columns = r.columns; const malloyColumns = columns.map(c => - this.malloyTypeFromTrinoType(c.name, c.type) + mkFieldDef(this.malloyTypeFromTrinoType(c.type), c.name) ); const malloyRows: QueryDataRow[] = []; @@ -349,101 +347,16 @@ export abstract class TrinoPrestoConnection //while (!(await result.next()).done); } - splitColumns(s: string) { - const columns: string[] = []; - let parens = 0; - let column = ''; - let eatSpaces = true; - for (let idx = 0; idx < s.length; idx++) { - const c = s.charAt(idx); - if (eatSpaces && c === ' ') { - // Eat space - } else { - eatSpaces = false; - if (!parens && c === ',') { - columns.push(column); - column = ''; - eatSpaces = true; - } else { - column += c; - } - if (c === '(') { - parens += 1; - } else if (c === ')') { - parens -= 1; - } - } - } - columns.push(column); - return columns; - } - - public malloyTypeFromTrinoType( - name: string, - trinoType: string - ): AtomicTypeDef { - // Arrays look like `array(type)` - const arrayMatch = trinoType.match(/^(([^,])+\s)?array\((.*)\)$/); - - // Structs look like `row(name type, name type)` - const structMatch = trinoType.match(/^(([^,])+\s)?row\((.*)\)$/); - - if (arrayMatch) { - const arrayType = arrayMatch[3]; - const innerType = this.malloyTypeFromTrinoType(name, arrayType); - if (innerType.type === 'record') { - const complexStruct: RepeatedRecordTypeDef = { - type: 'array', - elementTypeDef: {type: 'record_element'}, - fields: innerType.fields, - }; - return complexStruct; - } else { - const arrayStruct: ArrayTypeDef = { - type: 'array', - elementTypeDef: innerType, - }; - return arrayStruct; - } - } else if (structMatch) { - // TODO: Trino doesn't quote or escape commas in field names, - // so some magic is going to need to be applied before we get here - // to avoid confusion if a field name contains a comma - const innerTypes = this.splitColumns(structMatch[3]); - const recordType: RecordTypeDef = { - type: 'record', - fields: [], - }; - for (let innerType of innerTypes) { - // TODO: Handle time zone type annotation, which is an - // exception to the types not containing spaces assumption - innerType = innerType.replace(/ with time zone$/, ''); - let parts = innerType.match(/^(.+?)\s((array\(|row\().*)$/); - if (parts === null) { - parts = innerType.match(/^(.+)\s(\S+)$/); - } - if (parts) { - // remove quotes from the name - const innerName = parts[1].replace(/^"(.+(?="$))"$/, '$1'); - const innerTrinoType = parts[2]; - const innerMalloyType = this.malloyTypeFromTrinoType( - innerName, - innerTrinoType - ); - recordType.fields.push(mkFieldDef(innerMalloyType, innerName)); - } - } - return recordType; - } - return this.dialect.sqlTypeToMalloyType(trinoType); + public malloyTypeFromTrinoType(trinoType: string): AtomicTypeDef { + const typeParse = new TrinoPrestoSchemaParser(trinoType, this.dialect); + return typeParse.typeDef(); } structDefFromSchema(rows: string[][], structDef: StructDef): void { for (const row of rows) { const name = row[0]; const type = row[4] || row[1]; - const malloyType = this.malloyTypeFromTrinoType(name, type); - // console.log('>', row, '\n<', malloyType); + const malloyType = mkFieldDef(this.malloyTypeFromTrinoType(type), name); structDef.fields.push(mkFieldDef(malloyType, name)); } } @@ -561,8 +474,8 @@ export class PrestoConnection extends TrinoPrestoConnection { ); } - const schemaDesc = new PrestoExplainParser(lines[0], this.dialect); - structDef.fields = schemaDesc.parseExplain(); + const schemaDesc = new TrinoPrestoSchemaParser(lines[0], this.dialect); + structDef.fields = schemaDesc.parseQueryPlan(); } unpackArray(data: unknown): unknown[] { @@ -607,16 +520,19 @@ export class TrinoConnection extends TrinoPrestoConnection { } /** - * A hand built parser for schema lines, roughly this grammar + * A hand built parser for schema lines, it parses two things ... + * A presto query plan * SCHEMA_LINE: - Output [PlanName N] [NAME_LIST] => [TYPE_LIST] * NAME_LIST: NAME (, NAME)* * TYPE_LIST: TYPE_SPEC (, TYPE_SPEC)* * TYPE_SPEC: exprN ':' TYPE + * + * And a presto/trino type * TYPE: REC_TYPE | ARRAY_TYPE | SQL_TYPE * ARRAY_TYPE: ARRAY '(' TYPE ')' * REC_TYPE: REC '(' "name" TYPE (, "name" TYPE)* ')' */ -export class PrestoExplainParser extends TinyParser { +class TrinoPrestoSchemaParser extends TinyParser { constructor( readonly input: string, readonly dialect: Dialect @@ -626,7 +542,7 @@ export class PrestoExplainParser extends TinyParser { arrow: /^=>/, char: /^[,:[\]()-]/, id: /^\w+/, - quoted_name: /^"\w+"/, + quoted_name: /^"(\\"|[^"])*"/, }); } @@ -651,7 +567,7 @@ export class PrestoExplainParser extends TinyParser { return fieldNames; } - parseExplain(): FieldDef[] { + parseQueryPlan(): FieldDef[] { const fieldNames = this.fieldNameList(); const fields: FieldDef[] = []; this.next('arrow', '['); @@ -686,7 +602,10 @@ export class PrestoExplainParser extends TinyParser { } else if (typToken.text === 'row' && this.next('(')) { const fields: FieldDef[] = []; for (;;) { - const name = this.next('quoted_name'); + const name = this.next(); + if (name.type !== 'id' && name.type !== 'quoted_name') { + throw this.parseError(`Expected property name, got '${name.type}'`); + } const getDef = this.typeDef(); fields.push(mkFieldDef(getDef, name.text)); const sep = this.next(); @@ -713,17 +632,34 @@ export class PrestoExplainParser extends TinyParser { } : {type: 'array', elementTypeDef: elType}; } else if (typToken.type === 'id') { - const sqlType = typToken.text; - const def = this.dialect.sqlTypeToMalloyType(sqlType); - if (def === undefined) { - throw this.parseError(`Can't parse presto type ${sqlType}`); - } + const sqlType = typToken.text.toLowerCase(); if (sqlType === 'varchar') { if (this.peek().type === '(') { this.next('(', 'id', ')'); } + } else if (sqlType === 'timestamp') { + if (this.peek().text === '(') { + this.next('(', 'id', ')'); + } + if (this.peek().text === 'with') { + this.nextText('with', 'time', 'zone'); + } } - return def; + const typeDef = this.dialect.sqlTypeToMalloyType(sqlType); + if (typeDef.type === 'number' && sqlType === 'decimal') { + this.next('(', 'id'); + if (this.peek().type === ',') { + this.next(',', 'id'); + typeDef.numberType = 'float'; + } else { + typeDef.numberType = 'integer'; + } + this.next(')'); + } + if (typeDef === undefined) { + throw this.parseError(`Can't parse presto type ${sqlType}`); + } + return typeDef; } throw this.parseError( `'${typToken.text}' unexpected while looking for a type`