From a19dfa1b25201a4b3acfcb7560641c84b5daa3b5 Mon Sep 17 00:00:00 2001 From: Michael Toy <66150587+mtoy-googly-moogly@users.noreply.github.com> Date: Fri, 22 Nov 2024 20:09:13 -0800 Subject: [PATCH] inch a little with snowflake --- .../src/snowflake_connection.ts | 165 ++++++---- .../malloy-db-trino/src/trino_connection.ts | 302 ++++++------------ packages/malloy/src/dialect/index.ts | 2 + .../malloy/src/dialect/snowflake/snowflake.ts | 12 +- packages/malloy/src/dialect/tiny_parser.ts | 136 ++++++++ packages/malloy/src/index.ts | 2 + 6 files changed, 345 insertions(+), 274 deletions(-) create mode 100644 packages/malloy/src/dialect/tiny_parser.ts diff --git a/packages/malloy-db-snowflake/src/snowflake_connection.ts b/packages/malloy-db-snowflake/src/snowflake_connection.ts index 5dde6b40e..f62880b9e 100644 --- a/packages/malloy-db-snowflake/src/snowflake_connection.ts +++ b/packages/malloy-db-snowflake/src/snowflake_connection.ts @@ -37,7 +37,7 @@ import { SnowflakeDialect, TestableConnection, arrayEachFields, - LeafAtomicTypeDef, + TinyParser, } from '@malloydata/malloy'; import {BaseConnection} from '@malloydata/malloy/connection'; @@ -63,19 +63,19 @@ export interface SnowflakeConnectionOptions { class StructMap { fieldMap = new Map(); - type = 'record'; - isArray = false; - constructor(type: string, isArray: boolean) { - this.type = type; - this.isArray = isArray; - } + constructor(public type: string) {} - addChild(name: string, type: string): StructMap { - const s = new StructMap(type, false); + setChild(name: string, type: string) { + // Really only need a fieldmap for array or object types, but whatever + const s = new StructMap(type); this.fieldMap.set(name, s); return s; } + + getChild(name: string) { + return this.fieldMap.get(name); + } } export class SnowflakeConnection @@ -179,21 +179,13 @@ export class SnowflakeConnection structDef: StructDef, structMap: StructMap ): void { - if (structMap.fieldMap.size === 0) return; for (const [field, value] of structMap.fieldMap) { const type = value.type; const name = field; - // check for an array - if (value.isArray && type !== 'object') { - // Apparently there can only be arrays of integers, strings, or unknowns? - // TODO is this true or is this just all that got implemented? - const malloyType: LeafAtomicTypeDef = - type === 'integer' - ? {type: 'number', numberType: 'integer'} - : type === 'varchar' - ? {type: 'string'} - : {type: 'sql native', rawType: type}; + const inArray = structMap.type === 'array'; + if (inArray && type !== 'object') { + const malloyType = this.dialect.sqlTypeToMalloyType(type); const innerStructDef: StructDef = { type: 'array', name, @@ -205,7 +197,7 @@ export class SnowflakeConnection structDef.fields.push(innerStructDef); } else if (type === 'object') { const structParts = {name, dialect: this.dialectName, fields: []}; - const innerStructDef: StructDef = value.isArray + const innerStructDef: StructDef = inArray ? 
{
              ...structParts,
              type: 'array',
@@ -218,7 +210,7 @@ export class SnowflakeConnection
               join: 'one',
             };
         this.addFieldsToStructDef(innerStructDef, value);
-        structDef.fields.push(innerStructDef);
+        structDef.fields.push({...innerStructDef, name});
       } else {
         const malloyType = this.dialect.sqlTypeToMalloyType(type);
         structDef.fields.push({...malloyType, name});
@@ -236,29 +228,20 @@ export class SnowflakeConnection
     const notVariant = new Map();
     for (const row of rows) {
       // data types look like `VARCHAR(1234)`
-      let snowflakeDataType = row['type'] as string;
-      snowflakeDataType = snowflakeDataType.toLocaleLowerCase().split('(')[0];
-      const s = structDef;
-      const malloyType = this.dialect.sqlTypeToMalloyType(snowflakeDataType);
+      const snowflakeDataType = (row['type'] as string)
+        .toLocaleLowerCase()
+        .split('(')[0];
       const name = row['name'] as string;
-      if (snowflakeDataType === 'variant' || snowflakeDataType === 'array') {
+      if (['variant', 'array', 'object'].includes(snowflakeDataType)) {
         variants.push(name);
-        continue;
-      }
-
-      notVariant.set(name, true);
-      if (malloyType) {
-        s.fields.push({...malloyType, name});
       } else {
-        s.fields.push({
-          type: 'sql native',
-          rawType: snowflakeDataType,
-          name,
-        });
+        notVariant.set(name, true);
+        const malloyType = this.dialect.sqlTypeToMalloyType(snowflakeDataType);
+        structDef.fields.push({...malloyType, name});
       }
     }
-    // if we have variants, sample the data
+    // For variant/array/object columns, we need to sample the data to know the schema
     if (variants.length > 0) {
       const sampleQuery = `
        SELECT regexp_replace(PATH, '\\\\[[0-9]*\\\\]', '') as PATH, lower(TYPEOF(value)) as type
@@ -271,32 +254,33 @@ export class SnowflakeConnection

     // take the schema in list form and convert it into a tree.
-    const structMap = new StructMap('object', true);
+    const structMap = new StructMap('object');
     for (const f of fieldPathRows) {
       const pathString = f['PATH']?.valueOf().toString();
       const fieldType = f['TYPE']?.valueOf().toString();
       if (pathString === undefined || fieldType === undefined) continue;
-      const path = pathString.split('.');
-      let parent = structMap;
-
+      const pathParser = new PathParser(pathString);
+      const zPath = pathParser.pathChain();
       // ignore the fields we've already added.
-      if (path.length === 1 && notVariant.get(pathString)) continue;
-
-      let index = 0;
-      for (const segment of path) {
-        let thisNode = parent.fieldMap.get(segment);
-        if (thisNode === undefined) {
-          thisNode = parent.addChild(segment, fieldType);
-        }
-        if (fieldType === 'array') {
-          thisNode.isArray = true;
-          // if this is the last
-        } else if (index === path.length - 1) {
-          thisNode.type = fieldType;
+      if (zPath.next === undefined && notVariant.get(zPath.name)) continue;
+
+      for (
+        let segment: PathChain | undefined = zPath, parent = structMap;
+        segment;
+        segment = segment.next
+      ) {
+        if (segment.next === undefined) {
+          // if this is the last element in the path, that is where the type goes
+          parent.setChild(segment.name, fieldType);
+        } else {
+          // otherwise walk down the tree to the node that owns this segment
+          const nxtP = parent.getChild(segment.name);
+          if (!nxtP) {
+            throw new Error(`Could not find parent node for path '${pathString}'`);
+          }
+          parent = nxtP;
+        }
-        parent = thisNode;
-        index += 1;
       }
     }
     this.addFieldsToStructDef(structDef, structMap);
@@ -338,3 +322,68 @@ export class SnowflakeConnection
     return tableName;
   }
 }
+
+/**
+ * Instead of an array of names, we return a path as a linked list.
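+ *
+ * For example (an illustrative path, not one captured from a real table), a
+ * PATH string such as FOO['odd name'].bar would be returned by
+ * PathParser.pathChain() as
+ *
+ *   {name: 'FOO', next: {name: 'odd name', next: {name: 'bar'}}}
+ *
+ * Numeric subscripts are also accepted (FOO[0] would yield
+ * {name: 'FOO', next: {name: '0'}}), though the sampling query above already
+ * strips them from PATH before parsing.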
+ */ +interface PathChain { + name: string; + next?: PathChain; +} + +export class PathParser extends TinyParser { + constructor(pathName: string) { + super(pathName, { + quoted: /^'(\\'|[^'])*'/, + char: /^[[.\]]/, + number: /^\d+/, + word: /^\w+/, + }); + } + + getName() { + const nameStart = this.next(); + if (nameStart.type === 'word') { + return nameStart.text; + } + if (nameStart.type === '[') { + const quotedName = this.next('quoted'); + this.next(']'); + return quotedName.text; + } + throw this.parseError('Expected column name'); + } + + getSubscript(node: PathChain): PathChain { + const index = this.next(); + if (index.type === 'number') { + node.next = {name: index.text}; + return node.next; + } else if (index.type === 'quoted') { + node.next = {name: index.text}; + return node.next; + } else { + throw this.parseError(`Unexpected ${index.type}`); + } + } + + pathChain(): PathChain { + const chain: PathChain = {name: this.getName()}; + let node = chain; + for (;;) { + const sep = this.next(); + if (sep.type === 'eof') { + return chain; + } + if (sep.type === '.') { + node.next = {name: this.next('word').text}; + node = node.next; + } else if (sep.type === '[') { + node = this.getSubscript(node); + this.next(']'); + } else { + throw this.parseError(`Unexpected ${sep.type}`); + } + } + } +} diff --git a/packages/malloy-db-trino/src/trino_connection.ts b/packages/malloy-db-trino/src/trino_connection.ts index 68da4e1dc..77e9c49ab 100644 --- a/packages/malloy-db-trino/src/trino_connection.ts +++ b/packages/malloy-db-trino/src/trino_connection.ts @@ -45,6 +45,7 @@ import { Dialect, ArrayTypeDef, FieldDef, + TinyParser, } from '@malloydata/malloy'; import {BaseConnection} from '@malloydata/malloy/connection'; @@ -580,68 +581,8 @@ export class PrestoConnection extends TrinoPrestoConnection { ); } - /* - * Here's a hand built parser for schema lines, roughly this grammar - * SCHEMA_LINE: PrestoExplainParser => [TYPE_LIST] - * NAME_LIST: NAME (, NAME)* - * TYPE_LIST: TYPE_SPEC (, TYPE_SPEC)* - * TYPE_SPEC: exprN ':' TYPE - * TYPE: REC_TYPE | ARRAY_TYPE | SQL_TYPE - * ARRAY_TYPE: ARRAY '(' TYPE ')' - * REC_TYPE: REC '(' "name" TYPE (, "name" TYPE)* ')' - */ const schemaDesc = new PrestoExplainParser(lines[0], this.dialect); - if (schemaDesc.containsNo(']') || schemaDesc.missingExpected('[')) { - throw schemaDesc.parseError( - "Expected something like '- Output [PlanName N] [NAME_LIST]'" - ); - } - const fieldNames: string[] = []; - for (;;) { - const nmToken = schemaDesc.next(); - if (nmToken.type !== 'id') { - throw schemaDesc.parseError('Expected name of field'); - } - fieldNames.push(nmToken.text); - const sep = schemaDesc.next(); - if (sep.type === ',') { - continue; - } - if (sep.type !== ']') { - throw schemaDesc.parseError( - `Unexpected '${sep.text}' while getting name list` - ); - } - break; - } - if (schemaDesc.missingExpected('arrow', '[')) { - throw schemaDesc.parseError("Expected '=> [' to begin type definition"); - } - for (let nameIndex = 0; ; nameIndex += 1) { - const name = fieldNames[nameIndex]; - if (schemaDesc.missingExpected('id', ':')) { - throw schemaDesc.parseError( - "Expected 'exprN:' before each type in schema" - ); - } - const nextType = schemaDesc.typeDef(); - structDef.fields.push({...nextType, name}); - const sep = schemaDesc.next(); - if (sep.text === ',') { - continue; - } - if (sep.text !== ']') { - throw schemaDesc.parseError( - `Unexpected '${sep.text}' between field types` - ); - } - break; - } - if (structDef.fields.length !== fieldNames.length) { - 
throw new Error( - `presto schema error mismatched ${structDef.fields.length} types and ${fieldNames.length} fields` - ); - } + structDef.fields = schemaDesc.parseExplain(); } unpackArray(data: unknown): unknown[] { @@ -681,77 +622,76 @@ export class TrinoConnection extends TrinoPrestoConnection { } } -interface Token { - type: string; - text: string; -} - -class PrestoExplainParser { - tokens: Generator; - parseCursor = 0; - peeked?: Token; +/** + * A hand built parser for schema lines, roughly this grammar + * SCHEMA_LINE: - Output [PlanName N] [NAME_LIST] => [TYPE_LIST] + * NAME_LIST: NAME (, NAME)* + * TYPE_LIST: TYPE_SPEC (, TYPE_SPEC)* + * TYPE_SPEC: exprN ':' TYPE + * TYPE: REC_TYPE | ARRAY_TYPE | SQL_TYPE + * ARRAY_TYPE: ARRAY '(' TYPE ')' + * REC_TYPE: REC '(' "name" TYPE (, "name" TYPE)* ')' + */ +class PrestoExplainParser extends TinyParser { constructor( readonly input: string, readonly dialect: Dialect ) { - this.tokens = this.tokenize(input); - } - - parseError(str: string) { - const errText = - `INTERAL ERROR parsing presto schema: ${str}\n` + - `${this.input}\n` + - `${' '.repeat(this.parseCursor)}^`; - return new Error(errText); - } - - peek(): Token { - if (this.peeked) { - return this.peeked; - } else { - const {value} = this.tokens.next(); - const peekVal = value ?? {type: 'eof', text: ''}; - this.peeked = peekVal; - return peekVal; - } - } - - next(): Token { - let next = this.peeked; - if (next) { - this.peeked = undefined; - return next; - } else { - next = this.peek(); - this.peeked = undefined; - return next; - } + super(input, { + space: /^\s+/, + arrow: /^=>/, + char: /^[,:[\]()-]/, + id: /^\w+/, + // mtoy todo check what happens if a name has quotes in it + quoted_name: /^"\w+"/, + }); } - hasExpected(...type: string[]) { - for (const t of type) { - const next = this.next(); - if (next.type !== t) { - return false; + fieldNameList(): string[] { + this.skipTo(']'); // Skip to end of plan + this.next('['); // Expect start of name list + const fieldNames: string[] = []; + for (;;) { + const nmToken = this.next('id'); + fieldNames.push(nmToken.text); + const sep = this.next(); + if (sep.type === ',') { + continue; + } + if (sep.type !== ']') { + throw this.parseError( + `Unexpected '${sep.text}' while getting field name list` + ); } + break; } - return true; + return fieldNames; } - containsNo(type: string) { - for (;;) { - const next = this.next(); - if (next.type === 'eof') { - return true; + parseExplain(): FieldDef[] { + const fieldNames = this.fieldNameList(); + const fields: FieldDef[] = []; + this.next('arrow', '['); + for (let nameIndex = 0; ; nameIndex += 1) { + const name = fieldNames[nameIndex]; + this.next('id', ':'); + const nextType = this.typeDef(); + fields.push({...nextType, name}); + const sep = this.next(); + if (sep.text === ',') { + continue; } - if (next.type === type) { - return false; + if (sep.text !== ']') { + throw this.parseError(`Unexpected '${sep.text}' between field types`); } + break; } - } - - missingExpected(...type: string[]) { - return !this.hasExpected(...type); + if (fields.length !== fieldNames.length) { + throw new Error( + `Presto schema error mismatched ${fields.length} types and ${fieldNames.length} fields` + ); + } + return fields; } typeDef(): AtomicTypeDef { @@ -760,58 +700,42 @@ class PrestoExplainParser { throw this.parseError( 'Unexpected EOF parsing type, expected a type name' ); - } else if (typToken.text === 'row') { - if (this.hasExpected('(')) { - const fields: FieldDef[] = []; - for (;;) { - const name = 
this.next(); - if (name.type !== 'name') { - throw this.parseError('Expected quoted "name" for record property'); - } - const getDef = this.typeDef(); - fields.push({...getDef, name: name.text}); - const sep = this.next(); - if (sep.text === ')') { - break; - } - if (sep.text === ',') { - continue; - } - throw this.parseError( - `Unexpected '${sep.text}' while parsing record type` - ); + } else if (typToken.text === 'row' && this.next('(')) { + const fields: FieldDef[] = []; + for (;;) { + const name = this.next('quoted_name'); + const getDef = this.typeDef(); + fields.push({...getDef, name: name.text}); + const sep = this.next(); + if (sep.text === ')') { + break; } - const def: RecordTypeDef = { - type: 'record', - name: '', - join: 'one', - dialect: this.dialect.name, - fields, - }; - return def; - } else { - throw new Error('Expected rec followed by('); - } - } else if (typToken.text === 'array') { - if (this.hasExpected('(')) { - const elType = this.typeDef(); - if (this.missingExpected(')')) { - throw this.parseError("Expected ')' at end of array type"); + if (sep.text === ',') { + continue; } - const def: ArrayTypeDef = { - type: 'array', - name: '', - dialect: this.dialect.name, - join: 'many', - elementTypeDef: - elType.type === 'record' ? {type: 'record_element'} : elType, - fields: - elType.type === 'record' ? elType.fields : arrayEachFields(elType), - }; - return def; - } else { - throw this.parseError('Expected array followed by ('); } + const def: RecordTypeDef = { + type: 'record', + name: '', + join: 'one', + dialect: this.dialect.name, + fields, + }; + return def; + } else if (typToken.text === 'array' && this.next('(')) { + const elType = this.typeDef(); + this.next(')'); + const def: ArrayTypeDef = { + type: 'array', + name: '', + dialect: this.dialect.name, + join: 'many', + elementTypeDef: + elType.type === 'record' ? {type: 'record_element'} : elType, + fields: + elType.type === 'record' ? elType.fields : arrayEachFields(elType), + }; + return def; } else if (typToken.type === 'id') { const sqlType = typToken.text; const def = this.dialect.sqlTypeToMalloyType(sqlType); @@ -820,9 +744,7 @@ class PrestoExplainParser { } if (sqlType === 'varchar') { if (this.peek().type === '(') { - if (this.missingExpected('(', 'id', ')')) { - throw this.parseError('Error parsing varchar()'); - } + this.next('(', 'id', ')'); } } return def; @@ -831,42 +753,4 @@ class PrestoExplainParser { `'${typToken.text}' unexpected while looking for a type` ); } - - private *tokenize(src: string): Generator { - const tokenRegex = { - space: /^\s+/, - arrow: /^=>/, - char: /^[,:[\]()-]/, - id: /^\w+/, - name: /^"\w+"/, - }; - for (;;) { - let notFound = true; - for (const tokenType in tokenRegex) { - const foundToken = src.match(tokenRegex[tokenType]); - if (foundToken) { - let tokenText = foundToken[0]; - src = src.slice(tokenText.length); - this.parseCursor = this.input.length - src.length; - if (tokenType !== 'space') { - if (tokenType === 'name') { - tokenText = tokenText.slice(1, -1); // strip quotes - } - yield { - type: tokenType === 'char' ? 
tokenText : tokenType, - text: tokenText, - }; - notFound = false; - } - } - } - if (notFound) { - yield {type: 'unexpected token', text: src}; - return; - } - if (src === '') { - return; - } - } - } } diff --git a/packages/malloy/src/dialect/index.ts b/packages/malloy/src/dialect/index.ts index 140716594..15c70a24f 100644 --- a/packages/malloy/src/dialect/index.ts +++ b/packages/malloy/src/dialect/index.ts @@ -52,3 +52,5 @@ export {MySQLDialect} from './mysql'; export {getDialect, registerDialect} from './dialect_map'; export {getMalloyStandardFunctions} from './functions'; export type {MalloyStandardFunctionImplementations} from './functions'; +export type {TinyToken} from './tiny_parser'; +export {TinyParser} from './tiny_parser'; diff --git a/packages/malloy/src/dialect/snowflake/snowflake.ts b/packages/malloy/src/dialect/snowflake/snowflake.ts index f44909f17..92ead8cfa 100644 --- a/packages/malloy/src/dialect/snowflake/snowflake.ts +++ b/packages/malloy/src/dialect/snowflake/snowflake.ts @@ -521,14 +521,12 @@ ${indent(sql)} sqlLiteralRecord(lit: RecordLiteralNode): string { const rowVals: string[] = []; for (const f of lit.typeDef.fields) { - if (isAtomic(f)) { - const name = f.as ?? f.name; - const propName = `'${name}'`; - const propVal = lit.kids[name].sql ?? 'internal-error-record-literal'; - rowVals.push(`${propName}:${propVal}`); - } + const name = f.as ?? f.name; + const propName = `'${name}'`; + const propVal = lit.kids[name].sql ?? 'internal-error-record-literal'; + rowVals.push(`${propName}:${propVal}`); } - return `{${rowVals.join(',')}}::${this.malloyTypeToSQLType(lit.typeDef)}`; + return `{${rowVals.join(',')}}`; } sqlLiteralArray(lit: ArrayLiteralNode): string { diff --git a/packages/malloy/src/dialect/tiny_parser.ts b/packages/malloy/src/dialect/tiny_parser.ts new file mode 100644 index 000000000..01765a9a7 --- /dev/null +++ b/packages/malloy/src/dialect/tiny_parser.ts @@ -0,0 +1,136 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +export interface TinyToken { + type: string; + text: string; +} + +/** + * Simple framework for writing schema parsers. The parsers using this felt + * better than the more ad-hoc code they replaced, and are smaller than + * using a parser generator. + * + * NOTE: All parse errors are exceptions. + */ +export class TinyParser { + private tokens: Generator; + private parseCursor = 0; + private lookAhead?: TinyToken; + private tokenMap: Record; + + /** + * The token map is tested in order. Return TinyToken + * is {type: tokenMapKey, text: matchingText }, except + * for the special tokenMapKeys: + * * space: skipped and never returned + * * char: matched string return in both .type and .text + * * q*: any token name starting with 'q' is assumed to be + * a quoted string and the text will have the first and + * last characters stripped + */ + constructor( + readonly input: string, + tokenMap?: Record + ) { + this.tokens = this.tokenize(input); + this.tokenMap = tokenMap ?? { + space: /^\s+/, + char: /^[,:[\]()-]/, + id: /^\w+/, + qstr: /^"\w+"/, + }; + } + + parseError(str: string) { + const errText = + `INTERNAL ERROR parsing schema: ${str}\n` + + `${this.input}\n` + + `${' '.repeat(this.parseCursor)}^`; + return new Error(errText); + } + + peek(): TinyToken { + if (this.lookAhead) { + return this.lookAhead; + } else { + const {value} = this.tokens.next(); + const peekVal = value ?? 
{type: 'eof', text: ''};
+      this.lookAhead = peekVal;
+      return peekVal;
+    }
+  }
+
+  private getNext(): TinyToken {
+    const next = this.lookAhead ?? this.peek();
+    this.lookAhead = undefined;
+    return next;
+  }
+
+  /**
+   * Returns the next token. If any token types are passed, reads and
+   * requires each of those tokens in order, then returns the last one.
+   * @param types list of required token types
+   * @returns The last token read
+   */
+  next(...types: string[]): TinyToken {
+    if (types.length === 0) return this.getNext();
+    let next: TinyToken | undefined = undefined;
+    let expected = types[0];
+    for (const typ of types) {
+      next = this.getNext();
+      expected = typ;
+      if (next.type !== typ) {
+        next = undefined;
+        break;
+      }
+    }
+    if (next) return next;
+    throw this.parseError(`Expected ${expected}`);
+  }
+
+  skipTo(type: string) {
+    for (;;) {
+      const next = this.next();
+      if (next.type === 'eof') {
+        throw this.parseError(`Expected token '${type}'`);
+      }
+      if (next.type === type) {
+        return;
+      }
+    }
+  }
+
+  private *tokenize(src: string): Generator<TinyToken> {
+    const tokenList = this.tokenMap;
+    while (this.parseCursor < src.length) {
+      let notFound = true;
+      for (const tokenType in tokenList) {
+        const srcAtCursor = src.slice(this.parseCursor);
+        const foundToken = srcAtCursor.match(tokenList[tokenType]);
+        if (foundToken) {
+          notFound = false;
+          let tokenText = foundToken[0];
+          this.parseCursor += tokenText.length;
+          if (tokenType !== 'space') {
+            if (tokenType[0] === 'q') {
+              tokenText = tokenText.slice(1, -1); // strip quotes
+            }
+            yield {
+              type: tokenType === 'char' ? tokenText : tokenType,
+              text: tokenText,
+            };
+          }
+        }
+      }
+      if (notFound) {
+        yield {type: 'unexpected token', text: src.slice(this.parseCursor)};
+        return;
+      }
+    }
+  }
+}
diff --git a/packages/malloy/src/index.ts b/packages/malloy/src/index.ts
index c123c28c6..10715428c 100644
--- a/packages/malloy/src/index.ts
+++ b/packages/malloy/src/index.ts
@@ -42,6 +42,7 @@ export {
   literal,
   spread,
   Dialect,
+  TinyParser,
 } from './dialect';
 export type {
   DialectFieldList,
@@ -51,6 +52,7 @@ export type {
   DefinitionBlueprint,
   DefinitionBlueprintMap,
   OverloadedDefinitionBlueprint,
+  TinyToken,
 } from './dialect';
 // TODO tighten up exports
 export type {