Skip to content

Commit

Permalink
try to properly parse nested arrays in duckdb
Browse files Browse the repository at this point in the history
  • Loading branch information
mtoy-googly-moogly committed Dec 11, 2024
1 parent 7d480f8 commit 2629f0d
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 98 deletions.
94 changes: 4 additions & 90 deletions packages/malloy-db-duckdb/src/duckdb_common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import {
DuckDBDialect,
SQLSourceDef,
TableSourceDef,
mkArrayDef,
mkFieldDef,
} from '@malloydata/malloy';
import {BaseConnection} from '@malloydata/malloy/connection';

Expand Down Expand Up @@ -149,102 +149,16 @@ export abstract class DuckDBCommon
return {};
}

/**
 * Splits a struct's column declaration list into one string per column,
 * so each can be fed back into fillStructDefFromTypeMap(). Commas that
 * appear inside parentheses (for example a nested STRUCT(...)) do not
 * split; only commas at parenthesis depth zero do.
 *
 * (https://github.com/malloydata/malloy/issues/635)
 *
 * @param s struct's column declaration
 * @return Array of column type declarations
 */
private splitColumns(s: string) {
  const pieces: string[] = [];
  let depth = 0;
  let current = '';
  // True at the start of every column: spaces are dropped until the
  // first non-space character is seen.
  let skippingSpaces = true;
  for (const ch of s) {
    if (skippingSpaces && ch === ' ') {
      continue;
    }
    skippingSpaces = false;
    switch (ch) {
      case ',':
        if (depth === 0) {
          // Top-level comma: the current column is finished.
          pieces.push(current);
          current = '';
          skippingSpaces = true;
        } else {
          current += ch;
        }
        break;
      case '(':
        current += ch;
        depth += 1;
        break;
      case ')':
        current += ch;
        depth -= 1;
        break;
      default:
        current += ch;
    }
  }
  // Whatever remains after the last comma is the final column.
  pieces.push(current);
  return pieces;
}

/**
 * Turns a struct's column declaration list into a map from column name
 * to the type text that follows it.
 *
 * Each column produced by splitColumns() must look like
 * `<name> <type>`, where the name contains no whitespace and is
 * separated from the type by a single space.
 *
 * @param s struct's column declaration
 * @return map of column name to type declaration
 * @throws Error when a column does not match the `<name> <type>` shape
 */
private stringToTypeMap(s: string): {[name: string]: string} {
  const typeByName: {[name: string]: string} = {};
  for (const declaration of this.splitColumns(s)) {
    const groups = declaration.match(/^(?<name>[^\s]+) (?<type>.*)$/)?.groups;
    if (!groups) {
      throw new Error(`Badly form Structure definition ${s}`);
    }
    typeByName[groups['name']] = groups['type'];
  }
  return typeByName;
}

/**
 * Appends one field definition to `structDef.fields` for every entry in
 * `typeMap`.
 *
 * The type text of each column is handed to the dialect's
 * parseDuckDBType(), and the resulting type definition is wrapped into a
 * field by mkFieldDef(). Earlier revisions also matched `[]` / STRUCT(...)
 * with regular expressions here and pushed a second definition for the
 * same column; that path is removed so each column yields exactly one
 * field.
 *
 * @param structDef struct whose `fields` array receives the new fields
 * @param typeMap map of (possibly quoted) column name to DuckDB type text
 */
fillStructDefFromTypeMap(
  structDef: StructDef,
  typeMap: {[name: string]: string}
) {
  for (const [fieldName, dbType] of Object.entries(typeMap)) {
    // Remove quotes from field name
    const name = unquoteName(fieldName);
    const malloyType = this.dialect.parseDuckDBType(dbType);
    structDef.fields.push(mkFieldDef(malloyType, name, 'duckdb'));
  }
}

Expand Down
126 changes: 126 additions & 0 deletions packages/malloy/src/dialect/duckdb/duckdb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import {
TD,
RecordLiteralNode,
OrderBy,
mkFieldDef,
} from '../../model/malloy_types';
import {indent} from '../../model/utils';
import {
Expand All @@ -45,6 +46,7 @@ import {DialectFieldList, FieldReferenceType, inDays} from '../dialect';
import {PostgresBase} from '../pg_impl';
import {DUCKDB_DIALECT_FUNCTIONS} from './dialect_functions';
import {DUCKDB_MALLOY_STANDARD_OVERLOADS} from './function_overrides';
import {TinyParseError, TinyParser, TinyToken} from '../tiny_parser';

// need to refactor runSQL to take a SQLBlock instead of just a sql string.
const hackSplitComment = '-- hack: split on this';
Expand Down Expand Up @@ -372,6 +374,19 @@ export class DuckDBDialect extends PostgresBase {
return malloyType.type;
}

/**
 * Parses a DuckDB type string into a Malloy type definition using
 * DuckDBTypeParser.
 *
 * @param sqlType type text to parse
 * @returns the parsed type definition; when the parser raises a
 *   TinyParseError, a `sql native` definition whose `rawType` is the
 *   unmodified input
 * @throws any error from the parser that is not a TinyParseError
 */
parseDuckDBType(sqlType: string): AtomicTypeDef {
  const typeParser = new DuckDBTypeParser(sqlType);
  try {
    return typeParser.typeDef();
  } catch (err) {
    if (!(err instanceof TinyParseError)) {
      throw err;
    }
    // The type text could not be parsed: keep it verbatim.
    return {type: 'sql native', rawType: sqlType};
  }
}

sqlTypeToMalloyType(sqlType: string): LeafAtomicTypeDef {
// Remove decimal precision
const ddbType = sqlType.replace(/^DECIMAL\(\d+,\d+\)/g, 'DECIMAL');
Expand Down Expand Up @@ -444,3 +459,114 @@ export class DuckDBDialect extends PostgresBase {
return '{' + pairs.join(',') + '}';
}
}

/**
 * Recursive-descent parser that turns a DuckDB type string into a Malloy
 * AtomicTypeDef.
 *
 * Handled forms:
 *  - DECIMAL(p,s) / NUMERIC(p,s)  -> number (float when s > 0, else integer)
 *  - TIMESTAMP                    -> timestamp
 *  - TIMESTAMP WITH TIME ZONE     -> sql native
 *  - any name in duckDBToMalloyTypes
 *  - STRUCT(name type, ...)       -> record, parsed recursively
 *  - anything else                -> sql native carrying the raw type text
 *  - any of the above followed by one or more `[]` -> (nested) array
 */
class DuckDBTypeParser extends TinyParser {
  constructor(input: string) {
    super(input, {
      space: /^\s+/,
      qsingle: /^'([^']|'')*'/,
      qdouble: /^"([^"]|"")*"/,
      char: /^[,:[\]()-]/,
      // Identifiers start with a letter or underscore. The previous class
      // `[A-Z-a-z]` contained a stray `-` and rejected a leading `_`.
      id: /^[A-Za-z_]\w*/,
      number: /^\d+/,
    });
  }

  /**
   * Returns the name carried by an identifier token, collapsing every
   * doubled quote inside a quoted name back to a single quote character.
   *
   * NOTE(review): assumes the tokenizer has already removed the enclosing
   * quotes from `qsingle` / `qdouble` tokens — confirm against TinyParser.
   *
   * @param token an `id`, `qsingle` or `qdouble` token
   * @returns the unescaped name
   */
  unquoteName(token: TinyToken): string {
    if (token.type === 'qsingle') {
      return token.text.replace(/''/g, "'");
    }
    if (token.type === 'qdouble') {
      return token.text.replace(/""/g, '"');
    }
    return token.text;
  }

  /**
   * Parses one type, including any trailing `[]` array suffixes, starting
   * at the current parse position.
   *
   * @returns the Malloy type definition for the parsed type
   * @throws whatever `next()` / `parseError()` raise when the input does
   *   not match the expected token sequence
   */
  typeDef(): AtomicTypeDef {
    const unknownStart = this.parseCursor;
    const id = this.next('id');
    // Cursor position right after the leading identifier, taken before any
    // peek() so that a pending lookahead token cannot be counted as part of
    // an unknown type's text.
    const idEnd = this.parseCursor;
    let baseType: AtomicTypeDef;
    if (
      (id.text === 'DECIMAL' || id.text === 'NUMERIC') &&
      this.peek().text === '('
    ) {
      this.next('(');
      this.next('number'); // precision, not needed for the Malloy type
      this.next(',');
      const scale = this.next('number');
      this.next(')');
      baseType = {
        type: 'number',
        numberType: Number.parseInt(scale.text, 10) > 0 ? 'float' : 'integer',
      };
    } else if (id.text === 'TIMESTAMP') {
      if (this.peek().text === 'WITH') {
        // NOTE(review): relies on TinyParser.next() accepting these three
        // keywords as arguments — confirm it matches on token text.
        this.next('WITH', 'TIME', 'ZONE');
        baseType = {type: 'sql native', rawType: 'TIMESTAMP WITH TIME ZONE'};
      } else {
        baseType = {type: 'timestamp'};
      }
    } else if (duckDBToMalloyTypes[id.text]) {
      baseType = duckDBToMalloyTypes[id.text];
    } else if (id.text === 'STRUCT') {
      this.next('(');
      baseType = {type: 'record', fields: []};
      for (;;) {
        const fieldName = this.next();
        if (
          fieldName.type === 'qsingle' ||
          fieldName.type === 'qdouble' ||
          fieldName.type === 'id'
        ) {
          const fieldType = this.typeDef();
          baseType.fields.push(
            mkFieldDef(fieldType, this.unquoteName(fieldName), 'duckdb')
          );
        } else {
          if (fieldName.type !== ')') {
            throw this.parseError('Expected identifier or ) to end STRUCT');
          }
          break;
        }
        if (this.peek().type === ',') {
          this.next();
        }
      }
    } else {
      if (id.type === 'id') {
        // Unknown type: consume tokens until something that cannot belong to
        // it. Parenthesis depth is tracked so `WEIRDTYP(a,b)` is taken whole,
        // while a `,` or `)` at depth zero ends the type — that token
        // belongs to an enclosing STRUCT(...) field list.
        let depth = 0;
        let rawEnd = idEnd;
        for (;;) {
          const next = this.peek();
          if (next.type === 'eof') {
            break;
          }
          // Might be WEIRDTYP(a,b)[] ... stop at the [
          if (
            depth === 0 &&
            (next.type === '[' || next.type === ',' || next.type === ')')
          ) {
            break;
          }
          if (next.type === '(') {
            depth += 1;
          } else if (next.type === ')') {
            depth -= 1;
          }
          this.next();
          // Remember where the last consumed token ended; the token seen by
          // the next peek() is not part of this type.
          rawEnd = this.parseCursor;
        }
        baseType = {
          type: 'sql native',
          // slice() takes (start, end). The start may sit on whitespace
          // preceding the identifier, hence the trim().
          rawType: this.input.slice(unknownStart, rawEnd).trim(),
        };
      } else {
        throw this.parseError('Could not understand type');
      }
    }
    // Each trailing `[]` wraps the type parsed so far in one more array.
    while (this.peek().type === '[') {
      this.next('[', ']');
      if (baseType.type === 'record') {
        baseType = {
          type: 'array',
          elementTypeDef: {type: 'record_element'},
          fields: baseType.fields,
        };
      } else {
        baseType = {
          type: 'array',
          elementTypeDef: baseType,
        };
      }
    }
    return baseType;
  }
}
7 changes: 4 additions & 3 deletions packages/malloy/src/dialect/tiny_parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ export interface TinyToken {
*
* NOTE: All parse errors are exceptions.
*/
/**
 * Error type for parse failures, distinct from plain `Error` so callers
 * can tell them apart with `instanceof`. `name` is set so logs and stack
 * traces show the specific type.
 */
export class TinyParseError extends Error {
  constructor(message?: string) {
    super(message);
    this.name = 'TinyParseError';
  }
}
export class TinyParser {
private tokens: Generator<TinyToken>;
private parseCursor = 0;
protected parseCursor = 0;
private lookAhead?: TinyToken;
private tokenMap: Record<string, RegExp>;

Expand All @@ -37,21 +38,21 @@ export class TinyParser {
readonly input: string,
tokenMap?: Record<string, RegExp>
) {
this.tokens = this.tokenize(input);
this.tokenMap = tokenMap ?? {
space: /^\s+/,
char: /^[,:[\]()-]/,
id: /^\w+/,
qstr: /^"\w+"/,
};
this.tokens = this.tokenize(input);
}

/**
 * Builds (but does not throw) the error for a parse failure.
 *
 * The message contains the supplied description, the full input, and a
 * caret line indented by the current `parseCursor` so the failing position
 * is visible under the input.
 *
 * @param str description of what went wrong
 * @returns a TinyParseError, so callers can recognise parse failures with
 *   `instanceof`
 */
parseError(str: string) {
  const errText =
    `INTERNAL ERROR parsing schema: ${str}\n` +
    `${this.input}\n` +
    `${' '.repeat(this.parseCursor)}^`;
  return new TinyParseError(errText);
}

peek(): TinyToken {
Expand Down
9 changes: 4 additions & 5 deletions test/src/databases/duckdb/duckdb.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ describe.each(allDucks.runtimeList)('duckdb:%s', (dbName, runtime) => {
).malloyResultMatches(runtime, {abc: 'a', abc3: 'a3'});
});

describe('time', () => {
describe('time oddities', () => {
const zone = 'America/Mexico_City'; // -06:00 no DST
const zone_2020 = DateTime.fromObject(
{
Expand All @@ -147,13 +147,12 @@ describe.each(allDucks.runtimeList)('duckdb:%s', (dbName, runtime) => {
}
);
// A TIMESTAMPTZ value selected through raw SQL can be cast to a Malloy
// timestamp; the result is compared against the same instant built from
// `zone_2020`.
test('can cast TIMESTAMPTZ to timestamp', async () => {
  await expect(`
run: duckdb.sql("""
SELECT TIMESTAMPTZ '2020-02-20 00:00:00 ${zone}' as t_tstz
""") -> {
select: mex_220 is t_tstz::timestamp
}`).malloyResultMatches(runtime, {mex_220: zone_2020.toJSDate()});
});
});
});
Expand Down

0 comments on commit 2629f0d

Please sign in to comment.