From d7aa8fd5da95708c90a1b5bedf5cafc87b085eb8 Mon Sep 17 00:00:00 2001 From: jheer Date: Mon, 16 Sep 2024 10:31:30 -0700 Subject: [PATCH] feat: Use toArray in toArrow when feasible. --- src/format/from-arrow.js | 2 +- src/format/to-arrow.js | 9 ++--- src/format/types.ts | 2 +- src/table/types.ts | 2 ++ test/format/to-arrow-test.js | 66 +++++++++++++++++++++++++++++++++++- 5 files changed, 74 insertions(+), 7 deletions(-) diff --git a/src/format/from-arrow.js b/src/format/from-arrow.js index 23d0921..5898100 100644 --- a/src/format/from-arrow.js +++ b/src/format/from-arrow.js @@ -30,7 +30,7 @@ export default function(input, options) { // build Arquero columns for backing Arrow columns const cols = columnSet(); sel.forEach((name, key) => { - const col = arrow.getChild(key); + const col = /** @type {import('./types.js').ArrowColumn} */ (arrow.getChild(key)); cols.add(name, col.type.typeId === -1 ? dictionary(col) : col); }); diff --git a/src/format/to-arrow.js b/src/format/to-arrow.js index cc9baff..6be25be 100644 --- a/src/format/to-arrow.js +++ b/src/format/to-arrow.js @@ -1,6 +1,7 @@ import { columnFromArray, columnFromValues, tableFromColumns } from '@uwdata/flechette'; import { columns as select } from './util.js'; import isArrayType from '../util/is-array-type.js'; +import isFunction from '../util/is-function.js'; /** * Create an Apache Arrow table for an input table. @@ -11,7 +12,7 @@ import isArrayType from '../util/is-array-type.js'; * @return {import('@uwdata/flechette').Table} An Arrow Table instance. 
*/ export default function(table, options = {}) { - const { columns, limit, offset, types = {}, ...opt } = options; + const { columns, limit = Infinity, offset = 0, types = {}, ...opt } = options; const names = select(table, columns); const length = table.size; const data = table.data(); @@ -27,9 +28,9 @@ export default function(table, options = {}) { const type = types[name]; const isArray = isArrayType(values); let col; - if (fullScan && isArray) { - // use faster path, take advantange of any typed arrays - col = columnFromArray(values, type, opt); + if (fullScan && (isArray || isFunction(values.toArray))) { + // @ts-ignore - use faster path, takes advantage of typed arrays + col = columnFromArray(isArray ? values : values.toArray(), type, opt); } else { // use table scan method to visit column values const get = isArray diff --git a/src/format/types.ts b/src/format/types.ts index 8142c2a..5b9d0c2 100644 --- a/src/format/types.ts +++ b/src/format/types.ts @@ -12,7 +12,7 @@ export type ArrowInput = export interface ArrowColumn extends ColumnType { type: ArrowDataType; nullCount: number; - toArray(): ColumnType + toArray(): ColumnType; } /** Minimal interface for an Arrow data type. */ diff --git a/src/table/types.ts b/src/table/types.ts index 8c889fe..bee007a 100644 --- a/src/table/types.ts +++ b/src/table/types.ts @@ -15,6 +15,8 @@ export interface ColumnType { at(row: number): T; /** Return a column value iterator. */ [Symbol.iterator]() : Iterator; + /** Optional toArray method. */ + toArray?() : ColumnType; } /** A named collection of columns. 
*/ diff --git a/test/format/to-arrow-test.js b/test/format/to-arrow-test.js index 13cf059..d2b7c0f 100644 --- a/test/format/to-arrow-test.js +++ b/test/format/to-arrow-test.js @@ -116,7 +116,7 @@ describe('toArrow', () => { it('produces Arrow data for an input table', () => { const dt = table({ i: [1, 2, 3, undefined, 4, 5], - f: Float32Array.from([1.2, 2.3, 3.0, 3.4, null, 4.5]), + f: Float32Array.from([1.2, 2.3, 3.0, 3.4, -1.3, 4.5]), n: [4.5, 4.4, 3.4, 3.0, 2.3, 1.2], b: [true, true, false, true, null, false], s: ['foo', null, 'bar', 'baz', 'baz', 'bar'], @@ -173,6 +173,70 @@ describe('toArrow', () => { ); }); + it('produces Arrow data from mixed inputs', () => { + const dt0 = table({ + i: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + f: Float32Array.from([1.2, 2.3, 3.0, 3.4, 4.5, 5.4, 6.5, 7.6, 8.7, 9.2]) + }); + + // create an arrow table with multiple record batches + // then derive a new table + const at0 = toArrow(dt0, { maxBatchRows: 4 }); + const dt = fromArrow(at0).derive({ sum: d => d.i + d.f }); + const at = toArrow(dt); + + assert.equal( + compareTables(dt, at), 0, + 'arquero and arrow tables match' + ); + + const buffer = tableToIPC(at); + const bt = tableFromIPC(buffer); + + assert.equal( + compareTables(dt, bt), 0, + 'arquero and serialized arrow tables match' + ); + + assert.equal( + compareTables(fromArrow(bt), at), 0, + 'serialized arquero and arrow tables match' + ); + }); + + it('produces Arrow data from filtered mixed inputs', () => { + const dt0 = table({ + i: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + f: Float32Array.from([1.2, 2.3, 3.0, 3.4, 4.5, 5.4, 6.5, 7.6, 8.7, 9.2]) + }); + + // create an arrow table with multiple record batches + // then derive a new table + const at0 = toArrow(dt0, { maxBatchRows: 4 }); + const dt = fromArrow(at0) + .derive({ sum: d => d.i + d.f }) + .filter(d => d.i % 2 === 0); + const at = toArrow(dt); + + assert.equal( + compareTables(dt, at), 0, + 'arquero and arrow tables match' + ); + + const buffer = tableToIPC(at); + const bt 
= tableFromIPC(buffer); + + assert.equal( + compareTables(dt, bt), 0, + 'arquero and serialized arrow tables match' + ); + + assert.equal( + compareTables(fromArrow(bt), at), 0, + 'serialized arquero and arrow tables match' + ); + }); + it('throws on ambiguously typed data', async () => { assert.throws( () => toArrow(table({ x: [1, 2, 3, 'foo'] })),