Skip to content

Commit

Permalink
feat: Use toArray in toArrow when feasible.
Browse files Browse the repository at this point in the history
  • Loading branch information
jheer committed Sep 16, 2024
1 parent c743889 commit d7aa8fd
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 7 deletions.
2 changes: 1 addition & 1 deletion src/format/from-arrow.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ export default function(input, options) {
// build Arquero columns for backing Arrow columns
const cols = columnSet();
sel.forEach((name, key) => {
const col = arrow.getChild(key);
const col = /** @type {import('./types.js').ArrowColumn} */ (arrow.getChild(key));
cols.add(name, col.type.typeId === -1 ? dictionary(col) : col);
});

Expand Down
9 changes: 5 additions & 4 deletions src/format/to-arrow.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { columnFromArray, columnFromValues, tableFromColumns } from '@uwdata/flechette';
import { columns as select } from './util.js';
import isArrayType from '../util/is-array-type.js';
import isFunction from '../util/is-function.js';

/**
* Create an Apache Arrow table for an input table.
Expand All @@ -11,7 +12,7 @@ import isArrayType from '../util/is-array-type.js';
* @return {import('@uwdata/flechette').Table} An Arrow Table instance.
*/
export default function(table, options = {}) {
const { columns, limit, offset, types = {}, ...opt } = options;
const { columns, limit = Infinity, offset = 0, types = {}, ...opt } = options;
const names = select(table, columns);
const length = table.size;
const data = table.data();
Expand All @@ -27,9 +28,9 @@ export default function(table, options = {}) {
const type = types[name];
const isArray = isArrayType(values);
let col;
if (fullScan && isArray) {
// use faster path, takes advantage of any typed arrays
col = columnFromArray(values, type, opt);
if (fullScan && (isArray || isFunction(values.toArray))) {
// @ts-ignore - use faster path, takes advantage of typed arrays
col = columnFromArray(isArray ? values : values.toArray(), type, opt);
} else {
// use table scan method to visit column values
const get = isArray
Expand Down
2 changes: 1 addition & 1 deletion src/format/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ export type ArrowInput =
export interface ArrowColumn<T> extends ColumnType<T> {
type: ArrowDataType;
nullCount: number;
toArray(): ColumnType<T>
toArray(): ColumnType<T>;
}

/** Minimal interface for an Arrow data type. */
Expand Down
2 changes: 2 additions & 0 deletions src/table/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ export interface ColumnType<T> {
at(row: number): T;
/** Return a column value iterator. */
[Symbol.iterator]() : Iterator<T>;
/** Optional toArray method. */
toArray?() : ColumnType<T>;
}

/** A named collection of columns. */
Expand Down
66 changes: 65 additions & 1 deletion test/format/to-arrow-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ describe('toArrow', () => {
it('produces Arrow data for an input table', () => {
const dt = table({
i: [1, 2, 3, undefined, 4, 5],
f: Float32Array.from([1.2, 2.3, 3.0, 3.4, null, 4.5]),
f: Float32Array.from([1.2, 2.3, 3.0, 3.4, -1.3, 4.5]),
n: [4.5, 4.4, 3.4, 3.0, 2.3, 1.2],
b: [true, true, false, true, null, false],
s: ['foo', null, 'bar', 'baz', 'baz', 'bar'],
Expand Down Expand Up @@ -173,6 +173,70 @@ describe('toArrow', () => {
);
});

// Test: converts a table built by fromArrow(...) plus a derived 'sum' column
// back to Arrow with toArrow, then checks the result against the source table
// both directly and after a tableToIPC / tableFromIPC round trip.
// NOTE(review): "mixed" presumably refers to Arrow-backed columns combined
// with the derived column — confirm against the toArrow implementation.
it('produces Arrow data from mixed inputs', () => {
// source table: one plain-array column and one Float32Array column, 10 rows
const dt0 = table({
i: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
f: Float32Array.from([1.2, 2.3, 3.0, 3.4, 4.5, 5.4, 6.5, 7.6, 8.7, 9.2])
});

// create an arrow table with multiple record batches
// then derive a new table
const at0 = toArrow(dt0, { maxBatchRows: 4 });
const dt = fromArrow(at0).derive({ sum: d => d.i + d.f });
// no options passed here, so toArrow runs with its defaults
const at = toArrow(dt);

// a compareTables result of 0 is treated as "tables match"
assert.equal(
compareTables(dt, at), 0,
'arquero and arrow tables match'
);

// pass the Arrow table through tableToIPC and read it back with tableFromIPC
const buffer = tableToIPC(at);
const bt = tableFromIPC(buffer);

// the table read back from the buffer must still match the source table
assert.equal(
compareTables(dt, bt), 0,
'arquero and serialized arrow tables match'
);

// re-importing the read-back table via fromArrow must match the Arrow table
assert.equal(
compareTables(fromArrow(bt), at), 0,
'serialized arquero and arrow tables match'
);
});

// Test: same flow as an unfiltered conversion, but the table passed to toArrow
// has a filter applied (rows where i is even), then the result is checked
// directly and after a tableToIPC / tableFromIPC round trip.
// NOTE(review): presumably this exercises the non-full-scan path of toArrow,
// since a filter is present — confirm against the toArrow implementation.
it('produces Arrow data from filtered mixed inputs', () => {
// source table: one plain-array column and one Float32Array column, 10 rows
const dt0 = table({
i: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
f: Float32Array.from([1.2, 2.3, 3.0, 3.4, 4.5, 5.4, 6.5, 7.6, 8.7, 9.2])
});

// create an arrow table with multiple record batches
// then derive a new table
const at0 = toArrow(dt0, { maxBatchRows: 4 });
// add a derived 'sum' column, then keep only rows whose i value is even
const dt = fromArrow(at0)
.derive({ sum: d => d.i + d.f })
.filter(d => d.i % 2 === 0);
// no options passed here, so toArrow runs with its defaults
const at = toArrow(dt);

// a compareTables result of 0 is treated as "tables match"
assert.equal(
compareTables(dt, at), 0,
'arquero and arrow tables match'
);

// pass the Arrow table through tableToIPC and read it back with tableFromIPC
const buffer = tableToIPC(at);
const bt = tableFromIPC(buffer);

// the table read back from the buffer must still match the filtered table
assert.equal(
compareTables(dt, bt), 0,
'arquero and serialized arrow tables match'
);

// re-importing the read-back table via fromArrow must match the Arrow table
assert.equal(
compareTables(fromArrow(bt), at), 0,
'serialized arquero and arrow tables match'
);
});

it('throws on ambiguously typed data', async () => {
assert.throws(
() => toArrow(table({ x: [1, 2, 3, 'foo'] })),
Expand Down

0 comments on commit d7aa8fd

Please sign in to comment.