Skip to content

Commit c9727a4

Browse files
park-brian and platypii authored
Query filter (#56)
* implement ParquetQueryFilter types
* implement parquetQuery filter tests
* implement parquetQuery filter
* filter before ordering
* apply filters before sorting/slicing
* format types
* add deep equality utility
* document and format equals utility
* use deep equality checks
* update filter tests
* support more types for equality
* make $not unary
* ensure arrays are correctly compared
* support both forms of $not
* add operator tests
* Filter operator tests

---------

Co-authored-by: Brian Park <park-brian@users.noreply.github.com>
Co-authored-by: Kenny Daniel <platypii@gmail.com>
1 parent cb639a0 commit c9727a4

File tree

4 files changed

+240
-5
lines changed

4 files changed

+240
-5
lines changed

src/query.js

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,31 @@
11
import { parquetReadObjects } from './hyparquet.js'
22
import { parquetMetadataAsync } from './metadata.js'
3+
import { equals } from './utils.js'
34

45
/**
5-
* Wraps parquetRead with orderBy support.
6+
* Wraps parquetRead with filter and orderBy support.
67
* This is a parquet-aware query engine that can read a subset of rows and columns.
7-
* Accepts an optional orderBy column name to sort the results.
8+
* Accepts optional filter object to filter the results and orderBy column name to sort the results.
89
* Note that using orderBy may SIGNIFICANTLY increase the query time.
910
*
10-
* @param {ParquetReadOptions & { orderBy?: string }} options
11+
* @import {ParquetQueryFilter} from '../src/types.d.ts'
12+
* @param {ParquetReadOptions & { filter?: ParquetQueryFilter, orderBy?: string }} options
1113
* @returns {Promise<Record<string, any>[]>} resolves when all requested rows and columns are parsed
1214
*/
1315
export async function parquetQuery(options) {
14-
const { file, rowStart, rowEnd, orderBy } = options
16+
const { file, rowStart, rowEnd, orderBy, filter } = options
1517
options.metadata ||= await parquetMetadataAsync(file)
1618

1719
// TODO: Faster path for: no orderBy, no rowStart/rowEnd, one row group
1820

19-
if (typeof orderBy === 'string') {
21+
if (filter) {
22+
// TODO: Move filter to parquetRead for performance
23+
const results = await parquetReadObjects({ ...options, rowStart: undefined, rowEnd: undefined })
24+
return results
25+
.filter(row => matchQuery(row, filter))
26+
.sort((a, b) => orderBy ? compare(a[orderBy], b[orderBy]) : 0)
27+
.slice(rowStart, rowEnd)
28+
} else if (typeof orderBy === 'string') {
2029
// Fetch orderBy column first
2130
const orderColumn = await parquetReadObjects({ ...options, rowStart: undefined, rowEnd: undefined, columns: [orderBy] })
2231

@@ -98,3 +107,57 @@ function compare(a, b) {
98107
if (a > b) return 1
99108
return 1 // TODO: how to handle nulls?
100109
}
110+
111+
/**
 * Match a record against a query filter.
 *
 * Every entry of the query must hold for the record to match:
 * - `$and`, `$or` and `$not` keys combine nested filters and are evaluated
 *   together with any sibling field conditions,
 * - any other key names a record field. Its condition is either a plain
 *   value (primitive, array or null) compared by deep equality, or an object
 *   of operators: `$gt`, `$gte`, `$lt`, `$lte`, `$ne`, `$in`, `$nin`, `$not`.
 * Unknown operators are ignored (treated as matching).
 *
 * @param {any} record
 * @param {ParquetQueryFilter} query
 * @returns {boolean}
 * @example matchQuery({ id: 1 }, { id: {$gte: 1} }) // true
 * @example matchQuery({ id: 1, tag: null }, { tag: null }) // true
 */
export function matchQuery(record, query = {}) {
  return Object.entries(query).every(([field, condition]) => {
    // Logical operators are combined with sibling conditions instead of
    // short-circuiting the whole query, so no condition is silently dropped.
    if (field === '$and' && Array.isArray(condition)) {
      return condition.every(subQuery => matchQuery(record, subQuery))
    }
    if (field === '$or' && Array.isArray(condition)) {
      return condition.some(subQuery => matchQuery(record, subQuery))
    }
    if (field === '$not' && condition && typeof condition === 'object' && !Array.isArray(condition)) {
      return !matchQuery(record, condition)
    }

    const value = record[field]

    // Plain values, including null, are matched by deep equality.
    // Only non-null, non-array objects are treated as operator sets.
    if (condition === null || typeof condition !== 'object' || Array.isArray(condition)) {
      return equals(value, condition)
    }

    // Membership test: includes() keeps SameValueZero semantics (NaN matches NaN),
    // equals() additionally matches arrays / objects / byte arrays by content.
    const isMember = (/** @type {any[]} */ list) =>
      list.includes(value) || list.some(item => equals(value, item))

    return Object.entries(condition).every(([operator, target]) => {
      switch (operator) {
        case '$gt':
          return value > target
        case '$gte':
          return value >= target
        case '$lt':
          return value < target
        case '$lte':
          return value <= target
        case '$ne':
          return !equals(value, target)
        case '$in':
          return Array.isArray(target) && isMember(target)
        case '$nin':
          return Array.isArray(target) && !isMember(target)
        case '$not':
          // Field-level negation: target may be a value or a nested operator object
          return !matchQuery({ [field]: value }, { [field]: target })
        default:
          return true
      }
    })
  })
}

src/types.d.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,3 +350,22 @@ export interface ParquetReadOptions {
350350
compressors?: Compressors // custom decompressors
351351
utf8?: boolean // decode byte arrays as utf8 strings (default true)
352352
}
353+
354+
// A value that a record field can be compared against in a query filter
export type ParquetQueryValue = string | number | boolean | object | null | undefined
355+
356+
/**
 * Operators that can be applied to a single field, e.g. `{ b: { $gte: 3 } }`.
 * When several operators are given, all of them must hold.
 */
export type ParquetQueryOperator = {
  $gt?: ParquetQueryValue // field > value
  $gte?: ParquetQueryValue // field >= value
  $lt?: ParquetQueryValue // field < value
  $lte?: ParquetQueryValue // field <= value
  $ne?: ParquetQueryValue // field is not deeply equal to value
  $in?: ParquetQueryValue[] // field is one of the listed values
  $nin?: ParquetQueryValue[] // field is none of the listed values
  $not?: ParquetQueryValue | ParquetQueryOperator // negates a value match or a nested operator object
}
365+
366+
/**
 * Filter object accepted by parquetQuery.
 * Field names map to a value or an operator object;
 * $and / $or / $not combine nested filters.
 */
export interface ParquetQueryFilter {
  $and?: ParquetQueryFilter[]
  $or?: ParquetQueryFilter[]
  $not?: ParquetQueryFilter
  [key: string]: ParquetQueryValue | ParquetQueryOperator | ParquetQueryFilter[] | undefined
}

src/utils.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,22 @@ export function concat(aaa, bbb) {
3737
}
3838
}
3939

40+
/**
 * Deep equality comparison.
 *
 * Primitives are compared with strict equality, so NaN never equals NaN.
 * Dates are compared by timestamp, Uint8Arrays and arrays element by element,
 * and any other objects by their own enumerable keys, recursively.
 *
 * @param {any} a First object to compare
 * @param {any} b Second object to compare
 * @returns {boolean} true if objects are equal
 */
export function equals(a, b) {
  if (a === b) return true
  // Falsy values (null, undefined, 0, '', NaN, false) can only match via ===
  if (!a || !b || typeof a !== typeof b) return false
  // Distinct primitives and functions are never equal
  if (typeof a !== 'object') return false

  // Dates expose no enumerable keys, so they must be compared by timestamp;
  // otherwise any two dates (or a date and {}) would look equal.
  if (a instanceof Date || b instanceof Date) {
    return a instanceof Date && b instanceof Date && a.getTime() === b.getTime()
  }

  if (a instanceof Uint8Array && b instanceof Uint8Array) {
    return a.length === b.length && a.every((v, i) => v === b[i])
  }

  if (Array.isArray(a) && Array.isArray(b)) {
    return a.length === b.length && a.every((v, i) => equals(v, b[i]))
  }

  // Require each key to actually exist on b, so that { x: undefined }
  // is not considered equal to { y: undefined }.
  const keys = Object.keys(a)
  return keys.length === Object.keys(b).length &&
    keys.every(k => Object.prototype.hasOwnProperty.call(b, k) && equals(a[k], b[k]))
}
55+
4056
/**
4157
* Get the byte length of a URL using a HEAD request.
4258
* If requestInit is provided, it will be passed to fetch.

test/query.test.js

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,141 @@ describe('parquetQuery', () => {
5858
const futureRows = parquetQuery({ file, orderBy: 'nonexistent' })
5959
await expect(futureRows).rejects.toThrow('parquet columns not found: nonexistent')
6060
})
61+
62+
it('reads data with filter', async () => {
63+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
64+
const rows = await parquetQuery({ file, filter: { c: 2 } })
65+
expect(toJson(rows)).toEqual([
66+
{ a: 'abc', b: 1, c: 2, d: true, e: [ 1, 2, 3 ] },
67+
{ a: 'abc', b: 5, c: 2, d: true, e: [ 1, 2 ] },
68+
])
69+
})
70+
71+
it('reads data with filter and rowStart/rowEnd', async () => {
72+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
73+
const rows = await parquetQuery({ file, filter: { c: 2 }, rowStart: 1, rowEnd: 5 })
74+
expect(toJson(rows)).toEqual([ { a: 'abc', b: 5, c: 2, d: true, e: [ 1, 2 ] } ])
75+
})
76+
77+
it('reads data with filter and orderBy', async () => {
78+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
79+
const rows = await parquetQuery({ file, filter: { c: 2 }, orderBy: 'b' })
80+
expect(toJson(rows)).toEqual([
81+
{ a: 'abc', b: 1, c: 2, d: true, e: [ 1, 2, 3 ] },
82+
{ a: 'abc', b: 5, c: 2, d: true, e: [ 1, 2 ] },
83+
])
84+
})
85+
86+
it('reads data with filter, orderBy, and rowStart/rowEnd', async () => {
87+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
88+
const rows = await parquetQuery({ file, filter: { c: 2 }, orderBy: 'b', rowStart: 1, rowEnd: 2 })
89+
expect(toJson(rows)).toEqual([ { a: 'abc', b: 5, c: 2, d: true, e: [ 1, 2 ] } ])
90+
})
91+
92+
it('reads data with $and filter', async () => {
93+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
94+
const rows = await parquetQuery({ file, filter: { $and: [{ c: 2 }, { e: [1, 2, 3] }] } })
95+
expect(toJson(rows)).toEqual([
96+
{ a: 'abc', b: 1, c: 2, d: true, e: [1, 2, 3] },
97+
])
98+
})
99+
100+
it('reads data with $or filter', async () => {
101+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
102+
const rows = await parquetQuery({ file, filter: { $or: [{ c: 2 }, { d: false }] } })
103+
expect(toJson(rows)).toEqual([
104+
{ a: 'abc', b: 1, c: 2, d: true, e: [1, 2, 3] },
105+
{ a: null, b: 4, c: 5, d: false, e: [1, 2, 3] },
106+
{ a: 'abc', b: 5, c: 2, d: true, e: [1, 2] },
107+
])
108+
})
109+
110+
it('reads data with $not filter', async () => {
111+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
112+
const rows = await parquetQuery({ file, filter: { $not: { c: 2 } } })
113+
expect(toJson(rows)).toEqual([
114+
{ a: 'abc', b: 2, c: 3, d: true },
115+
{ a: 'abc', b: 3, c: 4, d: true },
116+
{ a: null, b: 4, c: 5, d: false, e: [1, 2, 3] },
117+
])
118+
})
119+
120+
it('reads data with $not value filter', async () => {
121+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
122+
const rows = await parquetQuery({ file, filter: { c: { $not: 2 } } })
123+
expect(toJson(rows)).toEqual([
124+
{ a: 'abc', b: 2, c: 3, d: true },
125+
{ a: 'abc', b: 3, c: 4, d: true },
126+
{ a: null, b: 4, c: 5, d: false, e: [1, 2, 3] },
127+
])
128+
})
129+
130+
it('reads data with $gt filter', async () => {
131+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
132+
const rows = await parquetQuery({ file, filter: { b: { $gt: 3 } } })
133+
expect(toJson(rows)).toEqual([
134+
{ a: null, b: 4, c: 5, d: false, e: [1, 2, 3] },
135+
{ a: 'abc', b: 5, c: 2, d: true, e: [1, 2] },
136+
])
137+
})
138+
139+
140+
it('reads data with $gte filter', async () => {
141+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
142+
const rows = await parquetQuery({ file, filter: { b: { $gte: 3 } } })
143+
expect(toJson(rows)).toEqual([
144+
{ a: 'abc', b: 3, c: 4, d: true },
145+
{ a: null, b: 4, c: 5, d: false, e: [1, 2, 3] },
146+
{ a: 'abc', b: 5, c: 2, d: true, e: [1, 2] },
147+
])
148+
})
149+
150+
it('reads data with $lt filter', async () => {
151+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
152+
const rows = await parquetQuery({ file, filter: { b: { $lt: 3 } } })
153+
expect(toJson(rows)).toEqual([
154+
{ a: 'abc', b: 1, c: 2, d: true, e: [1, 2, 3] },
155+
{ a: 'abc', b: 2, c: 3, d: true },
156+
])
157+
})
158+
159+
it('reads data with $lte filter', async () => {
160+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
161+
const rows = await parquetQuery({ file, filter: { b: { $lte: 3 } } })
162+
expect(toJson(rows)).toEqual([
163+
{ a: 'abc', b: 1, c: 2, d: true, e: [1, 2, 3] },
164+
{ a: 'abc', b: 2, c: 3, d: true },
165+
{ a: 'abc', b: 3, c: 4, d: true },
166+
])
167+
})
168+
169+
it('reads data with $ne filter', async () => {
170+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
171+
const rows = await parquetQuery({ file, filter: { b: { $ne: 3 } } })
172+
expect(toJson(rows)).toEqual([
173+
{ a: 'abc', b: 1, c: 2, d: true, e: [1, 2, 3] },
174+
{ a: 'abc', b: 2, c: 3, d: true },
175+
{ a: null, b: 4, c: 5, d: false, e: [1, 2, 3] },
176+
{ a: 'abc', b: 5, c: 2, d: true, e: [1, 2] },
177+
])
178+
})
179+
180+
it('reads data with $in filter', async () => {
181+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
182+
const rows = await parquetQuery({ file, filter: { b: { $in: [2, 4] } } })
183+
expect(toJson(rows)).toEqual([
184+
{ a: 'abc', b: 2, c: 3, d: true },
185+
{ a: null, b: 4, c: 5, d: false, e: [1, 2, 3] },
186+
])
187+
})
188+
189+
it('reads data with $nin filter', async () => {
190+
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
191+
const rows = await parquetQuery({ file, filter: { b: { $nin: [2, 4] } } })
192+
expect(toJson(rows)).toEqual([
193+
{ a: 'abc', b: 1, c: 2, d: true, e: [1, 2, 3] },
194+
{ a: 'abc', b: 3, c: 4, d: true },
195+
{ a: 'abc', b: 5, c: 2, d: true, e: [1, 2] },
196+
])
197+
})
61198
})

0 commit comments

Comments
 (0)