Skip to content

Commit

Permalink
Convert dictionary before dereferencing, and check encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
platypii committed May 24, 2024
1 parent f4877dc commit 9aebdb2
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 32 deletions.
13 changes: 5 additions & 8 deletions src/column.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { assembleLists } from './assemble.js'
import { convert, dereferenceDictionary } from './convert.js'
import { convertWithDictionary } from './convert.js'
import { readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
import { parquetHeader } from './header.js'
Expand Down Expand Up @@ -49,11 +49,9 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
seen += daph.num_values
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))

// construct output values: skip nulls and construct lists
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
// convert types, dereference dictionary, and assemble lists
values = convertWithDictionary(dataPage, dictionary, element, daph.encoding, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
Expand All @@ -79,10 +77,9 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
)
seen += daph2.num_values

values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
// convert types, dereference dictionary, and assemble lists
values = convertWithDictionary(dataPage, dictionary, element, daph2.encoding, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
Expand Down
55 changes: 31 additions & 24 deletions src/convert.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,40 @@
const dayMillis = 86400000 // 1 day in milliseconds

/**
 * Turn a decoded page into rich values, resolving dictionary indices if needed.
 *
 * When a dictionary is given and the encoding name ends in `_DICTIONARY`, the
 * dictionary itself is converted first, and every entry of `data` is then
 * replaced by the converted dictionary value it indexes. In every other case
 * `data` is converted directly.
 *
 * In the dictionary case `data` is overwritten in place and returned, unless
 * `data` is a Uint8Array and the converted dictionary is not: then a new array
 * is built with the dictionary's constructor so it can hold the values.
 *
 * @typedef {import('./types.js').DecodedArray} DecodedArray
 * @typedef {import('./types.js').SchemaElement} SchemaElement
 * @param {DecodedArray} data series of primitive types, or dictionary indices
 * @param {DecodedArray | undefined} dictionary dictionary values, if any
 * @param {SchemaElement} schemaElement schema element for the data
 * @param {import('./types.js').Encoding} encoding encoding of the data page
 * @param {boolean | undefined} utf8 decode bytes as utf8?
 * @returns {DecodedArray} series of rich types
 */
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
  // not dictionary-encoded: the page already holds the values themselves
  if (!dictionary || !encoding.endsWith('_DICTIONARY')) {
    return convert(data, schemaElement, utf8)
  }

  // convert the dictionary once, before dereferencing
  const lookup = convert(dictionary, schemaElement, utf8)

  // a Uint8Array of indices cannot store non-byte dictionary values
  const needsUpgrade = data instanceof Uint8Array && !(lookup instanceof Uint8Array)
  // @ts-expect-error build an array of the dictionary's own type via its constructor
  const output = needsUpgrade ? new lookup.constructor(data.length) : data

  const count = data.length
  for (let i = 0; i < count; i++) {
    output[i] = lookup[data[i]]
  }
  return output
}

/**
* Convert known types from primitive to rich.
*
* @param {DecodedArray} data series of primitive types
* @param {SchemaElement} schemaElement
* @param {boolean | undefined} utf8 decode bytes as utf8?
* @returns {DecodedArray} series of rich types
*/
Expand Down Expand Up @@ -125,25 +154,3 @@ export function parseFloat16(bytes) {
if (exp === 0x1f) return frac ? NaN : sign * Infinity
return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
}

/**
 * Replace every index in a data page with the dictionary value it refers to.
 *
 * Without a dictionary the data page is returned untouched. Otherwise the
 * data page is overwritten in place and returned, except when it is a
 * Uint8Array and the dictionary is not: then a new array is created with the
 * dictionary's constructor and filled instead.
 *
 * @param {DecodedArray | undefined} dictionary
 * @param {DecodedArray} dataPage
 * @returns {DecodedArray}
 */
export function dereferenceDictionary(dictionary, dataPage) {
  // nothing to look up
  if (!dictionary) return dataPage

  // a Uint8Array page cannot store values from a non-byte dictionary
  const mustUpgrade = dataPage instanceof Uint8Array && !(dictionary instanceof Uint8Array)
  // @ts-expect-error typescript does not know that constructor is constructable
  const output = mustUpgrade ? new dictionary.constructor(dataPage.length) : dataPage

  const count = dataPage.length
  for (let i = 0; i < count; i++) {
    output[i] = dictionary[dataPage[i]]
  }
  return output
}

0 comments on commit 9aebdb2

Please sign in to comment.