diff --git a/src/column.js b/src/column.js
index e7b8592..dfafc4b 100644
--- a/src/column.js
+++ b/src/column.js
@@ -1,5 +1,5 @@
 import { assembleLists } from './assemble.js'
-import { convert, dereferenceDictionary } from './convert.js'
+import { convertWithDictionary } from './convert.js'
 import { readDataPage, readDictionaryPage } from './datapage.js'
 import { readDataPageV2 } from './datapageV2.js'
 import { parquetHeader } from './header.js'
@@ -49,11 +49,9 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
       seen += daph.num_values
       // assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))
 
-      // construct output values: skip nulls and construct lists
-      values = dereferenceDictionary(dictionary, dataPage)
-      values = convert(values, element, utf8)
+      // convert types, dereference dictionary, and assemble lists
+      values = convertWithDictionary(dataPage, dictionary, element, daph.encoding, utf8)
       if (repetitionLevels.length || definitionLevels?.length) {
-        // Use repetition levels to construct lists
         const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
         const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
         const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
@@ -79,10 +77,9 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
       )
       seen += daph2.num_values
 
-      values = dereferenceDictionary(dictionary, dataPage)
-      values = convert(values, element, utf8)
+      // convert types, dereference dictionary, and assemble lists
+      values = convertWithDictionary(dataPage, dictionary, element, daph2.encoding, utf8)
       if (repetitionLevels.length || definitionLevels?.length) {
-        // Use repetition levels to construct lists
         const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
         const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
         const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
diff --git a/src/convert.js b/src/convert.js
index d555e6c..78ba055 100644
--- a/src/convert.js
+++ b/src/convert.js
@@ -1,11 +1,40 @@
 const dayMillis = 86400000 // 1 day in milliseconds
 
 /**
- * Convert known types from primitive to rich.
+ * Convert known types from primitive to rich, and dereference dictionary.
  *
  * @typedef {import('./types.js').DecodedArray} DecodedArray
+ * @typedef {import('./types.js').SchemaElement} SchemaElement
  * @param {DecodedArray} data series of primitive types
- * @param {import('./types.js').SchemaElement} schemaElement schema element for the data
+ * @param {DecodedArray | undefined} dictionary
+ * @param {SchemaElement} schemaElement
+ * @param {import('./types.js').Encoding} encoding
+ * @param {boolean | undefined} utf8 decode bytes as utf8?
+ * @returns {DecodedArray} series of rich types
+ */
+export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
+  if (dictionary && encoding.endsWith('_DICTIONARY')) {
+    // convert dictionary
+    dictionary = convert(dictionary, schemaElement, utf8)
+    let output = data
+    if (data instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
+      // @ts-expect-error upgrade data to match dictionary type with fancy constructor
+      output = new dictionary.constructor(data.length)
+    }
+    for (let i = 0; i < data.length; i++) {
+      output[i] = dictionary[data[i]]
+    }
+    return output
+  } else {
+    return convert(data, schemaElement, utf8)
+  }
+}
+
+/**
+ * Convert known types from primitive to rich.
+ *
+ * @param {DecodedArray} data series of primitive types
+ * @param {SchemaElement} schemaElement
  * @param {boolean | undefined} utf8 decode bytes as utf8?
  * @returns {DecodedArray} series of rich types
  */
@@ -125,25 +154,3 @@ export function parseFloat16(bytes) {
   if (exp === 0x1f) return frac ? NaN : sign * Infinity
   return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
 }
-
-/**
- * Map data to dictionary values in place.
- *
- * @param {DecodedArray | undefined} dictionary
- * @param {DecodedArray} dataPage
- * @returns {DecodedArray}
- */
-export function dereferenceDictionary(dictionary, dataPage) {
-  let output = dataPage
-  if (dictionary) {
-    if (dataPage instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
-      // upgrade dataPage to match dictionary type
-      // @ts-expect-error not my fault typescript doesn't understand constructors
-      output = new dictionary.constructor(dataPage.length)
-    }
-    for (let i = 0; i < dataPage.length; i++) {
-      output[i] = dictionary[dataPage[i]]
-    }
-  }
-  return output
-}