diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py
index b62deb44..2df96c58 100644
--- a/lilac/data/dataset_duckdb.py
+++ b/lilac/data/dataset_duckdb.py
@@ -3381,7 +3381,7 @@ def to_json(
           file.write(orjson.dumps(row))
           file.write('\n'.encode('utf-8'))
       else:
-        file.write(orjson.dumps(rows))
+        file.write(orjson.dumps(list(rows)))
     log(f'Dataset exported to {filepath}')
 
   @override
diff --git a/lilac/data/dataset_export_test.py b/lilac/data/dataset_export_test.py
index fff2c839..c60f035a 100644
--- a/lilac/data/dataset_export_test.py
+++ b/lilac/data/dataset_export_test.py
@@ -101,7 +101,7 @@ def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -
   filepath = tmp_path / 'dataset.json'
   dataset.to_json(filepath)
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     parsed_items = [json.loads(line) for line in f.readlines()]
 
   assert parsed_items == [{'text': 'hello'}, {'text': 'everybody'}]
@@ -109,7 +109,7 @@ def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -
   # Include signals.
   dataset.to_json(filepath, include_signals=True)
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     parsed_items = [json.loads(line) for line in f.readlines()]
 
   assert parsed_items == [
@@ -126,7 +126,7 @@ def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -
     include_signals=True,
   )
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     parsed_items = [json.loads(line) for line in f.readlines()]
 
   assert parsed_items == [
@@ -138,7 +138,62 @@ def test_export_to_json(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -
     filepath, filters=[('text.test_signal.flen', 'less_equal', '5')], include_signals=True
   )
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
+    parsed_items = [json.loads(line) for line in f.readlines()]
+
+  assert parsed_items == [{'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}}]
+
+
+def test_export_to_jsonl(make_test_data: TestDataMaker, tmp_path: pathlib.Path) -> None:
+  dataset = make_test_data([{'text': 'hello'}, {'text': 'everybody'}])
+  dataset.compute_signal(TestSignal(), 'text')
+
+  # Download all columns.
+  filepath = tmp_path / 'dataset.json'
+  dataset.to_json(filepath, jsonl=True)
+
+  with open(filepath, 'r') as f:
+    parsed_items = [json.loads(line) for line in f.readlines()]
+
+  assert parsed_items == [{'text': 'hello'}, {'text': 'everybody'}]
+
+  # Include signals.
+  dataset.to_json(filepath, jsonl=True, include_signals=True)
+
+  with open(filepath, 'r') as f:
+    parsed_items = [json.loads(line) for line in f.readlines()]
+
+  assert parsed_items == [
+    {'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}},
+    {'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}},
+  ]
+
+  # Download a subset of columns with filter.
+  filepath = tmp_path / 'dataset2.json'
+  dataset.to_json(
+    filepath,
+    jsonl=True,
+    columns=['text', 'text.test_signal'],
+    filters=[('text.test_signal.len', 'greater', '6')],
+    include_signals=True,
+  )
+
+  with open(filepath, 'r') as f:
+    parsed_items = [json.loads(line) for line in f.readlines()]
+
+  assert parsed_items == [
+    {'text': {VALUE_KEY: 'everybody', 'test_signal': {'flen': 9.0, 'len': 9}}}
+  ]
+
+  filepath = tmp_path / 'dataset3.json'
+  dataset.to_json(
+    filepath,
+    jsonl=True,
+    filters=[('text.test_signal.flen', 'less_equal', '5')],
+    include_signals=True,
+  )
+
+  with open(filepath, 'r') as f:
     parsed_items = [json.loads(line) for line in f.readlines()]
 
   assert parsed_items == [{'text': {VALUE_KEY: 'hello', 'test_signal': {'flen': 5.0, 'len': 5}}}]
@@ -152,7 +207,7 @@ def test_export_to_csv(make_test_data: TestDataMaker, tmp_path: pathlib.Path) ->
   filepath = tmp_path / 'dataset.csv'
   dataset.to_csv(filepath)
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     rows = list(csv.reader(f))
 
   assert rows == [
@@ -172,7 +227,7 @@ def test_export_to_csv_include_signals(
   filepath = tmp_path / 'dataset.csv'
   dataset.to_csv(filepath, include_signals=True)
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     rows = list(csv.reader(f))
 
   assert rows == [
@@ -196,7 +251,7 @@ def test_export_to_csv_subset_source_columns(
   filepath = tmp_path / 'dataset.csv'
   dataset.to_csv(filepath, columns=['age', 'metric'])
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     rows = list(csv.reader(f))
 
   assert rows == [
@@ -232,7 +287,7 @@ def test_export_to_csv_subset_of_nested_data(
   filepath = tmp_path / 'dataset.csv'
   dataset.to_csv(filepath, columns=['doc.content', 'doc.paragraphs.*.text'])
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     rows = list(csv.reader(f))
 
   assert rows == [
@@ -323,7 +378,7 @@ def test_label_and_export_by_excluding(
   filepath = tmp_path / 'dataset.json'
   dataset.to_json(filepath)
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     parsed_items = [json.loads(line) for line in f.readlines()]
 
   assert parsed_items == [{f'{DELETED_LABEL_NAME}': None, 'text': 'a'}]
@@ -332,7 +387,7 @@ def test_label_and_export_by_excluding(
   filepath = tmp_path / 'dataset.json'
   dataset.to_json(filepath, include_deleted=True)
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     parsed_items = [json.loads(line) for line in f.readlines()]
 
   assert parsed_items == [
@@ -357,7 +412,7 @@ def test_include_multiple_labels(make_test_data: TestDataMaker, tmp_path: pathli
   filepath = tmp_path / 'dataset.json'
   dataset.to_json(filepath, columns=['text'], include_labels=['good', 'very_good'])
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     parsed_items = [json.loads(line) for line in f.readlines()]
 
   parsed_items = sorted(parsed_items, key=lambda x: x['text'])
@@ -373,7 +428,7 @@ def test_exclude_multiple_labels(make_test_data: TestDataMaker, tmp_path: pathli
   filepath = tmp_path / 'dataset.json'
   dataset.to_json(filepath, columns=['text'], exclude_labels=['bad', 'very_bad'])
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     parsed_items = [json.loads(line) for line in f.readlines()]
 
   parsed_items = sorted(parsed_items, key=lambda x: x['text'])
@@ -389,7 +444,7 @@ def test_exclude_trumps_include(make_test_data: TestDataMaker, tmp_path: pathlib
   filepath = tmp_path / 'dataset.json'
   dataset.to_json(filepath, columns=['text'], include_labels=['good'], exclude_labels=['bad'])
 
-  with open(filepath) as f:
+  with open(filepath, 'r') as f:
     parsed_items = [json.loads(line) for line in f.readlines()]
 
   assert parsed_items == [{'text': 'b'}]
diff --git a/lilac/router_dataset.py b/lilac/router_dataset.py
index ffa78973..fbef6181 100644
--- a/lilac/router_dataset.py
+++ b/lilac/router_dataset.py
@@ -155,6 +155,7 @@ class SelectRowsOptions(BaseModel):
   offset: Optional[int] = None
   combine_columns: Optional[bool] = None
   include_deleted: bool = False
+  exclude_signals: bool = False
 
 
 class SelectRowsSchemaOptions(BaseModel):
@@ -206,6 +207,7 @@ def select_rows(
     offset=options.offset,
     combine_columns=options.combine_columns or False,
     include_deleted=options.include_deleted,
+    exclude_signals=options.exclude_signals,
     user=user,
   )
 
@@ -303,6 +305,7 @@ class ExportOptions(BaseModel):
   columns: Sequence[Path] = []
   include_labels: Sequence[str] = []
   exclude_labels: Sequence[str] = []
+  include_signals: bool = False
   # Note: "__deleted__" is "just" another label, and the UI
   # will default to adding the "__deleted__" label to the exclude_labels list. If the user wants
   # to include deleted items, they can remove the "__deleted__" label from the exclude_labels list.
@@ -328,20 +331,31 @@ def export_dataset(namespace: str, dataset_name: str, options: ExportOptions) ->
 
   if options.format == 'csv':
     dataset.to_csv(
-      options.filepath, options.columns, [], options.include_labels, options.exclude_labels
+      filepath=options.filepath,
+      columns=options.columns,
+      filters=[],
+      include_labels=options.include_labels,
+      exclude_labels=options.exclude_labels,
+      include_signals=options.include_signals,
     )
   elif options.format == 'json':
     dataset.to_json(
-      options.filepath,
-      options.jsonl or False,
-      options.columns,
-      [],
-      options.include_labels,
-      options.exclude_labels,
+      filepath=options.filepath,
+      jsonl=options.jsonl or False,
+      columns=options.columns,
+      filters=[],
+      include_labels=options.include_labels,
+      exclude_labels=options.exclude_labels,
+      include_signals=options.include_signals,
    )
   elif options.format == 'parquet':
     dataset.to_parquet(
-      options.filepath, options.columns, [], options.include_labels, options.exclude_labels
+      filepath=options.filepath,
+      columns=options.columns,
+      filters=[],
+      include_labels=options.include_labels,
+      exclude_labels=options.exclude_labels,
+      include_signals=options.include_signals,
    )
   else:
     raise ValueError(f'Unknown format: {options.format}')
diff --git a/web/blueprint/src/lib/components/datasetView/ExportModal.svelte b/web/blueprint/src/lib/components/datasetView/ExportModal.svelte
index a4db4032..0878e794 100644
--- a/web/blueprint/src/lib/components/datasetView/ExportModal.svelte
+++ b/web/blueprint/src/lib/components/datasetView/ExportModal.svelte
@@ -26,7 +26,6 @@
     NotificationActionButton,
     RadioButton,
     RadioButtonGroup,
-    SkeletonPlaceholder,
     SkeletonText,
     TextArea,
     TextInput,
@@ -41,21 +40,36 @@
   const formats: ExportOptions['format'][] = ['json', 'csv', 'parquet'];
   let selectedFormat: ExportOptions['format'] = 'json';
   let filepath = '';
-  let jsonl = false;
+  let jsonl = true;
 
   const dispatch = createEventDispatcher();
   const exportDataset = exportDatasetMutation();
   const datasetViewStore = getDatasetViewContext();
 
-  $: ({sourceFields, enrichedFields, labelFields, mapFields} = getFields(schema));
+  $: ({sourceFields, signalFields: signalFields, labelFields, mapFields} = getFields(schema));
 
   let checkedSourceFields: LilacField[] | undefined = undefined;
   let checkedLabeledFields: LilacField[] = [];
-  let checkedEnrichedFields: LilacField[] = [];
+  let checkedSignalFields: LilacField[] = [];
   let checkedMapFields: LilacField[] = [];
 
   let includeOnlyLabels: boolean[] = [];
   let excludeLabels: boolean[] = [];
+  let includeSignals = false;
+
+  function includeSignalsChecked(e: Event) {
+    includeSignals = (e.target as HTMLInputElement).checked;
+    if (includeSignals) {
+      checkedSignalFields = signalFields;
+    } else {
+      checkedSignalFields = [];
+    }
+  }
+  function signalCheckboxClicked() {
+    if (checkedSignalFields.length > 0) {
+      includeSignals = true;
+    }
+  }
 
   // Default the checked source fields to all of them.
   $: {
@@ -67,7 +81,7 @@
   $: exportFields = [
     ...(checkedSourceFields || []),
     ...checkedLabeledFields,
-    ...checkedEnrichedFields,
+    ...checkedSignalFields,
     ...checkedMapFields
   ];
 
@@ -76,7 +90,8 @@
       ? querySelectRows($datasetViewStore.namespace, $datasetViewStore.datasetName, {
          columns: exportFields.map(x => x.path),
          limit: 3,
-          combine_columns: true
+          combine_columns: true,
+          exclude_signals: !includeSignals
        })
       : null;
   $: exportDisabled =
@@ -87,21 +102,21 @@
 
     const petalFields = petals(schema).filter(f => !isEmbeddingField(f));
     const labelFields = allFields.filter(f => isLabelRootField(f));
-    const enrichedFields = allFields
-      .filter(f => isSignalField(f) || isClusterField(f))
+    const signalFields = allFields
+      .filter(f => isSignalField(f))
       .filter(f => !childFields(f).some(f => f.dtype?.type === 'embedding'));
-    const mapFields = allFields.filter(f => isMapField(f));
+    const mapFields = allFields.filter(f => isMapField(f) || isClusterField(f));
 
     const sourceFields = petalFields.filter(
       f =>
        !labelFields.includes(f) &&
-        !enrichedFields.includes(f) &&
+        !signalFields.includes(f) &&
        !mapFields.includes(f) &&
        // Labels are special in that we only show the root of the label field so the children do
        // not show up in the labelFields.
        !isLabelField(f)
     );
-    return {sourceFields, enrichedFields, labelFields, mapFields};
+    return {sourceFields, signalFields, labelFields, mapFields};
   }
 
   async function submit() {
@@ -113,7 +128,8 @@
       jsonl,
       columns: exportFields.map(x => x.path),
       include_labels: labelFields.filter((_, i) => includeOnlyLabels[i]).map(x => x.path[0]),
-      exclude_labels: labelFields.filter((_, i) => excludeLabels[i]).map(x => x.path[0])
+      exclude_labels: labelFields.filter((_, i) => excludeLabels[i]).map(x => x.path[0]),
+      include_signals: includeSignals
     };
     $exportDataset.mutate([namespace, datasetName, options]);
   }
@@ -150,7 +166,7 @@

       0}>
         No fields selected. Please select at least one field to export.
 
-
+
 
       Source fields
 
     {#if checkedSourceFields != null}
@@ -163,10 +179,23 @@
     {/if}
-    {#if enrichedFields.length > 0}
+    {#if signalFields.length > 0}
-
 
         Enriched fields
 
-
+
+
+          includeSignalsChecked(e)}
+        />
+
+
+        Signal fields
+
+
+          signalCheckboxClicked()}
+        />
     {/if}
     {#if mapFields.length > 0}
@@ -229,7 +258,7 @@
       />
     {#if selectedFormat === 'json'}
-
+
     {/if}
@@ -241,7 +270,7 @@
       hideCloseButton
     />
   {:else if $exportDataset.isLoading}
-
+
   {:else if $exportDataset.data}
diff --git a/web/blueprint/src/lib/components/datasetView/FieldList.svelte b/web/blueprint/src/lib/components/datasetView/FieldList.svelte
index 5fb04ff5..f6175fe0 100644
--- a/web/blueprint/src/lib/components/datasetView/FieldList.svelte
+++ b/web/blueprint/src/lib/components/datasetView/FieldList.svelte
@@ -9,10 +9,13 @@
     type LilacField
   } from '$lilac';
   import {Checkbox} from 'carbon-components-svelte';
+  import {createEventDispatcher} from 'svelte';
 
   export let fields: LilacField[];
   export let checkedFields: LilacField[];
 
+  const dispatch = createEventDispatcher();
+
   function checkboxClicked(field: LilacField, event: Event) {
     const checked = (event.target as HTMLInputElement).checked;
     if (checked) {
@@ -21,6 +24,7 @@
     } else {
       checkedFields = checkedFields.filter(f => !pathIsEqual(f.path, field.path));
     }
+    dispatch('change', checkedFields);
   }
 </script>
 
diff --git a/web/lib/fastapi_client/models/ExportOptions.ts b/web/lib/fastapi_client/models/ExportOptions.ts
index 709d01fc..06aa9784 100644
--- a/web/lib/fastapi_client/models/ExportOptions.ts
+++ b/web/lib/fastapi_client/models/ExportOptions.ts
@@ -13,5 +13,6 @@ export type ExportOptions = {
     columns?: Array<(Array<string> | string)>;
     include_labels?: Array<string>;
     exclude_labels?: Array<string>;
+    include_signals?: boolean;
 };
 
diff --git a/web/lib/fastapi_client/models/SelectRowsOptions.ts b/web/lib/fastapi_client/models/SelectRowsOptions.ts
index 68266cf4..fccff324 100644
--- a/web/lib/fastapi_client/models/SelectRowsOptions.ts
+++ b/web/lib/fastapi_client/models/SelectRowsOptions.ts
@@ -27,5 +27,6 @@ export type SelectRowsOptions = {
     offset?: (number | null);
     combine_columns?: (boolean | null);
     include_deleted?: boolean;
+    exclude_signals?: boolean;
 };