diff --git a/docs/source/explanations/catalog.md b/docs/source/explanations/catalog.md
index 7b6f7be3e..9bd7df587 100644
--- a/docs/source/explanations/catalog.md
+++ b/docs/source/explanations/catalog.md
@@ -54,7 +54,8 @@ and `assets`, describes the format, structure, and location of the data.
   to the Adapter
 - `management` --- enum indicating whether the data is registered `"external"`
   data or `"writable"` data managed by Tiled
-- `structure_family` --- enum of structure types (`"container"`, `"array"`, `"table"`, ...)
+- `structure_family` --- enum of structure types (`"container"`, `"array"`, `"table"`,
+  etc. -- except for `consolidated`, which cannot be assigned to a Data Source)
 - `structure_id` --- a foreign key to the `structures` table
 - `node_id` --- foreign key to `nodes`
 - `id` --- integer primary key
diff --git a/docs/source/explanations/structures.md b/docs/source/explanations/structures.md
index 65ce55deb..89039231d 100644
--- a/docs/source/explanations/structures.md
+++ b/docs/source/explanations/structures.md
@@ -11,7 +11,8 @@ The structure families are:
 * array --- a strided array, like a [numpy](https://numpy.org) array
 * awkward --- nested, variable-sized data (as implemented by
   [AwkwardArray](https://awkward-array.org/))
-* container --- a of other structures, akin to a dictionary or a directory
+* consolidated --- a container-like structure to combine tables and arrays in a common namespace
+* container --- a collection of other structures, akin to a dictionary or a directory
 * sparse --- a sparse array (i.e. an array which is mostly zeros)
 * table --- tabular data, as in [Apache Arrow](https://arrow.apache.org) or
   [pandas](https://pandas.pydata.org/)
@@ -575,3 +576,76 @@ response.
     "count": 5
   }
 }
+
+### Consolidated
+
+This is a specialized container-like structure designed to link together multiple tables and arrays that store
+related scientific data. It does not support nesting, but it provides a common namespace across all columns of the
+contained tables along with the arrays (thus, name collisions are forbidden). This makes it possible to abstract
+away the disparate internal storage mechanisms (e.g. Parquet for tables and zarr for arrays) and to present the
+user with a homogeneous interface for data access. Consolidated structures do not support pagination and are not
+recommended for "wide" datasets with more than ~1000 items (columns and arrays) in the namespace.
+
+Below is an example of a Consolidated structure that describes two tables and two arrays of various sizes. Their
+respective structures are specified in the `parts` list, and `all_keys` defines the internal namespace of directly
+addressable columns and arrays.
+
+```json
+{
+  "parts": [
+    {
+      "structure_family": "table",
+      "structure": {
+        "arrow_schema": "data:application/vnd.apache.arrow.file;base64,/////...FFFF",
+        "npartitions": 1,
+        "columns": ["A", "B"],
+        "resizable": false
+      },
+      "name": "table1"
+    },
+    {
+      "structure_family": "table",
+      "structure": {
+        "arrow_schema": "data:application/vnd.apache.arrow.file;base64,/////...FFFF",
+        "npartitions": 1,
+        "columns": ["C", "D", "E"],
+        "resizable": false
+      },
+      "name": "table2"
+    },
+    {
+      "structure_family": "array",
+      "structure": {
+        "data_type": {
+          "endianness": "little",
+          "kind": "f",
+          "itemsize": 8,
+          "dt_units": null
+        },
+        "chunks": [[3], [5]],
+        "shape": [3, 5],
+        "dims": null,
+        "resizable": false
+      },
+      "name": "F"
+    },
+    {
+      "structure_family": "array",
+      "structure": {
+        "data_type": {
+          "endianness": "not_applicable",
+          "kind": "u",
+          "itemsize": 1,
+          "dt_units": null
+        },
+        "chunks": [[5], [7], [3]],
+        "shape": [5, 7, 3],
+        "dims": null,
+        "resizable": false
+      },
+      "name": "G"
+    }
+  ],
+  "all_keys": ["A", "B", "C", "D", "E", "F", "G"]
+}
+```
diff --git a/docs/source/how-to/register.md b/docs/source/how-to/register.md
index 2ca73f068..835ea1cb3 100644
--- a/docs/source/how-to/register.md
+++ b/docs/source/how-to/register.md
@@ -72,7 +72,10 @@ Sometimes it is necessary to take more manual control of this registration
 process, such as if you want to take advantage of particular knowledge
 about the files to specify particular `metadata` or `specs`.
 
-Use the Python client, as in this example.
+#### Registering external data
+
+To register data from external files in Tiled, one can use the Python client and
+construct a DataSource object explicitly, passing the list of assets, as in the following example.
 
 ```py
 import numpy
@@ -112,3 +115,42 @@ client.new(
     specs=[],
 )
 ```
+
+#### Writing a consolidated structure
+
+Similarly, to create a consolidated container structure, one needs to specify
+its constituents as separate Data Sources. For example, the following
+consolidates a table and an array.
+
+```python
+import numpy
+import pandas
+
+from tiled.structures.array import ArrayStructure
+from tiled.structures.core import StructureFamily
+from tiled.structures.data_source import DataSource
+from tiled.structures.table import TableStructure
+
+rng = numpy.random.default_rng(12345)
+arr = rng.random(size=(3, 5), dtype="float64")
+df = pandas.DataFrame({"A": ["one", "two", "three"], "B": [1, 2, 3]})
+
+node = client.create_consolidated(
+    [
+        DataSource(
+            structure_family=StructureFamily.table,
+            structure=TableStructure.from_pandas(df),
+            name="table1",
+        ),
+        DataSource(
+            structure_family=StructureFamily.array,
+            structure=ArrayStructure.from_array(arr),
+            name="C",
+        )
+    ]
+)
+
+# Write the data
+node.parts["table1"].write(df)
+node.parts["C"].write_block(arr, (0, 0))
+```
diff --git a/docs/source/reference/service.md b/docs/source/reference/service.md
index 213a81d57..a28b00fb8 100644
--- a/docs/source/reference/service.md
+++ b/docs/source/reference/service.md
@@ -104,6 +104,8 @@ See {doc}`../explanations/structures` for more context.
tiled.structures.array.BuiltinDtype tiled.structures.array.Endianness tiled.structures.array.Kind + tiled.structures.consolidated.ConsolidatedStructure + tiled.structures.consolidated.ConsolidatedStructurePart tiled.structures.core.Spec tiled.structures.core.StructureFamily tiled.structures.table.TableStructure diff --git a/tiled/_tests/test_consolidated.py b/tiled/_tests/test_consolidated.py new file mode 100644 index 000000000..103be7520 --- /dev/null +++ b/tiled/_tests/test_consolidated.py @@ -0,0 +1,88 @@ +import numpy +import pandas +import pandas.testing +import pytest + +from ..catalog import in_memory +from ..client import Context, from_context +from ..server.app import build_app +from ..structures.array import ArrayStructure +from ..structures.core import StructureFamily +from ..structures.data_source import DataSource +from ..structures.table import TableStructure + +rng = numpy.random.default_rng(12345) + +df1 = pandas.DataFrame({"A": ["one", "two", "three"], "B": [1, 2, 3]}) +df2 = pandas.DataFrame( + { + "C": ["red", "green", "blue", "white"], + "D": [10.0, 20.0, 30.0, 40.0], + "E": [0, 0, 0, 0], + } +) +arr1 = rng.random(size=(3, 5), dtype="float64") +arr2 = rng.integers(0, 255, size=(5, 7, 3), dtype="uint8") +md = {"md_key1": "md_val1", "md_key2": 2} + + +@pytest.fixture(scope="module") +def tree(tmp_path_factory): + return in_memory(writable_storage=tmp_path_factory.getbasetemp()) + + +@pytest.fixture(scope="module") +def context(tree): + with Context.from_app(build_app(tree)) as context: + client = from_context(context) + x = client.create_consolidated( + [ + DataSource( + structure_family=StructureFamily.table, + structure=TableStructure.from_pandas(df1), + name="table1", + ), + DataSource( + structure_family=StructureFamily.table, + structure=TableStructure.from_pandas(df2), + name="table2", + ), + DataSource( + structure_family=StructureFamily.array, + structure=ArrayStructure.from_array(arr1), + name="F", + ), + DataSource( + structure_family=StructureFamily.array, + structure=ArrayStructure.from_array(arr2), + name="G", + ), + ], + key="x", + metadata=md, + ) + # Write by data source. 
+ x.parts["table1"].write(df1) + x.parts["table2"].write(df2) + x.parts["F"].write_block(arr1, (0, 0)) + x.parts["G"].write_block(arr2, (0, 0, 0)) + + yield context + + +def test_iterate_parts(context): + client = from_context(context) + for part in client["x"].parts: + client["x"].parts[part].read() + + +def test_iterate_columns(context): + client = from_context(context) + for col in client["x"]: + client["x"][col].read() + client[f"x/{col}"].read() + + +def test_metadata(context): + client = from_context(context) + assert client["x"].metadata == md diff --git a/tiled/_tests/test_dataframe.py b/tiled/_tests/test_dataframe.py index 1df2163bf..01570356b 100644 --- a/tiled/_tests/test_dataframe.py +++ b/tiled/_tests/test_dataframe.py @@ -41,6 +41,17 @@ pandas.DataFrame({f"column_{i:03d}": i * numpy.ones(5) for i in range(10)}), npartitions=1, ), + # a dataframe with mixed types + "diverse": DataFrameAdapter.from_pandas( + pandas.DataFrame( + { + "A": numpy.array([1, 2, 3], dtype="|u8"), + "B": numpy.array([1, 2, 3], dtype="<f8"), + "C": ["one", "two", "three"], + } + ), + npartitions=1, + ), } ) @@ -100,6 +111,17 @@ def test_dataframe_single_partition(context): pandas.testing.assert_frame_equal(actual, expected) +def test_reading_diverse_dtypes(context): + client = from_context(context) + expected = tree["diverse"].read() + actual = client["diverse"].read() + pandas.testing.assert_frame_equal(actual, expected) + + for col in expected.columns: + actual = client["diverse"][col].read() + assert numpy.array_equal(expected[col], actual) + + def test_dask(context): client = from_context(context, "dask")["basic"] expected = tree["basic"].read() diff --git a/tiled/_tests/test_writing.py b/tiled/_tests/test_writing.py index 117edd6f0..8e70ed864 100644 --- a/tiled/_tests/test_writing.py +++ b/tiled/_tests/test_writing.py @@ -676,7 +676,7 @@ def test_append_partition( assert_frame_equal(x.read(), df3, check_dtype=False) -def test_union_one_table(tree): +def test_consolidated_one_table(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df = pandas.DataFrame({"A": [], "B": []}) @@ -686,17 +686,17 @@ def test_union_one_table(tree): structure=structure, name="table", ) - client.create_union([data_source], key="x") + client.create_consolidated([data_source], key="x") -def test_union_two_tables(tree): +def test_consolidated_two_tables(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df1 = pandas.DataFrame({"A": [], "B": []}) df2 = pandas.DataFrame({"C": [], "D": [], "E": []}) structure1 = TableStructure.from_pandas(df1) structure2 = TableStructure.from_pandas(df2) - x = client.create_union( + x = client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, @@ -717,7 +717,7 @@ def test_union_two_tables(tree): x.parts["table2"].read() -def test_union_two_tables_colliding_names(tree): +def test_consolidated_two_tables_colliding_names(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df1 = pandas.DataFrame({"A": [], "B": []}) @@ -725,7 +725,7 @@ def test_union_two_tables_colliding_names(tree): structure1 = TableStructure.from_pandas(df1) structure2 = TableStructure.from_pandas(df2) with fail_with_status_code(422): - client.create_union( + client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, @@ -742,7 +742,7 @@ def test_union_two_tables_colliding_names(tree): ) -def test_union_two_tables_colliding_keys(tree): +def 
test_consolidated_two_tables_colliding_keys(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df1 = pandas.DataFrame({"A": [], "B": []}) @@ -750,7 +750,7 @@ def test_union_two_tables_colliding_keys(tree): structure1 = TableStructure.from_pandas(df1) structure2 = TableStructure.from_pandas(df2) with fail_with_status_code(422): - client.create_union( + client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, @@ -767,7 +767,7 @@ def test_union_two_tables_colliding_keys(tree): ) -def test_union_two_tables_two_arrays(tree): +def test_consolidated_two_tables_two_arrays(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df1 = pandas.DataFrame({"A": [], "B": []}) @@ -778,7 +778,7 @@ def test_union_two_tables_two_arrays(tree): structure2 = TableStructure.from_pandas(df2) structure3 = ArrayStructure.from_array(arr1) structure4 = ArrayStructure.from_array(arr2) - x = client.create_union( + x = client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, @@ -820,7 +820,7 @@ def test_union_two_tables_two_arrays(tree): x[column].read() -def test_union_table_column_array_key_collision(tree): +def test_consolidated_table_column_array_key_collision(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df = pandas.DataFrame({"A": [], "B": []}) @@ -828,7 +828,7 @@ def test_union_table_column_array_key_collision(tree): structure1 = TableStructure.from_pandas(df) structure2 = ArrayStructure.from_array(arr) with fail_with_status_code(422): - client.create_union( + client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, diff --git a/tiled/adapters/arrow.py b/tiled/adapters/arrow.py index 9c2da456c..975c27fa0 100644 --- a/tiled/adapters/arrow.py +++ b/tiled/adapters/arrow.py @@ -129,7 +129,7 @@ def generate_data_sources( """ return [ DataSource( - structure_family=self.structure_family, + structure_family=StructureFamily.table, mimetype=mimetype, structure=dict_or_none(self.structure()), parameters={}, diff --git a/tiled/adapters/parquet.py b/tiled/adapters/parquet.py index 3438dc995..d7bf14304 100644 --- a/tiled/adapters/parquet.py +++ b/tiled/adapters/parquet.py @@ -9,6 +9,7 @@ from ..structures.table import TableStructure from ..type_aliases import JSON from ..utils import path_from_uri +from .array import ArrayAdapter from .dataframe import DataFrameAdapter from .protocols import AccessPolicy @@ -165,5 +166,5 @@ def structure(self) -> TableStructure: """ return self._structure - def get(self, key): + def get(self, key: str) -> Union[ArrayAdapter, None]: return self.dataframe_adapter.get(key) diff --git a/tiled/adapters/table.py b/tiled/adapters/table.py index 01a89a5d7..dab586edb 100644 --- a/tiled/adapters/table.py +++ b/tiled/adapters/table.py @@ -156,29 +156,25 @@ def __repr__(self) -> str: return f"{type(self).__name__}({self._structure.columns!r})" def __getitem__(self, key: str) -> ArrayAdapter: - """ + # Must compute to determine shape + array = self.read([key])[key].values - Parameters - ---------- - key : + # Convert (experimental) pandas.StringDtype to numpy's unicode string dtype + if isinstance(array.dtype, pandas.StringDtype): + import numpy - Returns - ------- + max_size = max((len(i) for i in array.ravel())) + array = array.astype(dtype=numpy.dtype(f"<U{max_size}")) - """ - # Must compute to determine shape. 
- return ArrayAdapter.from_array(self.read([key])[key].values) + return ArrayAdapter.from_array(array) - def get(self, key): + def get(self, key: str) -> Union[ArrayAdapter, None]: if key not in self.structure().columns: return None - return ArrayAdapter.from_array(self.read([key])[key].values) + return self[key] - def items(self): - yield from ( - (key, ArrayAdapter.from_array(self.read([key])[key].values)) - for key in self._structure.columns - ) + def items(self) -> Iterator[Tuple[str, ArrayAdapter]]: + yield from ((key, self[key]) for key in self._structure.columns) def metadata(self) -> JSON: """ diff --git a/tiled/catalog/adapter.py b/tiled/catalog/adapter.py index e88a23cdf..007dd9d11 100644 --- a/tiled/catalog/adapter.py +++ b/tiled/catalog/adapter.py @@ -62,8 +62,8 @@ ZARR_MIMETYPE, ) from ..query_registration import QueryTranslationRegistry +from ..server.pydantic_consolidated import ConsolidatedStructure from ..server.pydantic_container import ContainerStructure -from ..server.pydantic_union import UnionStructure, UnionStructurePart from ..server.schemas import Asset, DataSource, Management, Revision, Spec from ..structures.core import StructureFamily from ..utils import ( @@ -381,22 +381,8 @@ def structure(self): if self.structure_family == StructureFamily.container: # Give no inlined contents. return ContainerStructure(contents=None, count=None) - if self.structure_family == StructureFamily.union: - parts = [] - all_keys = [] - for data_source in self.data_sources: - parts.append( - UnionStructurePart( - structure=data_source.structure, - structure_family=data_source.structure_family, - name=data_source.name, - ) - ) - if data_source.structure_family == StructureFamily.table: - all_keys.extend(data_source.structure.columns) - else: - all_keys.append(data_source.name) - return UnionStructure(parts=parts, all_keys=all_keys) + if self.structure_family == StructureFamily.consolidated: + return ConsolidatedStructure.from_data_sources(self.data_sources) if self.data_sources: assert len(self.data_sources) == 1 # more not yet implemented return self.data_sources[0].structure @@ -461,11 +447,11 @@ async def lookup_adapter( for i in range(len(segments)): catalog_adapter = await self.lookup_adapter(segments[:i]) - if (catalog_adapter.structure_family == StructureFamily.union) and len( - segments[i:] - ) == 1: - # All the segments but the final segment, segments[-1], resolves - # resolve to a union structure. Dispatch to the union Adapter + if ( + catalog_adapter.structure_family == StructureFamily.consolidated + ) and len(segments[i:]) == 1: + # All the segments but the final segment, segments[-1], resolve + # to a consolidated structure. Dispatch to the consolidated Adapter # to get the inner Adapter for whatever type of structure it is. 
return await ensure_awaitable(catalog_adapter.get, segments[-1]) if catalog_adapter.data_sources: @@ -680,7 +666,7 @@ async def create_node( ] data_source.parameters = {} data_uri_path_parts = self.segments + [key] - if structure_family == StructureFamily.union: + if structure_family == StructureFamily.consolidated: data_uri_path_parts.append(data_source.name) data_uri = str(self.context.writable_storage) + "".join( f"/{quote_plus(segment)}" for segment in data_uri_path_parts @@ -1155,7 +1141,7 @@ async def append_partition(self, *args, **kwargs): ) -class CatalogUnionAdapter(CatalogNodeAdapter): +class CatalogConsolidatedAdapter(CatalogNodeAdapter): async def get(self, key): if key not in self.structure().all_keys: return None @@ -1573,5 +1559,5 @@ def specs_array_to_json(specs): StructureFamily.container: CatalogContainerAdapter, StructureFamily.sparse: CatalogSparseAdapter, StructureFamily.table: CatalogTableAdapter, - StructureFamily.union: CatalogUnionAdapter, + StructureFamily.consolidated: CatalogConsolidatedAdapter, } diff --git a/tiled/catalog/migrations/versions/0dc110294112_add_union_to_structure_family_enum.py b/tiled/catalog/migrations/versions/0dc110294112_add_union_to_structure_family_enum.py index bf83c8baa..5dc04c83f 100644 --- a/tiled/catalog/migrations/versions/0dc110294112_add_union_to_structure_family_enum.py +++ b/tiled/catalog/migrations/versions/0dc110294112_add_union_to_structure_family_enum.py @@ -1,4 +1,4 @@ -"""Add 'union' to structure_family enum. +"""Add 'consolidated' to structure_family enum. Revision ID: 0dc110294112 Revises: 7c8130c40b8f @@ -22,7 +22,7 @@ def upgrade(): with op.get_context().autocommit_block(): op.execute( sa.text( - "ALTER TYPE structurefamily ADD VALUE IF NOT EXISTS 'union' AFTER 'table'" + "ALTER TYPE structurefamily ADD VALUE IF NOT EXISTS 'consolidated' AFTER 'table'" ) ) diff --git a/tiled/catalog/orm.py b/tiled/catalog/orm.py index a176a9daf..190969eec 100644 --- a/tiled/catalog/orm.py +++ b/tiled/catalog/orm.py @@ -369,7 +369,7 @@ class DataSource(Timestamped, Base): # This relates to the mutability of the data. management = Column(Enum(Management), nullable=False) structure_family = Column(Enum(StructureFamily), nullable=False) - # This is used by `union` structures to address arrays. + # This is used by `consolidated` structures to address arrays. # It may have additional uses in the future. 
name = Column(Unicode(1023), nullable=True) diff --git a/tiled/client/union.py b/tiled/client/consolidated.py similarity index 83% rename from tiled/client/union.py rename to tiled/client/consolidated.py index 525837b89..38a08e228 100644 --- a/tiled/client/union.py +++ b/tiled/client/consolidated.py @@ -5,7 +5,7 @@ from .utils import MSGPACK_MIME_TYPE, ClientError, client_for_item, handle_error -class UnionClient(BaseClient): +class ConsolidatedClient(BaseClient): def __repr__(self): return ( f"<{type(self).__name__} {{" @@ -15,7 +15,7 @@ def __repr__(self): @property def parts(self): - return UnionContents(self) + return ConsolidatedContents(self) def __getitem__(self, key): if key not in self.structure().all_keys: @@ -47,8 +47,11 @@ def __getitem__(self, key): include_data_sources=self._include_data_sources, ) + def __iter__(self): + yield from self.structure().all_keys -class UnionContents: + +class ConsolidatedContents: def __init__(self, node): self.node = node @@ -60,10 +63,10 @@ def __repr__(self): ) def __getitem__(self, name): - for index, union_item in enumerate(self.node.structure().parts): - if union_item.name == name: - structure_family = union_item.structure_family - structure_dict = union_item.structure + for index, item in enumerate(self.node.structure().parts): + if item.name == name: + structure_family = item.structure_family + structure_dict = item.structure break else: raise KeyError(name) @@ -80,3 +83,7 @@ def __getitem__(self, name): structure=structure, include_data_sources=self.node._include_data_sources, ) + + def __iter__(self): + for item in self.node.structure().parts: + yield item.name diff --git a/tiled/client/container.py b/tiled/client/container.py index 937059b32..2e011fa51 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -643,7 +643,7 @@ def new( if structure_family == StructureFamily.container: structure = {"contents": None, "count": None} - elif structure_family == StructureFamily.union: + elif structure_family == StructureFamily.consolidated: structure = None # To be filled in below, by server response. # We need the server to tell us data_source_ids. @@ -685,7 +685,7 @@ def new( # to attempt to avoid bumping into size limits. _SUGGESTED_MAX_UPLOAD_SIZE = 100_000_000 # 100 MB - def create_container(self, key=None, *, metadata=None, dims=None, specs=None): + def create_container(self, key=None, *, metadata=None, specs=None): """ EXPERIMENTAL: Create a new, empty container. @@ -696,8 +696,6 @@ def create_container(self, key=None, *, metadata=None, dims=None, specs=None): metadata : dict, optional User metadata. May be nested. Must contain only basic types (e.g. numbers, strings, lists, dicts) that are JSON-serializable. - dims : List[str], optional - A label for each dimension of the array. specs : List[Spec], optional List of names that are used to label that the data and/or metadata conform to some named standard specification. @@ -711,25 +709,25 @@ def create_container(self, key=None, *, metadata=None, dims=None, specs=None): specs=specs, ) - def create_union(self, data_sources, key=None, *, metadata=None, specs=None): + def create_consolidated(self, data_sources, key=None, *, metadata=None, specs=None): """ - EXPERIMENTAL: Create a new union backed by data sources. + EXPERIMENTAL: Create a new consolidated node backed by data sources. Parameters ---------- data_sources : List[DataSources] + key : str, optional + Key (name) for this new node. If None, the server will provide a unique key. 
metadata : dict, optional User metadata. May be nested. Must contain only basic types (e.g. numbers, strings, lists, dicts) that are JSON-serializable. - dims : List[str], optional - A label for each dimension of the array. specs : List[Spec], optional List of names that are used to label that the data and/or metadata conform to some named standard specification. """ return self.new( - StructureFamily.union, + StructureFamily.consolidated, data_sources, key=key, metadata=metadata, @@ -1083,7 +1081,9 @@ def _write_partition(x, partition_info, client): "table": _LazyLoad( ("..dataframe", Container.__module__), "DataFrameClient" ), - "union": _LazyLoad(("..union", Container.__module__), "UnionClient"), + "consolidated": _LazyLoad( + ("..consolidated", Container.__module__), "ConsolidatedClient" + ), "xarray_dataset": _LazyLoad( ("..xarray", Container.__module__), "DatasetClient" ), @@ -1102,7 +1102,9 @@ def _write_partition(x, partition_info, client): "table": _LazyLoad( ("..dataframe", Container.__module__), "DaskDataFrameClient" ), - "union": _LazyLoad(("..union", Container.__module__), "UnionClient"), + "consolidated": _LazyLoad( + ("..consolidated", Container.__module__), "ConsolidatedClient" + ), "xarray_dataset": _LazyLoad( ("..xarray", Container.__module__), "DaskDatasetClient" ), diff --git a/tiled/server/dependencies.py b/tiled/server/dependencies.py index 6cc699e6e..ef94a5172 100644 --- a/tiled/server/dependencies.py +++ b/tiled/server/dependencies.py @@ -132,14 +132,14 @@ async def inner( entry.structure_family in structure_families ): return entry - # Handle union structure_family - if entry.structure_family == StructureFamily.union: + # Handle consolidated structure_family + if entry.structure_family == StructureFamily.consolidated: if not part: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=( "A part query parameter is required on this endpoint " - "when addressing a 'union' structure." + "when addressing a 'consolidated' structure." ), ) entry_for_part = entry.for_part(part) diff --git a/tiled/server/links.py b/tiled/server/links.py index ff186ebbc..2d1db43a6 100644 --- a/tiled/server/links.py +++ b/tiled/server/links.py @@ -37,7 +37,7 @@ def links_for_awkward(structure_family, structure, base_url, path_str, part=None def links_for_container(structure_family, structure, base_url, path_str): - # Cannot be used inside union, so there is no part parameter. + # Cannot be used inside consolidated, so there is no part parameter. links = {} links["full"] = f"{base_url}/container/full/{path_str}" links["search"] = f"{base_url}/search/{path_str}" @@ -54,7 +54,7 @@ def links_for_table(structure_family, structure, base_url, path_str, part=None): return links -def links_for_union(structure_family, structure, base_url, path_str): +def links_for_consolidated(structure_family, structure, base_url, path_str): links = {} # This contains the links for each structure. 
links["parts"] = [] @@ -77,5 +77,5 @@ def links_for_union(structure_family, structure, base_url, path_str): StructureFamily.container: links_for_container, StructureFamily.sparse: links_for_array, # sparse and array are the same StructureFamily.table: links_for_table, - StructureFamily.union: links_for_union, + StructureFamily.consolidated: links_for_consolidated, } diff --git a/tiled/server/pydantic_consolidated.py b/tiled/server/pydantic_consolidated.py new file mode 100644 index 000000000..5f6c49d07 --- /dev/null +++ b/tiled/server/pydantic_consolidated.py @@ -0,0 +1,49 @@ +from typing import Any, List, Optional + +import pydantic + +from ..structures.core import StructureFamily + + +class ConsolidatedStructurePart(pydantic.BaseModel): + structure_family: StructureFamily + structure: Any # Union of Structures, but we do not want to import them... + name: str + + @classmethod + def from_json(cls, item): + return cls(**item) + + +class ConsolidatedStructure(pydantic.BaseModel): + parts: List[ConsolidatedStructurePart] + all_keys: Optional[List[str]] + + @classmethod + def from_json(cls, structure): + return cls( + parts=[ + ConsolidatedStructurePart.from_json(item) for item in structure["parts"] + ], + all_keys=structure["all_keys"], + ) + + @classmethod + def from_data_sources(cls, data_sources): + all_keys = [] + for data_source in data_sources: + if data_source.structure_family == StructureFamily.table: + all_keys.extend(data_source.structure.columns) + else: + all_keys.append(data_source.name) + parts = [ + ConsolidatedStructurePart( + data_source_id=data_source.id, + structure=data_source.structure, + structure_family=data_source.structure_family, + name=data_source.name, + ) + for data_source in data_sources + ] + + return cls(parts=parts, all_keys=all_keys) diff --git a/tiled/server/pydantic_union.py b/tiled/server/pydantic_union.py deleted file mode 100644 index 7d13645df..000000000 --- a/tiled/server/pydantic_union.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Any, List, Optional - -import pydantic - -from ..structures.core import StructureFamily - - -class UnionStructurePart(pydantic.BaseModel): - structure_family: StructureFamily - structure: Any # Union of Structures, but we do not want to import them... - name: str - - @classmethod - def from_json(cls, item): - return cls(**item) - - -class UnionStructure(pydantic.BaseModel): - parts: List[UnionStructurePart] - all_keys: Optional[List[str]] - - @classmethod - def from_json(cls, structure): - return cls( - parts=[UnionStructurePart.from_json(item) for item in structure["parts"]], - all_keys=structure["all_keys"], - ) diff --git a/tiled/server/router.py b/tiled/server/router.py index be4bd600c..b72b4bc3a 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -27,7 +27,6 @@ ) from .. import __version__ -from ..server.pydantic_union import UnionStructure, UnionStructurePart from ..structures.core import Spec, StructureFamily from ..utils import ensure_awaitable, patch_mimetypes, path_from_uri from ..validation_registration import ValidationError @@ -62,6 +61,7 @@ ) from .file_response_with_range import FileResponseWithRange from .links import links_for_node +from .pydantic_consolidated import ConsolidatedStructure from .settings import get_settings from .utils import filter_for_access, get_base_url, record_timing @@ -423,7 +423,7 @@ async def array_block( "Use slicing ('?slice=...') to request smaller chunks." 
), ) - if entry.structure_family == StructureFamily.union: + if entry.structure_family == StructureFamily.consolidated: structure_family = entry.data_source.structure_family else: structure_family = entry.structure_family @@ -464,7 +464,7 @@ async def array_full( """ Fetch a slice of array-like data. """ - if entry.structure_family == StructureFamily.union: + if entry.structure_family == StructureFamily.consolidated: structure_family = entry.data_source.structure_family else: structure_family = entry.structure_family @@ -729,7 +729,7 @@ async def table_full( "request a smaller chunks." ), ) - if entry.structure_family == StructureFamily.union: + if entry.structure_family == StructureFamily.consolidated: structure_family = entry.data_source.structure_family else: structure_family = entry.structure_family @@ -1160,25 +1160,8 @@ async def _create_node( body.specs, ) metadata_modified = False - if structure_family == StructureFamily.union: - all_keys = [] - for data_source in body.data_sources: - if data_source.structure_family == StructureFamily.table: - all_keys.extend(data_source.structure.columns) - else: - all_keys.append(data_source.name) - structure = UnionStructure( - parts=[ - UnionStructurePart( - data_source_id=data_source.id, - structure=data_source.structure, - structure_family=data_source.structure_family, - name=data_source.name, - ) - for data_source in body.data_sources - ], - all_keys=all_keys, - ) + if structure_family == StructureFamily.consolidated: + structure = ConsolidatedStructure.from_data_sources(body.data_sources) elif body.data_sources: assert len(body.data_sources) == 1 # more not yet implemented structure = body.data_sources[0].structure diff --git a/tiled/server/schemas.py b/tiled/server/schemas.py index e0edd8ecf..58d996edb 100644 --- a/tiled/server/schemas.py +++ b/tiled/server/schemas.py @@ -14,9 +14,9 @@ from ..structures.data_source import Management, validate_data_sources from .pydantic_array import ArrayStructure from .pydantic_awkward import AwkwardStructure +from .pydantic_consolidated import ConsolidatedStructure from .pydantic_sparse import SparseStructure from .pydantic_table import TableStructure -from .pydantic_union import UnionStructure if TYPE_CHECKING: import tiled.authn_database.orm @@ -149,7 +149,7 @@ class DataSource(pydantic.BaseModel): NodeStructure, SparseStructure, TableStructure, - UnionStructure, + ConsolidatedStructure, ] ] = None mimetype: Optional[str] = None @@ -186,7 +186,7 @@ class NodeAttributes(pydantic.BaseModel): NodeStructure, SparseStructure, TableStructure, - UnionStructure, + ConsolidatedStructure, ] ] = None @@ -235,9 +235,9 @@ class SparseLinks(pydantic.BaseModel): block: str -class UnionLinks(pydantic.BaseModel): +class ConsolidatedLinks(pydantic.BaseModel): self: str - contents: List[ + parts: List[ Union[ArrayLinks, AwkwardLinks, ContainerLinks, DataFrameLinks, SparseLinks] ] @@ -248,7 +248,7 @@ class UnionLinks(pydantic.BaseModel): StructureFamily.container: ContainerLinks, StructureFamily.sparse: SparseLinks, StructureFamily.table: DataFrameLinks, - StructureFamily.union: UnionLinks, + StructureFamily.consolidated: ConsolidatedLinks, } @@ -480,14 +480,14 @@ class PutDataSourceRequest(pydantic.BaseModel): class PostMetadataResponse(pydantic.BaseModel, Generic[ResourceLinksT]): id: str - links: Union[ArrayLinks, DataFrameLinks, SparseLinks, UnionLinks] + links: Union[ArrayLinks, DataFrameLinks, SparseLinks, ConsolidatedLinks] structure: Union[ ArrayStructure, AwkwardStructure, NodeStructure, SparseStructure, 
TableStructure,
-        UnionStructure,
+        ConsolidatedStructure,
     ]
     metadata: Dict
     data_sources: List[DataSource]
diff --git a/tiled/structures/array.py b/tiled/structures/array.py
index 53207b84e..901c58fc9 100644
--- a/tiled/structures/array.py
+++ b/tiled/structures/array.py
@@ -52,7 +52,7 @@ class Kind(str, enum.Enum):
     unicode = "U"  # fixed-length sequence of Py_UNICODE
     other = "V"  # "V" is for "void" -- generic fixed-size chunk of memory
 
-    # By default, do not tolerate numpy objectg arrays
+    # By default, do not tolerate numpy object arrays
     if os.getenv("TILED_ALLOW_OBJECT_ARRAYS", "0") != "0":
         object = "O"  # Object (i.e. the memory contains a pointer to PyObject)
diff --git a/tiled/structures/consolidated.py b/tiled/structures/consolidated.py
new file mode 100644
index 000000000..4e5a3bfc4
--- /dev/null
+++ b/tiled/structures/consolidated.py
@@ -0,0 +1,49 @@
+import dataclasses
+from typing import Any, List, Optional
+
+from .core import StructureFamily
+
+
+@dataclasses.dataclass
+class ConsolidatedStructurePart:
+    structure_family: StructureFamily
+    structure: Any  # Union of Structures, but we do not want to import them...
+    name: Optional[str]
+
+    @classmethod
+    def from_json(cls, item):
+        return cls(**item)
+
+
+@dataclasses.dataclass
+class ConsolidatedStructure:
+    parts: List[ConsolidatedStructurePart]
+    all_keys: List[str]
+
+    @classmethod
+    def from_json(cls, structure):
+        return cls(
+            parts=[
+                ConsolidatedStructurePart.from_json(item) for item in structure["parts"]
+            ],
+            all_keys=structure["all_keys"],
+        )
+
+    @classmethod
+    def from_data_sources(cls, data_sources):
+        all_keys = []
+        for data_source in data_sources:
+            if data_source.structure_family == StructureFamily.table:
+                all_keys.extend(data_source.structure.columns)
+            else:
+                all_keys.append(data_source.name)
+        parts = [
+            ConsolidatedStructurePart(
+                structure=data_source.structure,
+                structure_family=data_source.structure_family,
+                name=data_source.name,
+            )
+            for data_source in data_sources
+        ]
+
+        return cls(parts=parts, all_keys=all_keys)
diff --git a/tiled/structures/core.py b/tiled/structures/core.py
index 8f5e10795..5612b1623 100644
--- a/tiled/structures/core.py
+++ b/tiled/structures/core.py
@@ -12,22 +12,13 @@
 from ..utils import OneShotCachedMap
 
 
-class BaseStructureFamily(str, enum.Enum):
-    array = "array"
-    awkward = "awkward"
-    container = "container"
-    sparse = "sparse"
-    table = "table"
-    # excludes union, which DataSources cannot have
-
-
 class StructureFamily(str, enum.Enum):
     array = "array"
     awkward = "awkward"
     container = "container"
     sparse = "sparse"
     table = "table"
-    union = "union"
+    consolidated = "consolidated"  # cannot be used in DataSources
@@ -73,8 +64,8 @@ def dict(self) -> Dict[str, Optional[str]]:
         StructureFamily.sparse: lambda: importlib.import_module(
             "...structures.sparse", StructureFamily.__module__
         ).SparseStructure,
-        StructureFamily.union: lambda: importlib.import_module(
-            "...structures.union", StructureFamily.__module__
-        ).UnionStructure,
+        StructureFamily.consolidated: lambda: importlib.import_module(
+            "...structures.consolidated", StructureFamily.__module__
+        ).ConsolidatedStructure,
     }
 )
diff --git a/tiled/structures/data_source.py b/tiled/structures/data_source.py
index c3c65bfca..679fc0346 100644
--- a/tiled/structures/data_source.py
+++ b/tiled/structures/data_source.py
@@ -3,7 +3,7 @@
 import enum
 from typing import Any, List, Optional
 
-from ..structures.core import BaseStructureFamily, StructureFamily
+from ..structures.core import StructureFamily
 
 
 class Management(str, enum.Enum):
@@ -24,7 +24,7 @@ class Asset:
 @dataclasses.dataclass
 class DataSource:
-    structure_family: BaseStructureFamily
+    structure_family: StructureFamily
     structure: Any
     id: Optional[int] = None
     mimetype: Optional[str] = None
@@ -33,6 +33,12 @@ class DataSource:
     management: Management = Management.writable
     name: Optional[str] = None
 
+    def __post_init__(self):
+        if self.structure_family == StructureFamily.consolidated:
+            raise ValueError(
+                "DataSource cannot be initialized with the 'consolidated' StructureFamily"
+            )
+
     @classmethod
     def from_json(cls, d):
         d = d.copy()
@@ -54,14 +60,14 @@ def validate_container_data_sources(node_structure_family, data_sources):
     return data_sources
 
 
-def validate_union_data_sources(node_structure_family, data_sources):
+def validate_consolidated_data_sources(node_structure_family, data_sources):
     "Check that column names and keys of others (e.g. arrays) do not collide."
     keys = set()
     names = set()
     for data_source in data_sources:
         if data_source.name is None:
             raise ValueError(
-                "Data sources backing a union structure_family must "
+                "Data sources backing a consolidated structure_family must "
                 "all have non-NULL names."
             )
         if data_source.name in names:
@@ -95,4 +101,4 @@ def validate_other_data_sources(node_structure_family, data_sources):
 
 validators = collections.defaultdict(lambda: validate_other_data_sources)
 validators[StructureFamily.container] = validate_container_data_sources
-validators[StructureFamily.union] = validate_union_data_sources
+validators[StructureFamily.consolidated] = validate_consolidated_data_sources
diff --git a/tiled/structures/union.py b/tiled/structures/union.py
deleted file mode 100644
index 3d4a6cc4b..000000000
--- a/tiled/structures/union.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import dataclasses
-from typing import Any, List, Optional
-
-from .core import StructureFamily
-
-
-@dataclasses.dataclass
-class UnionStructurePart:
-    structure_family: StructureFamily
-    structure: Any  # Union of Structures, but we do not want to import them...
-    name: Optional[str]
-
-    @classmethod
-    def from_json(cls, item):
-        return cls(**item)
-
-
-@dataclasses.dataclass
-class UnionStructure:
-    parts: List[UnionStructurePart]
-    all_keys: List[str]
-
-    @classmethod
-    def from_json(cls, structure):
-        return cls(
-            parts=[UnionStructurePart.from_json(item) for item in structure["parts"]],
-            all_keys=structure["all_keys"],
-        )
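
For reference, the complete round trip introduced by this change — create a consolidated node, write each part, and read everything back through the flat namespace — looks like the following sketch. It mirrors `tiled/_tests/test_consolidated.py` above; the in-memory catalog, the storage path, and the keys and data are illustrative, not part of the diff.

```python
import numpy
import pandas

from tiled.catalog import in_memory
from tiled.client import Context, from_context
from tiled.server.app import build_app
from tiled.structures.array import ArrayStructure
from tiled.structures.core import StructureFamily
from tiled.structures.data_source import DataSource
from tiled.structures.table import TableStructure

df = pandas.DataFrame({"A": ["one", "two", "three"], "B": [1, 2, 3]})
arr = numpy.random.default_rng(12345).random((3, 5))

# Serve an in-memory catalog in-process, as the tests do.
catalog = in_memory(writable_storage="/tmp/tiled_scratch")  # illustrative path
with Context.from_app(build_app(catalog)) as context:
    client = from_context(context)

    # One node, two data sources sharing one flat namespace: "A", "B", "F".
    node = client.create_consolidated(
        [
            DataSource(
                structure_family=StructureFamily.table,
                structure=TableStructure.from_pandas(df),
                name="table1",
            ),
            DataSource(
                structure_family=StructureFamily.array,
                structure=ArrayStructure.from_array(arr),
                name="F",
            ),
        ],
        key="x",
    )

    # Write part by part.
    node.parts["table1"].write(df)
    node.parts["F"].write_block(arr, (0, 0))

    # Read back: table columns and the array are addressed uniformly.
    for key in client["x"]:  # yields "A", "B", "F"
        client["x"][key].read()
```

Because `all_keys` flattens table columns and array names into one namespace, consumers never need to know whether a key is backed by Parquet or zarr; that is the design point the consolidated structure family exists to serve.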