diff --git a/astrapy/idiomatic/cursors.py b/astrapy/idiomatic/cursors.py index 60db1443..f8a04e8d 100644 --- a/astrapy/idiomatic/cursors.py +++ b/astrapy/idiomatic/cursors.py @@ -25,6 +25,7 @@ Iterable, List, Optional, + Tuple, TypeVar, Union, TYPE_CHECKING, @@ -43,24 +44,88 @@ BC = TypeVar("BC", bound="BaseCursor") T = TypeVar("T") +IndexPairType = Tuple[str, Optional[int]] FIND_PREFETCH = 20 +def _maybe_valid_list_index(key_block: str) -> Optional[int]: + # '0', '1' is good. '00', '01', '-30' are not. + try: + kb_index = int(key_block) + if kb_index >= 0 and key_block == str(kb_index): + return kb_index + else: + return None + except ValueError: + return None + + def _create_document_key_extractor( key: str, ) -> Callable[[Dict[str, Any]], Iterable[Any]]: - if "." in key: - raise NotImplementedError - def _item_extractor(document: Dict[str, Any]) -> Any: - # TEMPORARY - if key in document: - yield document[key] + key_blocks0: List[IndexPairType] = [ + (kb_str, _maybe_valid_list_index(kb_str)) for kb_str in key.split(".") + ] + if key_blocks0 == []: + raise ValueError("Field path specification cannot be empty") + if any(kb[0] == "" for kb in key_blocks0): + raise ValueError("Field path components cannot be empty") + + def _extract_with_key_blocks( + key_blocks: List[IndexPairType], value: Any + ) -> Iterable[Any]: + if key_blocks == []: + if isinstance(value, list): + for item in value: + yield item + else: + yield value + return + else: + # go deeper as requested + rest_key_blocks = key_blocks[1:] + key_block = key_blocks[0] + k_str, k_int = key_block + if isinstance(value, dict): + if k_str in value: + new_value = value[k_str] + for item in _extract_with_key_blocks(rest_key_blocks, new_value): + yield item + return + elif isinstance(value, list): + if k_int is not None and len(value) > k_int: + new_value = value[k_int] + for item in _extract_with_key_blocks(rest_key_blocks, new_value): + yield item + else: + for item in value: + for item in _extract_with_key_blocks(key_blocks, item): + yield item + return + else: + # keyblocks are deeper than the document. Nothing to extract. + return + + def _item_extractor(document: Dict[str, Any]) -> Iterable[Any]: + return _extract_with_key_blocks(key_blocks=key_blocks0, value=document) return _item_extractor +def _reduce_distinct_key_to_safe(distinct_key: str) -> str: + """ + In light of the twofold interpretation of "0" as index and dict key + in selection (for distinct), and the auto-unroll of lists, it is not + safe to go beyond the first level. See this example: + document = {'x': [{'y': 'Y', '0': 'ZERO'}]} + key = "x.0" + With full key as projection, we would lose the `"y": "Y"` part (mistakenly). + """ + return distinct_key.split(".")[0] + + class BaseCursor: """ Represents a generic Cursor over query results, regardless of whether @@ -395,8 +460,9 @@ def distinct(self, key: str) -> List[Any]: distinct_items = [] _extractor = _create_document_key_extractor(key) + _key = _reduce_distinct_key_to_safe(key) - d_cursor = self._copy(projection={key: True}, started=False) + d_cursor = self._copy(projection={_key: True}, started=False) for document in d_cursor: for item in _extractor(document): _normalized_item = _normalize_payload_value(path=[], value=item) @@ -552,8 +618,9 @@ async def distinct(self, key: str) -> List[Any]: distinct_items = [] _extractor = _create_document_key_extractor(key) + _key = _reduce_distinct_key_to_safe(key) - d_cursor = self._copy(projection={key: True}, started=False) + d_cursor = self._copy(projection={_key: True}, started=False) async for document in d_cursor: for item in _extractor(document): _normalized_item = _normalize_payload_value(path=[], value=item) diff --git a/tests/idiomatic/integration/test_dml_async.py b/tests/idiomatic/integration/test_dml_async.py index 27bdff55..6da35352 100644 --- a/tests/idiomatic/integration/test_dml_async.py +++ b/tests/idiomatic/integration/test_dml_async.py @@ -461,10 +461,52 @@ async def test_collection_distinct_nonhashable_async( await acol.insert_many(documents * 2) d_items = await acol.distinct("f") - assert len(d_items) == len([doc for doc in documents if "f" in doc]) + expected = [ + 1, + "a", + {"subf": 99}, + {"subf": 99, "another": {"subsubf": [True, False]}}, + 10, + 11, + datetime.datetime(2000, 1, 1, 12, 0), + None, + ] + assert len(d_items) == len(expected) for doc in documents: if "f" in doc: - assert doc["f"] in d_items + if isinstance(doc["f"], list): + for item in doc["f"]: + assert item in d_items + else: + assert doc["f"] in d_items + + @pytest.mark.describe("test of usage of projection in distinct, async") + async def test_collection_projections_distinct_async( + self, + async_empty_collection: AsyncCollection, + ) -> None: + acol = async_empty_collection + await acol.insert_one({"x": [{"y": "Y", "0": "ZERO"}]}) + + assert await acol.distinct("x.y") == ["Y"] + # the one below shows that if index-in-list, then browse-whole-list is off + assert await acol.distinct("x.0") == [{"y": "Y", "0": "ZERO"}] + assert await acol.distinct("x.0.y") == ["Y"] + assert await acol.distinct("x.0.0") == ["ZERO"] + + @pytest.mark.describe("test of unacceptable paths for distinct, async") + async def test_collection_wrong_paths_distinc_async( + self, + async_empty_collection: AsyncCollection, + ) -> None: + with pytest.raises(ValueError): + await async_empty_collection.distinct("root.1..subf") + with pytest.raises(ValueError): + await async_empty_collection.distinct("root..1.subf") + with pytest.raises(ValueError): + await async_empty_collection.distinct("root..subf.subsubf") + with pytest.raises(ValueError): + await async_empty_collection.distinct("root.subf..subsubf") @pytest.mark.describe("test of collection insert_many, async") async def test_collection_insert_many_async( diff --git a/tests/idiomatic/integration/test_dml_sync.py b/tests/idiomatic/integration/test_dml_sync.py index 9cfe279e..f9c9cfe0 100644 --- a/tests/idiomatic/integration/test_dml_sync.py +++ b/tests/idiomatic/integration/test_dml_sync.py @@ -408,10 +408,52 @@ def test_collection_distinct_nonhashable_sync( col.insert_many(documents) d_items = col.distinct("f") - assert len(d_items) == len([doc for doc in documents if "f" in doc]) + expected = [ + 1, + "a", + {"subf": 99}, + {"subf": 99, "another": {"subsubf": [True, False]}}, + 10, + 11, + datetime.datetime(2000, 1, 1, 12, 0), + None, + ] + assert len(d_items) == len(expected) for doc in documents: if "f" in doc: - assert doc["f"] in d_items + if isinstance(doc["f"], list): + for item in doc["f"]: + assert item in d_items + else: + assert doc["f"] in d_items + + @pytest.mark.describe("test of usage of projection in distinct, sync") + def test_collection_projections_distinct_sync( + self, + sync_empty_collection: Collection, + ) -> None: + col = sync_empty_collection + col.insert_one({"x": [{"y": "Y", "0": "ZERO"}]}) + + assert col.distinct("x.y") == ["Y"] + # the one below shows that if index-in-list, then browse-whole-list is off + assert col.distinct("x.0") == [{"y": "Y", "0": "ZERO"}] + assert col.distinct("x.0.y") == ["Y"] + assert col.distinct("x.0.0") == ["ZERO"] + + @pytest.mark.describe("test of unacceptable paths for distinct, sync") + def test_collection_wrong_paths_distinc_sync( + self, + sync_empty_collection: Collection, + ) -> None: + with pytest.raises(ValueError): + sync_empty_collection.distinct("root.1..subf") + with pytest.raises(ValueError): + sync_empty_collection.distinct("root..1.subf") + with pytest.raises(ValueError): + sync_empty_collection.distinct("root..subf.subsubf") + with pytest.raises(ValueError): + sync_empty_collection.distinct("root.subf..subsubf") @pytest.mark.describe("test of collection insert_many, sync") def test_collection_insert_many_sync( diff --git a/tests/idiomatic/unit/test_document_extractors.py b/tests/idiomatic/unit/test_document_extractors.py new file mode 100644 index 00000000..65b6a1d4 --- /dev/null +++ b/tests/idiomatic/unit/test_document_extractors.py @@ -0,0 +1,115 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List + +import pytest + +from astrapy.idiomatic.cursors import _create_document_key_extractor + + +class TestDocumentExtractors: + @pytest.mark.describe("test of plain fieldname document extractor") + def test_plain_fieldname_document_extractor(self) -> None: + f_extractor = _create_document_key_extractor("f") + + assert list(f_extractor({})) == [] + assert list(f_extractor({"e": "E", "f": "F", "g": "G"})) == ["F"] + assert list(f_extractor({"f": {"name": "F"}})) == [{"name": "F"}] + assert list(f_extractor({"f": None})) == [None] + + @pytest.mark.describe("test of dotted fieldname document extractor") + def test_dotted_fieldname_document_extractor(self) -> None: + # various patterns mixing dict and list indexed access + document1 = { + "l": ["L0", "L1", {"name": "L2"}], + "d": {"0": "D0", "1": "D1", "2": {"name": "D2"}}, + "ll": [ + ["LL00", "LL01"], + ["LL10", {"name": "LL11"}], + ], + "ld": [ + {"0": "LD00", "1": "LD01"}, + {"0": "LD10", "1": {"name": "LD11"}}, + ], + "dl": { + "0": ["DL090", "DL01"], + "1": ["DL10", {"name": "DL11"}], + }, + "dd": { + "0": {"0": "DL00", "1": "DL01"}, + "1": {"0": "DL10", "1": {"name": "DL11"}}, + }, + "non_index": {"00": "NI00", "01": "NI01"}, + } + + def assert_extracts( + document: Dict[str, Any], key: str, expected: List[Any] + ) -> None: + _extractor = _create_document_key_extractor(key) + _extracted = list(_extractor(document)) + assert len(_extracted) == len(expected) + for item in expected: + assert item in _extracted + + assert_extracts(document1, "l", ["L0", "L1", {"name": "L2"}]) + assert_extracts(document1, "l.1", ["L1"]) + assert_extracts(document1, "l.2", [{"name": "L2"}]) + assert_extracts(document1, "d", [{"0": "D0", "1": "D1", "2": {"name": "D2"}}]) + assert_extracts(document1, "d.1", ["D1"]) + assert_extracts(document1, "d.2", [{"name": "D2"}]) + assert_extracts(document1, "ll", [["LL00", "LL01"], ["LL10", {"name": "LL11"}]]) + assert_extracts(document1, "ll.1", ["LL10", {"name": "LL11"}]) + assert_extracts(document1, "ll.1.1", [{"name": "LL11"}]) + assert_extracts( + document1, + "ld", + [{"0": "LD00", "1": "LD01"}, {"0": "LD10", "1": {"name": "LD11"}}], + ) + assert_extracts(document1, "ld.1", [{"0": "LD10", "1": {"name": "LD11"}}]) + assert_extracts(document1, "ld.1.1", [{"name": "LD11"}]) + assert_extracts( + document1, "dl", [{"0": ["DL090", "DL01"], "1": ["DL10", {"name": "DL11"}]}] + ) + assert_extracts(document1, "dl.1", ["DL10", {"name": "DL11"}]) + assert_extracts(document1, "dl.1.1", [{"name": "DL11"}]) + assert_extracts( + document1, + "dd", + [ + { + "0": {"0": "DL00", "1": "DL01"}, + "1": {"0": "DL10", "1": {"name": "DL11"}}, + } + ], + ) + assert_extracts(document1, "dd.1", [{"0": "DL10", "1": {"name": "DL11"}}]) + assert_extracts(document1, "dd.1.1", [{"name": "DL11"}]) + + # test about omitting list indices + document2a = {"x": [{"y": 10}, {"y": 11}, {"y": 12}]} + document2b = {"x": [{"y": 100}]} + document2c = {"x": ["X0", "X1"]} + + assert_extracts(document2a, "x.y", [10, 11, 12]) + assert_extracts(document2a, "x.0.y", [10]) + assert_extracts(document2a, "x.1.y", [11]) + + assert_extracts(document2b, "x.y", [100]) + assert_extracts(document2b, "x.0.y", [100]) + assert_extracts(document2b, "x.1.y", []) + + assert_extracts(document2c, "x", ["X0", "X1"]) + assert_extracts(document2c, "x.0", ["X0"]) + assert_extracts(document2c, "x.1", ["X1"])