full support for dotted syntax in distinct

datastax · Mar 10, 2024 · 6c40223 · 6c40223
1 parent 31bb77e
commit 6c40223
Show file tree

Hide file tree

Showing 4 changed files with 278 additions and 12 deletions.
diff --git a/astrapy/idiomatic/cursors.py b/astrapy/idiomatic/cursors.py
@@ -25,6 +25,7 @@
     Iterable,
     List,
     Optional,
+    Tuple,
     TypeVar,
     Union,
     TYPE_CHECKING,
@@ -43,24 +44,88 @@
 
 BC = TypeVar("BC", bound="BaseCursor")
 T = TypeVar("T")
+IndexPairType = Tuple[str, Optional[int]]
 
 FIND_PREFETCH = 20
 
 
+def _maybe_valid_list_index(key_block: str) -> Optional[int]:
+    # '0', '1' is good. '00', '01', '-30' are not.
+    try:
+        kb_index = int(key_block)
+        if kb_index >= 0 and key_block == str(kb_index):
+            return kb_index
+        else:
+            return None
+    except ValueError:
+        return None
+
+
 def _create_document_key_extractor(
     key: str,
 ) -> Callable[[Dict[str, Any]], Iterable[Any]]:
-    if "." in key:
-        raise NotImplementedError
 
-    def _item_extractor(document: Dict[str, Any]) -> Any:
-        # TEMPORARY
-        if key in document:
-            yield document[key]
+    key_blocks0: List[IndexPairType] = [
+        (kb_str, _maybe_valid_list_index(kb_str)) for kb_str in key.split(".")
+    ]
+    if key_blocks0 == []:
+        raise ValueError("Field path specification cannot be empty")
+    if any(kb[0] == "" for kb in key_blocks0):
+        raise ValueError("Field path components cannot be empty")
+
+    def _extract_with_key_blocks(
+        key_blocks: List[IndexPairType], value: Any
+    ) -> Iterable[Any]:
+        if key_blocks == []:
+            if isinstance(value, list):
+                for item in value:
+                    yield item
+            else:
+                yield value
+            return
+        else:
+            # go deeper as requested
+            rest_key_blocks = key_blocks[1:]
+            key_block = key_blocks[0]
+            k_str, k_int = key_block
+            if isinstance(value, dict):
+                if k_str in value:
+                    new_value = value[k_str]
+                    for item in _extract_with_key_blocks(rest_key_blocks, new_value):
+                        yield item
+                return
+            elif isinstance(value, list):
+                if k_int is not None and len(value) > k_int:
+                    new_value = value[k_int]
+                    for item in _extract_with_key_blocks(rest_key_blocks, new_value):
+                        yield item
+                else:
+                    for item in value:
+                        for item in _extract_with_key_blocks(key_blocks, item):
+                            yield item
+                return
+            else:
+                # keyblocks are deeper than the document. Nothing to extract.
+                return
+
+    def _item_extractor(document: Dict[str, Any]) -> Iterable[Any]:
+        return _extract_with_key_blocks(key_blocks=key_blocks0, value=document)
 
     return _item_extractor
 
 
+def _reduce_distinct_key_to_safe(distinct_key: str) -> str:
+    """
+    In light of the twofold interpretation of "0" as index and dict key
+    in selection (for distinct), and the auto-unroll of lists, it is not
+    safe to go beyond the first level. See this example:
+        document = {'x': [{'y': 'Y', '0': 'ZERO'}]}
+        key = "x.0"
+    With full key as projection, we would lose the `"y": "Y"` part (mistakenly).
+    """
+    return distinct_key.split(".")[0]
+
+
 class BaseCursor:
     """
     Represents a generic Cursor over query results, regardless of whether
@@ -395,8 +460,9 @@ def distinct(self, key: str) -> List[Any]:
         distinct_items = []
 
         _extractor = _create_document_key_extractor(key)
+        _key = _reduce_distinct_key_to_safe(key)
 
-        d_cursor = self._copy(projection={key: True}, started=False)
+        d_cursor = self._copy(projection={_key: True}, started=False)
         for document in d_cursor:
             for item in _extractor(document):
                 _normalized_item = _normalize_payload_value(path=[], value=item)
@@ -552,8 +618,9 @@ async def distinct(self, key: str) -> List[Any]:
         distinct_items = []
 
         _extractor = _create_document_key_extractor(key)
+        _key = _reduce_distinct_key_to_safe(key)
 
-        d_cursor = self._copy(projection={key: True}, started=False)
+        d_cursor = self._copy(projection={_key: True}, started=False)
         async for document in d_cursor:
             for item in _extractor(document):
                 _normalized_item = _normalize_payload_value(path=[], value=item)

diff --git a/tests/idiomatic/integration/test_dml_async.py b/tests/idiomatic/integration/test_dml_async.py
@@ -461,10 +461,52 @@ async def test_collection_distinct_nonhashable_async(
         await acol.insert_many(documents * 2)
 
         d_items = await acol.distinct("f")
-        assert len(d_items) == len([doc for doc in documents if "f" in doc])
+        expected = [
+            1,
+            "a",
+            {"subf": 99},
+            {"subf": 99, "another": {"subsubf": [True, False]}},
+            10,
+            11,
+            datetime.datetime(2000, 1, 1, 12, 0),
+            None,
+        ]
+        assert len(d_items) == len(expected)
         for doc in documents:
             if "f" in doc:
-                assert doc["f"] in d_items
+                if isinstance(doc["f"], list):
+                    for item in doc["f"]:
+                        assert item in d_items
+                else:
+                    assert doc["f"] in d_items
+
+    @pytest.mark.describe("test of usage of projection in distinct, async")
+    async def test_collection_projections_distinct_async(
+        self,
+        async_empty_collection: AsyncCollection,
+    ) -> None:
+        acol = async_empty_collection
+        await acol.insert_one({"x": [{"y": "Y", "0": "ZERO"}]})
+
+        assert await acol.distinct("x.y") == ["Y"]
+        # the one below shows that if index-in-list, then browse-whole-list is off
+        assert await acol.distinct("x.0") == [{"y": "Y", "0": "ZERO"}]
+        assert await acol.distinct("x.0.y") == ["Y"]
+        assert await acol.distinct("x.0.0") == ["ZERO"]
+
+    @pytest.mark.describe("test of unacceptable paths for distinct, async")
+    async def test_collection_wrong_paths_distinc_async(
+        self,
+        async_empty_collection: AsyncCollection,
+    ) -> None:
+        with pytest.raises(ValueError):
+            await async_empty_collection.distinct("root.1..subf")
+        with pytest.raises(ValueError):
+            await async_empty_collection.distinct("root..1.subf")
+        with pytest.raises(ValueError):
+            await async_empty_collection.distinct("root..subf.subsubf")
+        with pytest.raises(ValueError):
+            await async_empty_collection.distinct("root.subf..subsubf")
 
     @pytest.mark.describe("test of collection insert_many, async")
     async def test_collection_insert_many_async(

diff --git a/tests/idiomatic/integration/test_dml_sync.py b/tests/idiomatic/integration/test_dml_sync.py
@@ -408,10 +408,52 @@ def test_collection_distinct_nonhashable_sync(
         col.insert_many(documents)
 
         d_items = col.distinct("f")
-        assert len(d_items) == len([doc for doc in documents if "f" in doc])
+        expected = [
+            1,
+            "a",
+            {"subf": 99},
+            {"subf": 99, "another": {"subsubf": [True, False]}},
+            10,
+            11,
+            datetime.datetime(2000, 1, 1, 12, 0),
+            None,
+        ]
+        assert len(d_items) == len(expected)
         for doc in documents:
             if "f" in doc:
-                assert doc["f"] in d_items
+                if isinstance(doc["f"], list):
+                    for item in doc["f"]:
+                        assert item in d_items
+                else:
+                    assert doc["f"] in d_items
+
+    @pytest.mark.describe("test of usage of projection in distinct, sync")
+    def test_collection_projections_distinct_sync(
+        self,
+        sync_empty_collection: Collection,
+    ) -> None:
+        col = sync_empty_collection
+        col.insert_one({"x": [{"y": "Y", "0": "ZERO"}]})
+
+        assert col.distinct("x.y") == ["Y"]
+        # the one below shows that if index-in-list, then browse-whole-list is off
+        assert col.distinct("x.0") == [{"y": "Y", "0": "ZERO"}]
+        assert col.distinct("x.0.y") == ["Y"]
+        assert col.distinct("x.0.0") == ["ZERO"]
+
+    @pytest.mark.describe("test of unacceptable paths for distinct, sync")
+    def test_collection_wrong_paths_distinc_sync(
+        self,
+        sync_empty_collection: Collection,
+    ) -> None:
+        with pytest.raises(ValueError):
+            sync_empty_collection.distinct("root.1..subf")
+        with pytest.raises(ValueError):
+            sync_empty_collection.distinct("root..1.subf")
+        with pytest.raises(ValueError):
+            sync_empty_collection.distinct("root..subf.subsubf")
+        with pytest.raises(ValueError):
+            sync_empty_collection.distinct("root.subf..subsubf")
 
     @pytest.mark.describe("test of collection insert_many, sync")
     def test_collection_insert_many_sync(

diff --git a/tests/idiomatic/unit/test_document_extractors.py b/tests/idiomatic/unit/test_document_extractors.py
@@ -0,0 +1,115 @@
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List
+
+import pytest
+
+from astrapy.idiomatic.cursors import _create_document_key_extractor
+
+
+class TestDocumentExtractors:
+    @pytest.mark.describe("test of plain fieldname document extractor")
+    def test_plain_fieldname_document_extractor(self) -> None:
+        f_extractor = _create_document_key_extractor("f")
+
+        assert list(f_extractor({})) == []
+        assert list(f_extractor({"e": "E", "f": "F", "g": "G"})) == ["F"]
+        assert list(f_extractor({"f": {"name": "F"}})) == [{"name": "F"}]
+        assert list(f_extractor({"f": None})) == [None]
+
+    @pytest.mark.describe("test of dotted fieldname document extractor")
+    def test_dotted_fieldname_document_extractor(self) -> None:
+        # various patterns mixing dict and list indexed access
+        document1 = {
+            "l": ["L0", "L1", {"name": "L2"}],
+            "d": {"0": "D0", "1": "D1", "2": {"name": "D2"}},
+            "ll": [
+                ["LL00", "LL01"],
+                ["LL10", {"name": "LL11"}],
+            ],
+            "ld": [
+                {"0": "LD00", "1": "LD01"},
+                {"0": "LD10", "1": {"name": "LD11"}},
+            ],
+            "dl": {
+                "0": ["DL090", "DL01"],
+                "1": ["DL10", {"name": "DL11"}],
+            },
+            "dd": {
+                "0": {"0": "DL00", "1": "DL01"},
+                "1": {"0": "DL10", "1": {"name": "DL11"}},
+            },
+            "non_index": {"00": "NI00", "01": "NI01"},
+        }
+
+        def assert_extracts(
+            document: Dict[str, Any], key: str, expected: List[Any]
+        ) -> None:
+            _extractor = _create_document_key_extractor(key)
+            _extracted = list(_extractor(document))
+            assert len(_extracted) == len(expected)
+            for item in expected:
+                assert item in _extracted
+
+        assert_extracts(document1, "l", ["L0", "L1", {"name": "L2"}])
+        assert_extracts(document1, "l.1", ["L1"])
+        assert_extracts(document1, "l.2", [{"name": "L2"}])
+        assert_extracts(document1, "d", [{"0": "D0", "1": "D1", "2": {"name": "D2"}}])
+        assert_extracts(document1, "d.1", ["D1"])
+        assert_extracts(document1, "d.2", [{"name": "D2"}])
+        assert_extracts(document1, "ll", [["LL00", "LL01"], ["LL10", {"name": "LL11"}]])
+        assert_extracts(document1, "ll.1", ["LL10", {"name": "LL11"}])
+        assert_extracts(document1, "ll.1.1", [{"name": "LL11"}])
+        assert_extracts(
+            document1,
+            "ld",
+            [{"0": "LD00", "1": "LD01"}, {"0": "LD10", "1": {"name": "LD11"}}],
+        )
+        assert_extracts(document1, "ld.1", [{"0": "LD10", "1": {"name": "LD11"}}])
+        assert_extracts(document1, "ld.1.1", [{"name": "LD11"}])
+        assert_extracts(
+            document1, "dl", [{"0": ["DL090", "DL01"], "1": ["DL10", {"name": "DL11"}]}]
+        )
+        assert_extracts(document1, "dl.1", ["DL10", {"name": "DL11"}])
+        assert_extracts(document1, "dl.1.1", [{"name": "DL11"}])
+        assert_extracts(
+            document1,
+            "dd",
+            [
+                {
+                    "0": {"0": "DL00", "1": "DL01"},
+                    "1": {"0": "DL10", "1": {"name": "DL11"}},
+                }
+            ],
+        )
+        assert_extracts(document1, "dd.1", [{"0": "DL10", "1": {"name": "DL11"}}])
+        assert_extracts(document1, "dd.1.1", [{"name": "DL11"}])
+
+        # test about omitting list indices
+        document2a = {"x": [{"y": 10}, {"y": 11}, {"y": 12}]}
+        document2b = {"x": [{"y": 100}]}
+        document2c = {"x": ["X0", "X1"]}
+
+        assert_extracts(document2a, "x.y", [10, 11, 12])
+        assert_extracts(document2a, "x.0.y", [10])
+        assert_extracts(document2a, "x.1.y", [11])
+
+        assert_extracts(document2b, "x.y", [100])
+        assert_extracts(document2b, "x.0.y", [100])
+        assert_extracts(document2b, "x.1.y", [])
+
+        assert_extracts(document2c, "x", ["X0", "X1"])
+        assert_extracts(document2c, "x.0", ["X0"])
+        assert_extracts(document2c, "x.1", ["X1"])