Skip to content

Commit

Permalink
full support for dotted syntax in distinct
Browse files Browse the repository at this point in the history
  • Loading branch information
hemidactylus committed Mar 10, 2024
1 parent 31bb77e commit 6c40223
Show file tree
Hide file tree
Showing 4 changed files with 278 additions and 12 deletions.
83 changes: 75 additions & 8 deletions astrapy/idiomatic/cursors.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
Iterable,
List,
Optional,
Tuple,
TypeVar,
Union,
TYPE_CHECKING,
Expand All @@ -43,24 +44,88 @@

BC = TypeVar("BC", bound="BaseCursor")
T = TypeVar("T")
IndexPairType = Tuple[str, Optional[int]]

FIND_PREFETCH = 20


def _maybe_valid_list_index(key_block: str) -> Optional[int]:
# '0', '1' is good. '00', '01', '-30' are not.
try:
kb_index = int(key_block)
if kb_index >= 0 and key_block == str(kb_index):
return kb_index
else:
return None
except ValueError:
return None


def _create_document_key_extractor(
key: str,
) -> Callable[[Dict[str, Any]], Iterable[Any]]:
if "." in key:
raise NotImplementedError

def _item_extractor(document: Dict[str, Any]) -> Any:
# TEMPORARY
if key in document:
yield document[key]
key_blocks0: List[IndexPairType] = [
(kb_str, _maybe_valid_list_index(kb_str)) for kb_str in key.split(".")
]
if key_blocks0 == []:
raise ValueError("Field path specification cannot be empty")
if any(kb[0] == "" for kb in key_blocks0):
raise ValueError("Field path components cannot be empty")

def _extract_with_key_blocks(
key_blocks: List[IndexPairType], value: Any
) -> Iterable[Any]:
if key_blocks == []:
if isinstance(value, list):
for item in value:
yield item
else:
yield value
return
else:
# go deeper as requested
rest_key_blocks = key_blocks[1:]
key_block = key_blocks[0]
k_str, k_int = key_block
if isinstance(value, dict):
if k_str in value:
new_value = value[k_str]
for item in _extract_with_key_blocks(rest_key_blocks, new_value):
yield item
return
elif isinstance(value, list):
if k_int is not None and len(value) > k_int:
new_value = value[k_int]
for item in _extract_with_key_blocks(rest_key_blocks, new_value):
yield item
else:
for item in value:
for item in _extract_with_key_blocks(key_blocks, item):
yield item
return
else:
# keyblocks are deeper than the document. Nothing to extract.
return

def _item_extractor(document: Dict[str, Any]) -> Iterable[Any]:
return _extract_with_key_blocks(key_blocks=key_blocks0, value=document)

return _item_extractor


def _reduce_distinct_key_to_safe(distinct_key: str) -> str:
"""
In light of the twofold interpretation of "0" as index and dict key
in selection (for distinct), and the auto-unroll of lists, it is not
safe to go beyond the first level. See this example:
document = {'x': [{'y': 'Y', '0': 'ZERO'}]}
key = "x.0"
With full key as projection, we would lose the `"y": "Y"` part (mistakenly).
"""
return distinct_key.split(".")[0]


class BaseCursor:
"""
Represents a generic Cursor over query results, regardless of whether
Expand Down Expand Up @@ -395,8 +460,9 @@ def distinct(self, key: str) -> List[Any]:
distinct_items = []

_extractor = _create_document_key_extractor(key)
_key = _reduce_distinct_key_to_safe(key)

d_cursor = self._copy(projection={key: True}, started=False)
d_cursor = self._copy(projection={_key: True}, started=False)
for document in d_cursor:
for item in _extractor(document):
_normalized_item = _normalize_payload_value(path=[], value=item)
Expand Down Expand Up @@ -552,8 +618,9 @@ async def distinct(self, key: str) -> List[Any]:
distinct_items = []

_extractor = _create_document_key_extractor(key)
_key = _reduce_distinct_key_to_safe(key)

d_cursor = self._copy(projection={key: True}, started=False)
d_cursor = self._copy(projection={_key: True}, started=False)
async for document in d_cursor:
for item in _extractor(document):
_normalized_item = _normalize_payload_value(path=[], value=item)
Expand Down
46 changes: 44 additions & 2 deletions tests/idiomatic/integration/test_dml_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,10 +461,52 @@ async def test_collection_distinct_nonhashable_async(
await acol.insert_many(documents * 2)

d_items = await acol.distinct("f")
assert len(d_items) == len([doc for doc in documents if "f" in doc])
expected = [
1,
"a",
{"subf": 99},
{"subf": 99, "another": {"subsubf": [True, False]}},
10,
11,
datetime.datetime(2000, 1, 1, 12, 0),
None,
]
assert len(d_items) == len(expected)
for doc in documents:
if "f" in doc:
assert doc["f"] in d_items
if isinstance(doc["f"], list):
for item in doc["f"]:
assert item in d_items
else:
assert doc["f"] in d_items

@pytest.mark.describe("test of usage of projection in distinct, async")
async def test_collection_projections_distinct_async(
self,
async_empty_collection: AsyncCollection,
) -> None:
acol = async_empty_collection
await acol.insert_one({"x": [{"y": "Y", "0": "ZERO"}]})

assert await acol.distinct("x.y") == ["Y"]
# the one below shows that if index-in-list, then browse-whole-list is off
assert await acol.distinct("x.0") == [{"y": "Y", "0": "ZERO"}]
assert await acol.distinct("x.0.y") == ["Y"]
assert await acol.distinct("x.0.0") == ["ZERO"]

@pytest.mark.describe("test of unacceptable paths for distinct, async")
async def test_collection_wrong_paths_distinc_async(
self,
async_empty_collection: AsyncCollection,
) -> None:
with pytest.raises(ValueError):
await async_empty_collection.distinct("root.1..subf")
with pytest.raises(ValueError):
await async_empty_collection.distinct("root..1.subf")
with pytest.raises(ValueError):
await async_empty_collection.distinct("root..subf.subsubf")
with pytest.raises(ValueError):
await async_empty_collection.distinct("root.subf..subsubf")

@pytest.mark.describe("test of collection insert_many, async")
async def test_collection_insert_many_async(
Expand Down
46 changes: 44 additions & 2 deletions tests/idiomatic/integration/test_dml_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,10 +408,52 @@ def test_collection_distinct_nonhashable_sync(
col.insert_many(documents)

d_items = col.distinct("f")
assert len(d_items) == len([doc for doc in documents if "f" in doc])
expected = [
1,
"a",
{"subf": 99},
{"subf": 99, "another": {"subsubf": [True, False]}},
10,
11,
datetime.datetime(2000, 1, 1, 12, 0),
None,
]
assert len(d_items) == len(expected)
for doc in documents:
if "f" in doc:
assert doc["f"] in d_items
if isinstance(doc["f"], list):
for item in doc["f"]:
assert item in d_items
else:
assert doc["f"] in d_items

@pytest.mark.describe("test of usage of projection in distinct, sync")
def test_collection_projections_distinct_sync(
self,
sync_empty_collection: Collection,
) -> None:
col = sync_empty_collection
col.insert_one({"x": [{"y": "Y", "0": "ZERO"}]})

assert col.distinct("x.y") == ["Y"]
# the one below shows that if index-in-list, then browse-whole-list is off
assert col.distinct("x.0") == [{"y": "Y", "0": "ZERO"}]
assert col.distinct("x.0.y") == ["Y"]
assert col.distinct("x.0.0") == ["ZERO"]

@pytest.mark.describe("test of unacceptable paths for distinct, sync")
def test_collection_wrong_paths_distinc_sync(
self,
sync_empty_collection: Collection,
) -> None:
with pytest.raises(ValueError):
sync_empty_collection.distinct("root.1..subf")
with pytest.raises(ValueError):
sync_empty_collection.distinct("root..1.subf")
with pytest.raises(ValueError):
sync_empty_collection.distinct("root..subf.subsubf")
with pytest.raises(ValueError):
sync_empty_collection.distinct("root.subf..subsubf")

@pytest.mark.describe("test of collection insert_many, sync")
def test_collection_insert_many_sync(
Expand Down
115 changes: 115 additions & 0 deletions tests/idiomatic/unit/test_document_extractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright DataStax, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, List

import pytest

from astrapy.idiomatic.cursors import _create_document_key_extractor


class TestDocumentExtractors:
@pytest.mark.describe("test of plain fieldname document extractor")
def test_plain_fieldname_document_extractor(self) -> None:
f_extractor = _create_document_key_extractor("f")

assert list(f_extractor({})) == []
assert list(f_extractor({"e": "E", "f": "F", "g": "G"})) == ["F"]
assert list(f_extractor({"f": {"name": "F"}})) == [{"name": "F"}]
assert list(f_extractor({"f": None})) == [None]

@pytest.mark.describe("test of dotted fieldname document extractor")
def test_dotted_fieldname_document_extractor(self) -> None:
# various patterns mixing dict and list indexed access
document1 = {
"l": ["L0", "L1", {"name": "L2"}],
"d": {"0": "D0", "1": "D1", "2": {"name": "D2"}},
"ll": [
["LL00", "LL01"],
["LL10", {"name": "LL11"}],
],
"ld": [
{"0": "LD00", "1": "LD01"},
{"0": "LD10", "1": {"name": "LD11"}},
],
"dl": {
"0": ["DL090", "DL01"],
"1": ["DL10", {"name": "DL11"}],
},
"dd": {
"0": {"0": "DL00", "1": "DL01"},
"1": {"0": "DL10", "1": {"name": "DL11"}},
},
"non_index": {"00": "NI00", "01": "NI01"},
}

def assert_extracts(
document: Dict[str, Any], key: str, expected: List[Any]
) -> None:
_extractor = _create_document_key_extractor(key)
_extracted = list(_extractor(document))
assert len(_extracted) == len(expected)
for item in expected:
assert item in _extracted

assert_extracts(document1, "l", ["L0", "L1", {"name": "L2"}])
assert_extracts(document1, "l.1", ["L1"])
assert_extracts(document1, "l.2", [{"name": "L2"}])
assert_extracts(document1, "d", [{"0": "D0", "1": "D1", "2": {"name": "D2"}}])
assert_extracts(document1, "d.1", ["D1"])
assert_extracts(document1, "d.2", [{"name": "D2"}])
assert_extracts(document1, "ll", [["LL00", "LL01"], ["LL10", {"name": "LL11"}]])
assert_extracts(document1, "ll.1", ["LL10", {"name": "LL11"}])
assert_extracts(document1, "ll.1.1", [{"name": "LL11"}])
assert_extracts(
document1,
"ld",
[{"0": "LD00", "1": "LD01"}, {"0": "LD10", "1": {"name": "LD11"}}],
)
assert_extracts(document1, "ld.1", [{"0": "LD10", "1": {"name": "LD11"}}])
assert_extracts(document1, "ld.1.1", [{"name": "LD11"}])
assert_extracts(
document1, "dl", [{"0": ["DL090", "DL01"], "1": ["DL10", {"name": "DL11"}]}]
)
assert_extracts(document1, "dl.1", ["DL10", {"name": "DL11"}])
assert_extracts(document1, "dl.1.1", [{"name": "DL11"}])
assert_extracts(
document1,
"dd",
[
{
"0": {"0": "DL00", "1": "DL01"},
"1": {"0": "DL10", "1": {"name": "DL11"}},
}
],
)
assert_extracts(document1, "dd.1", [{"0": "DL10", "1": {"name": "DL11"}}])
assert_extracts(document1, "dd.1.1", [{"name": "DL11"}])

# test about omitting list indices
document2a = {"x": [{"y": 10}, {"y": 11}, {"y": 12}]}
document2b = {"x": [{"y": 100}]}
document2c = {"x": ["X0", "X1"]}

assert_extracts(document2a, "x.y", [10, 11, 12])
assert_extracts(document2a, "x.0.y", [10])
assert_extracts(document2a, "x.1.y", [11])

assert_extracts(document2b, "x.y", [100])
assert_extracts(document2b, "x.0.y", [100])
assert_extracts(document2b, "x.1.y", [])

assert_extracts(document2c, "x", ["X0", "X1"])
assert_extracts(document2c, "x.0", ["X0"])
assert_extracts(document2c, "x.1", ["X1"])

0 comments on commit 6c40223

Please sign in to comment.