From 4423f38146d84555d8fa6464e21fdadf37dd85bd Mon Sep 17 00:00:00 2001 From: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com> Date: Wed, 19 Mar 2025 19:07:32 +0000 Subject: [PATCH 1/3] add data index list items. add get document urls Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com> --- deepsearch/cps/cli/cli_options.py | 4 + deepsearch/cps/cli/data_indices_typer.py | 40 +++++ .../cps/client/components/data_indices.py | 142 ++++++++++++++++-- deepsearch/cps/client/components/documents.py | 4 +- deepsearch/cps/client/components/elastic.py | 11 +- docs/guide/data_indices.md | 31 ++++ 6 files changed, 217 insertions(+), 15 deletions(-) diff --git a/deepsearch/cps/cli/cli_options.py b/deepsearch/cps/cli/cli_options.py index ee78e2c8..a3a84ed6 100644 --- a/deepsearch/cps/cli/cli_options.py +++ b/deepsearch/cps/cli/cli_options.py @@ -98,3 +98,7 @@ "-t", help="""Provide path to file containing task ids generated during document conversion.""", ) + +QUERY_STRING = typer.Option("*", "-q", "--query-string", help="Query string") + +MAX_ITEMS = typer.Option(10, "-mi", "--max-items", help="Max items to list") diff --git a/deepsearch/cps/cli/data_indices_typer.py b/deepsearch/cps/cli/data_indices_typer.py index 9b8baaa9..9c7ca55d 100644 --- a/deepsearch/cps/cli/data_indices_typer.py +++ b/deepsearch/cps/cli/data_indices_typer.py @@ -15,7 +15,9 @@ COORDINATES_PATH, INDEX_ITEM_ID, INDEX_KEY, + MAX_ITEMS, PROJ_KEY, + QUERY_STRING, SOURCE_PATH, TARGET_SETTINGS, URL, @@ -237,5 +239,43 @@ def add_attachment( raise typer.Abort() +@app.command(name="list", help="List/search items in an index", no_args_is_help=True) +@cli_handler() +def list_items( + proj_key: str = PROJ_KEY, + index_key: str = INDEX_KEY, + query_string: str = QUERY_STRING, + max_items: int = MAX_ITEMS, +): + """ + List/search items in an index" + """ + api = CpsApi.from_env() + + # get indices of the project + indices = api.data_indices.list(proj_key) + + # get specific index to add attachment + index = next((x for x in indices if x.source.index_key == index_key), None) + + if index is not None: + try: + items = index.list_items( + api=api, + query_string=query_string, + max_items=max_items, + ) + for item in items: + typer.echo(item) + except ValueError as e: + typer.echo(f"Error occurred: {e}") + typer.echo(ERROR_MSG) + raise typer.Abort() + return + else: + typer.echo("Index key not found") + raise typer.Abort() + + if __name__ == "__main__": app() diff --git a/deepsearch/cps/client/components/data_indices.py b/deepsearch/cps/client/components/data_indices.py index 2a3635b0..2fdde7e2 100644 --- a/deepsearch/cps/client/components/data_indices.py +++ b/deepsearch/cps/client/components/data_indices.py @@ -1,10 +1,9 @@ from __future__ import annotations -import ast import os from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union from urllib.parse import urlparse import requests @@ -17,6 +16,9 @@ from deepsearch.cps.apis.public.models.task import Task from deepsearch.cps.apis.public.models.token_response import TokenResponse from deepsearch.cps.client.components.api_object import ApiConnectedObject +from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource +from deepsearch.cps.client.queries.query import Query +from deepsearch.cps.queries import DataQuery if TYPE_CHECKING: from deepsearch.cps.client import CpsApi @@ -179,14 +181,6 @@ def upload( return task -class ElasticProjectDataCollectionSource(BaseModel): - proj_key: str - index_key: str - - def to_resource(self) -> Dict[str, Any]: - return {"type": "elastic", "proj_key": self.proj_key, "index": self.index_key} - - class DataIndex(BaseModel): source: ElasticProjectDataCollectionSource @@ -259,6 +253,128 @@ def add_item_attachment( params=params, ) + def list_items( + self, + api: CpsApi, + query_string: str = "*", + page_size: int = 10, + max_items: int = 100, + ) -> Generator[dict]: + """ + Method to list/search documents in an index. + + Input + ----- + api : CpsApi + CpsApi Class + query_string: str + string to search documents, defaults to all ("*") + page_size : int + page size in query pagination, defaults to 10 + max_items : int + maximum items to list, defaults to 100 + """ + + query_tasks = Query() + + if max_items < page_size: + page_size = max_items + + lookup = query_tasks.add( + "ElasticQuery", + task_id="elastic-search", + parameters={ + "source": ["_name", "_id"], + "sort": [ + {"description.publication_date": {"order": "desc"}}, + {"description.logs.date": {"order": "desc"}}, + ], + "limit": page_size, + }, + coordinates=ElasticProjectDataCollectionSource( + proj_key=self.source.proj_key, index_key=self.source.index_key + ), + ) + lookup.output("items").output_as("result") + + query = DataQuery( + search_query=query_string, + limit=page_size, + coordinates=ElasticProjectDataCollectionSource( + proj_key=self.source.proj_key, index_key=self.source.index_key + ), + ) + + # Run task. + cursor = api.queries.run_paginated_query(query) + pages_loaded = 0 + for result in cursor: + for row in result.outputs["data_outputs"]: + yield { + "name": row["_source"]["_name"], + "id": row["_source"]["file-info"]["document-hash"], + } + + pages_loaded += 1 + + if pages_loaded * page_size >= max_items: + break + + def get_item_urls( + self, + api: CpsApi, + index_item_id: str, + ) -> DataIndexItemUrls: + """ + Method to get document urls. + + Input + ----- + api : CpsApi + CpsApi Class + index_item_id : string + id of document in index + """ + + query_tasks = Query() + + lookup = query_tasks.add( + "ElasticQuery", + task_id="elastic-search", + parameters={ + "elastic_query": { + "bool": {"filter": {"terms": {"_id": [index_item_id]}}} + }, + "limit": 1, + }, + coordinates=ElasticProjectDataCollectionSource( + proj_key=self.source.proj_key, index_key=self.source.index_key + ), + ) + lookup.output("items").output_as("result") + + # Run task. + response = api.queries.run(query_tasks) + + s3_data: dict = ( + response.outputs.get("result", [{}])[0] + .get("_source", {}) + .get("_s3_data", {}) + ) + + def get_url(document: str) -> str: + doc_info: Union[dict, list] = s3_data.get(document, {}) + if isinstance(doc_info, list): + return doc_info[0].get("url", "") + else: + return doc_info.get("url", "") + + return DataIndexItemUrls( + pdf_url=get_url("pdf-document"), + md_url=get_url("markdown-document"), + json_url=get_url("json-document"), + ) + @dataclass class CpsApiDataIndex(ApiConnectedObject): @@ -275,3 +391,9 @@ class S3Coordinates(BaseModel): bucket: str location: str key_prefix: str = "" + + +class DataIndexItemUrls(BaseModel): + pdf_url: str + md_url: str + json_url: str diff --git a/deepsearch/cps/client/components/documents.py b/deepsearch/cps/client/components/documents.py index 76fa327b..16c2b164 100644 --- a/deepsearch/cps/client/components/documents.py +++ b/deepsearch/cps/client/components/documents.py @@ -25,10 +25,10 @@ SemanticIngestSourcePublicDataDocument, ) from deepsearch.cps.apis.public_v2.models.source4 import Source4 -from deepsearch.cps.client.components.data_indices import ( +from deepsearch.cps.client.components.elastic import ( + ElasticDataCollectionSource, ElasticProjectDataCollectionSource, ) -from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource from deepsearch.cps.client.components.projects import Project if TYPE_CHECKING: diff --git a/deepsearch/cps/client/components/elastic.py b/deepsearch/cps/client/components/elastic.py index 964ffccf..631e25cf 100644 --- a/deepsearch/cps/client/components/elastic.py +++ b/deepsearch/cps/client/components/elastic.py @@ -6,9 +6,6 @@ from pydantic import BaseModel from deepsearch.cps.apis import public as sw_client -from deepsearch.cps.client.components.data_indices import ( - ElasticProjectDataCollectionSource, -) if TYPE_CHECKING: from deepsearch.cps.client import CpsApi @@ -58,6 +55,14 @@ class ElasticDataCollectionMetadata(BaseModel): version: str +class ElasticProjectDataCollectionSource(BaseModel): + proj_key: str + index_key: str + + def to_resource(self) -> Dict[str, Any]: + return {"type": "elastic", "proj_key": self.proj_key, "index": self.index_key} + + class ElasticDataCollection(BaseModel): source: Union[ElasticDataCollectionSource, ElasticProjectDataCollectionSource] diff --git a/docs/guide/data_indices.md b/docs/guide/data_indices.md index 97201baa..d840f7c1 100644 --- a/docs/guide/data_indices.md +++ b/docs/guide/data_indices.md @@ -173,3 +173,34 @@ Attachments can be added to an index item in a project. Briefly, attachments hav attachment_key=attachment_key, # optional ) ``` + +--- + +## List documents in an index + +Listing documents in an index can be done by calling method 'list_items' in 'DataIndex' class. It also accepts a query string to list specific document(s). + +=== "CLI" +
+ + ```console + $ deepsearch cps data-indices list -p PROJ_KEY -x INDEX_KEY -q QUERY_STRING + ``` + +
+=== "Python" + ```python + from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource + + # get indices of the project + indices = api.data_indices.list(PROJ_KEY) + + # get specific index to list document + index = next((x for x in indices if x.source.index_key == index_key), None) + + items = dataindex.list_items(api) + for item in items: + print(item) + ``` + +--- \ No newline at end of file From 2abd45144628728820cf0e35722a595589fedaf2 Mon Sep 17 00:00:00 2001 From: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com> Date: Wed, 19 Mar 2025 19:17:33 +0000 Subject: [PATCH 2/3] code clean up Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com> --- .../cps/client/components/data_indices.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/deepsearch/cps/client/components/data_indices.py b/deepsearch/cps/client/components/data_indices.py index 2fdde7e2..2442626a 100644 --- a/deepsearch/cps/client/components/data_indices.py +++ b/deepsearch/cps/client/components/data_indices.py @@ -275,28 +275,9 @@ def list_items( maximum items to list, defaults to 100 """ - query_tasks = Query() - if max_items < page_size: page_size = max_items - lookup = query_tasks.add( - "ElasticQuery", - task_id="elastic-search", - parameters={ - "source": ["_name", "_id"], - "sort": [ - {"description.publication_date": {"order": "desc"}}, - {"description.logs.date": {"order": "desc"}}, - ], - "limit": page_size, - }, - coordinates=ElasticProjectDataCollectionSource( - proj_key=self.source.proj_key, index_key=self.source.index_key - ), - ) - lookup.output("items").output_as("result") - query = DataQuery( search_query=query_string, limit=page_size, From fdafc6a3507d53d8017ca363f9f11482fdb42c22 Mon Sep 17 00:00:00 2001 From: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com> Date: Mon, 31 Mar 2025 16:16:38 +0100 Subject: [PATCH 3/3] improve list items guide Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com> --- docs/guide/data_indices.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/guide/data_indices.md b/docs/guide/data_indices.md index d840f7c1..18ac03b3 100644 --- a/docs/guide/data_indices.md +++ b/docs/guide/data_indices.md @@ -198,9 +198,11 @@ Listing documents in an index can be done by calling method 'list_items' in 'Dat # get specific index to list document index = next((x for x in indices if x.source.index_key == index_key), None) - items = dataindex.list_items(api) - for item in items: - print(item) + # if the index exists, list items + if index is not None: + items = index.list_items(api) + for item in items: + print(item) ``` --- \ No newline at end of file