[NEM-363] Add built-context indexing flow #114
Changes from all commits
f3fe77c
3dc440f
ad0ef1f
2c4cdf7
7eb309f
2ea2c94
7742baf
135ab2e
b5e1327
474a249
92788e7
cf2f145
f887d70
0a71424
af99c58
615bfd1
**File 1** (file name not captured in this view)

```diff
@@ -3,13 +3,16 @@
 from typing import Any, overload
 
 from databao_context_engine.build_sources import BuildContextResult, build_all_datasources
+from databao_context_engine.build_sources.build_runner import IndexSummary
+from databao_context_engine.build_sources.build_wiring import index_built_contexts
 from databao_context_engine.databao_context_engine import DatabaoContextEngine
 from databao_context_engine.datasources.check_config import (
     CheckDatasourceConnectionResult,
 )
 from databao_context_engine.datasources.check_config import (
     check_datasource_connection as check_datasource_connection_internal,
 )
+from databao_context_engine.datasources.datasource_context import DatasourceContext
 from databao_context_engine.datasources.datasource_discovery import get_datasource_list
 from databao_context_engine.datasources.types import Datasource, DatasourceId
 from databao_context_engine.pluginlib.build_plugin import DatasourceType
@@ -89,6 +92,34 @@ def build_context(
         # TODO: Filter which datasources to build by datasource_ids
         return build_all_datasources(project_layout=self._project_layout, chunk_embedding_mode=chunk_embedding_mode)
 
+    def index_built_contexts(
+        self,
+        datasource_ids: list[DatasourceId] | None = None,
+        chunk_embedding_mode: ChunkEmbeddingMode = ChunkEmbeddingMode.EMBEDDABLE_TEXT_ONLY,
+    ) -> IndexSummary:
+        """Index built datasource contexts into the embeddings database.
+
+        It reads already built context files from the output directory, chunks them using the appropriate plugin,
+        embeds the chunks, and persists both the chunks and the embeddings.
+
+        Args:
+            datasource_ids: The list of datasource ids to index. If None, all datasources will be indexed.
+            chunk_embedding_mode: The mode to use for chunk embedding.
+
+        Returns:
+            The summary of the index operation.
+        """
+        engine: DatabaoContextEngine = self.get_engine_for_project()
+        contexts: list[DatasourceContext] = engine.get_all_contexts()
```
**Collaborator:** Improvement for another PR: we should probably have an API in the engine to get only the datasources from a given list. Right now, we only have: *(snippet not captured)*. We should add: *(snippet not captured)*.

**Author:** Indeed, we could add that. I considered implementing it for this PR, but I'm not sure it actually saves much IO. It might, though, especially if some contexts are very big.
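A rough sketch of the shape such an engine-side API could take. The helper name and the fact that it filters after loading are assumptions made for illustration; only `get_all_contexts()` and the `datasource_path` attribute appear in this PR:

```python
from databao_context_engine.databao_context_engine import DatabaoContextEngine
from databao_context_engine.datasources.datasource_context import DatasourceContext
from databao_context_engine.datasources.types import DatasourceId


def get_contexts_for_datasources(
    engine: DatabaoContextEngine, datasource_ids: list[DatasourceId]
) -> list[DatasourceContext]:
    """Hypothetical helper: return only the contexts for the requested datasources.

    A real engine-side implementation could avoid reading the unwanted context
    files in the first place, which is where the IO saving discussed above
    would come from.
    """
    wanted_paths = {d.datasource_path for d in datasource_ids}
    return [c for c in engine.get_all_contexts() if c.datasource_id.datasource_path in wanted_paths]
```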
```diff
+
+        if datasource_ids is not None:
+            wanted_paths = {d.datasource_path for d in datasource_ids}
+            contexts = [c for c in contexts if c.datasource_id.datasource_path in wanted_paths]
```
**Collaborator:** Wouldn't you be able to simply check … *(rest of the comment not captured)*
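The elided part of the suggestion is not recoverable from this capture. One plausible reading, purely an assumption not confirmed by the thread, is a direct membership test on the ids themselves instead of the `wanted_paths` set:

```python
from databao_context_engine.datasources.datasource_context import DatasourceContext
from databao_context_engine.datasources.types import DatasourceId


def filter_contexts(
    contexts: list[DatasourceContext], datasource_ids: list[DatasourceId]
) -> list[DatasourceContext]:
    # Hypothetical simplification; assumes DatasourceId compares by value
    # (e.g. a frozen dataclass), so set membership works on the ids directly.
    wanted = set(datasource_ids)
    return [c for c in contexts if c.datasource_id in wanted]
```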
```diff
+
+        return index_built_contexts(
+            project_layout=self._project_layout, contexts=contexts, chunk_embedding_mode=chunk_embedding_mode
+        )
+
     def check_datasource_connection(
         self, datasource_ids: list[DatasourceId] | None = None
     ) -> list[CheckDatasourceConnectionResult]:
```
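For orientation, a minimal usage sketch of the new entry point. How the project-level API object is obtained, and its class name, are not shown in this capture; only `index_built_contexts` and its parameters come from the diff above:

```python
from databao_context_engine.build_sources.build_runner import IndexSummary
from databao_context_engine.datasources.types import DatasourceId


def reindex(project, datasource_ids: list[DatasourceId] | None = None) -> IndexSummary:
    """Index already-built contexts; pass ids to restrict which datasources are indexed.

    `project` is the project-level API object from this PR (class name not captured).
    Uses the default chunk embedding mode (EMBEDDABLE_TEXT_ONLY) from the diff above.
    """
    return project.index_built_contexts(datasource_ids=datasource_ids)
```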
**File 2** (file name not captured; it defines `DatasourceContext` and the context-discovery helpers)

```diff
@@ -2,6 +2,7 @@
 import os
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Iterable
 
 import yaml
 
@@ -25,15 +26,25 @@ class DatasourceContext:
     context: str
 
 
-def _read_datasource_type_from_context_file(context_path: Path) -> DatasourceType:
+def read_datasource_type_from_context_file(context_path: Path) -> DatasourceType:
     with context_path.open("r") as context_file:
-        type_key = "datasource_type"
-        for line in context_file:
-            if line.startswith(f"{type_key}: "):
-                datasource_type = yaml.safe_load(line)[type_key]
-                return DatasourceType(full_type=datasource_type)
-
-    raise ValueError(f"Could not find type in context file {context_path}")
+        return _read_datasource_type_from_lines(context_file, source_label=str(context_path))
+
+
+def read_datasource_type_from_context(context: DatasourceContext) -> DatasourceType:
+    return _read_datasource_type_from_lines(
+        context.context.splitlines(True),
+        source_label=str(context.datasource_id),
+    )
+
+
+def _read_datasource_type_from_lines(lines: Iterable[str], *, source_label: str) -> DatasourceType:
+    type_key = "datasource_type"
+    for line in lines:
+        if line.startswith(f"{type_key}: "):
+            datasource_type = yaml.safe_load(line)[type_key]
+            return DatasourceType(full_type=datasource_type)
+    raise ValueError(f"Could not find type in context {source_label}")
 
 
 def get_introspected_datasource_list(project_layout: ProjectLayout) -> list[Datasource]:
@@ -48,7 +59,7 @@ def get_introspected_datasource_list(project_layout: ProjectLayout) -> list[Datasource]:
             result.append(
                 Datasource(
                     id=DatasourceId.from_datasource_context_file_path(relative_context_file),
-                    type=_read_datasource_type_from_context_file(context_file),
+                    type=read_datasource_type_from_context_file(context_file),
                 )
             )
         except ValueError as e:
@@ -73,7 +84,10 @@ def get_all_contexts(project_layout: ProjectLayout) -> list[DatasourceContext]:
     result = []
    for dirpath, dirnames, filenames in os.walk(project_layout.output_dir):
         for context_file_name in filenames:
-            if Path(context_file_name).suffix not in DatasourceId.ALLOWED_YAML_SUFFIXES:
+            if (
+                Path(context_file_name).suffix not in DatasourceId.ALLOWED_YAML_SUFFIXES
+                or context_file_name == "all_results.yaml"
+            ):
                 continue
             context_file = Path(dirpath).joinpath(context_file_name)
             relative_context_file = context_file.relative_to(project_layout.output_dir)
```

**Collaborator** (on the `all_results.yaml` exclusion): 👍 well spotted
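A brief usage sketch of the two public readers introduced here. The import module path and the example file path are assumptions (the file name is not shown in this capture); the function names, `get_all_contexts`, and the `datasource_type:` line format come from the diff above:

```python
from pathlib import Path

# Assumed module path for the file diffed above.
from databao_context_engine.datasources.datasource_context import (
    get_all_contexts,
    read_datasource_type_from_context,
    read_datasource_type_from_context_file,
)
from databao_context_engine.pluginlib.build_plugin import DatasourceType


def types_of_built_contexts(project_layout) -> dict[str, DatasourceType]:
    """Map each built datasource id to its type using the in-memory reader.

    `project_layout` is a ProjectLayout for the project (import path not captured).
    """
    return {
        str(ctx.datasource_id): read_datasource_type_from_context(ctx)
        for ctx in get_all_contexts(project_layout)
    }


# The file-based variant still works on a single built context file on disk:
# read_datasource_type_from_context_file(Path("output/my_datasource.yaml"))
```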
**Collaborator:** Instead of `int` for each of these properties, should we return a list of `DatasourceId`? We can still keep `indexed` as a calculated property for quick access if we want. At the very least, I think we should be able to know which datasources failed.

**Author:** That is already being logged in the exception catch.

**Collaborator:** But logs are information that may or may not be shown depending on who is calling: I'm guessing we will still show info logs in the CLI console (but since the CLI is going to live in a separate repo, we shouldn't make any strong assumptions about this). But what about the agents? I don't think they would output the logs anywhere. And since the information isn't returned in Python, it isn't usable by any caller of the method. The most obvious use in Python would be to retry indexing only the datasources that failed.
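A sketch of the shape the reviewer appears to be proposing. The actual `IndexSummary` fields are not visible in this capture, and the field names below are assumptions; only `indexed` as a calculated property and the per-datasource failure list come from the discussion:

```python
from dataclasses import dataclass, field

from databao_context_engine.datasources.types import DatasourceId


@dataclass(frozen=True)
class IndexSummary:
    """Hypothetical shape: report ids instead of (or alongside) bare counters."""

    indexed_datasources: list[DatasourceId] = field(default_factory=list)
    failed_datasources: list[DatasourceId] = field(default_factory=list)

    @property
    def indexed(self) -> int:
        # Kept as a calculated property for quick access, as suggested above.
        return len(self.indexed_datasources)

    @property
    def failed(self) -> int:
        return len(self.failed_datasources)


# This would make the retry case direct for callers, e.g.:
#     summary = project.index_built_contexts()
#     if summary.failed_datasources:
#         project.index_built_contexts(datasource_ids=summary.failed_datasources)
```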