Skip to content

Commit e29bec6

Browse files
authored
Allow file index to be private (#45)
* Fix breaking reranker
* Allow private file index
* Avoid setting default to 1 when user management is enabled
1 parent 456f020 commit e29bec6

File tree

5 files changed

+145
-36
lines changed

5 files changed

+145
-36
lines changed

libs/ktem/ktem/index/base.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,11 +109,16 @@ def get_admin_settings(cls) -> dict:
109109
return {}
110110

111111
@abc.abstractmethod
112-
def get_indexing_pipeline(self, settings: dict) -> "BaseComponent":
112+
def get_indexing_pipeline(
113+
self, settings: dict, user_id: Optional[int]
114+
) -> "BaseComponent":
113115
"""Return the indexing pipeline that populates the entities into the index
114116
115117
Args:
116118
settings: the user settings of the index
119+
user_id: the id of the user who is accessing the index
120+
TODO: instead of taking a user_id, this should take an app_state
121+
which might also contain the settings.
117122
118123
Returns:
119124
BaseIndexing: the indexing pipeline

libs/ktem/ktem/index/file/index.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,26 @@ class FileIndex(BaseIndex):
3131

3232
def __init__(self, app, id: int, name: str, config: dict):
3333
super().__init__(app, id, name, config)
34+
35+
self._indexing_pipeline_cls: Type[BaseFileIndexIndexing]
36+
self._retriever_pipeline_cls: list[Type[BaseFileIndexRetriever]]
37+
self._selector_ui_cls: Type
38+
self._selector_ui: Any = None
39+
self._index_ui_cls: Type
40+
self._index_ui: Any = None
41+
42+
self._default_settings: dict[str, dict] = {}
43+
self._setting_mappings: dict[str, dict] = {}
44+
45+
def _setup_resources(self):
46+
"""Setup resources for the file index
47+
48+
The resources include:
49+
- Database table
50+
- Vector store
51+
- Document store
52+
- File storage path
53+
"""
3454
Base = declarative_base()
3555
Source = type(
3656
"Source",
@@ -50,6 +70,7 @@ def __init__(self, app, id: int, name: str, config: dict):
5070
"date_created": Column(
5171
DateTime(timezone=True), server_default=func.now()
5272
),
73+
"user": Column(Integer, default=1),
5374
},
5475
)
5576
Index = type(
@@ -61,6 +82,7 @@ def __init__(self, app, id: int, name: str, config: dict):
6182
"source_id": Column(String),
6283
"target_id": Column(String),
6384
"relation_type": Column(Integer),
85+
"user": Column(Integer, default=1),
6486
},
6587
)
6688
self._vs: BaseVectorStore = get_vectorstore(f"index_{self.id}")
@@ -74,16 +96,6 @@ def __init__(self, app, id: int, name: str, config: dict):
7496
"FileStoragePath": self._fs_path,
7597
}
7698

77-
self._indexing_pipeline_cls: Type[BaseFileIndexIndexing]
78-
self._retriever_pipeline_cls: list[Type[BaseFileIndexRetriever]]
79-
self._selector_ui_cls: Type
80-
self._selector_ui: Any = None
81-
self._index_ui_cls: Type
82-
self._index_ui: Any = None
83-
84-
self._default_settings: dict[str, dict] = {}
85-
self._setting_mappings: dict[str, dict] = {}
86-
8799
def _setup_indexing_cls(self):
88100
"""Retrieve the indexing class for the file index
89101
@@ -247,6 +259,7 @@ def on_create(self):
247259
self.config = config
248260

249261
# create the resources
262+
self._setup_resources()
250263
self._resources["Source"].metadata.create_all(engine) # type: ignore
251264
self._resources["Index"].metadata.create_all(engine) # type: ignore
252265
self._fs_path.mkdir(parents=True, exist_ok=True)
@@ -255,6 +268,7 @@ def on_delete(self):
255268
"""Clean up the index when the user delete it"""
256269
import shutil
257270

271+
self._setup_resources()
258272
self._resources["Source"].__table__.drop(engine) # type: ignore
259273
self._resources["Index"].__table__.drop(engine) # type: ignore
260274
self._vs.drop()
@@ -263,6 +277,7 @@ def on_delete(self):
263277

264278
def on_start(self):
265279
"""Setup the classes and hooks"""
280+
self._setup_resources()
266281
self._setup_indexing_cls()
267282
self._setup_retriever_cls()
268283
self._setup_file_index_ui_cls()
@@ -326,9 +341,16 @@ def get_admin_settings(cls):
326341
"Set 0 to disable."
327342
),
328343
},
344+
"private": {
345+
"name": "Make private",
346+
"value": False,
347+
"component": "radio",
348+
"choices": [("Yes", True), ("No", False)],
349+
"info": "If private, files will not be accessible across users.",
350+
},
329351
}
330352

331-
def get_indexing_pipeline(self, settings) -> BaseFileIndexIndexing:
353+
def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
332354
"""Define the interface of the indexing pipeline"""
333355

334356
prefix = f"index.options.{self.id}."
@@ -341,6 +363,7 @@ def get_indexing_pipeline(self, settings) -> BaseFileIndexIndexing:
341363

342364
obj = self._indexing_pipeline_cls.get_pipeline(stripped_settings, self.config)
343365
obj.set_resources(resources=self._resources)
366+
obj._user_id = user_id
344367

345368
return obj
346369

libs/ktem/ktem/index/file/pipelines.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from ktem.components import filestorage_path
1414
from ktem.db.models import engine
1515
from ktem.embeddings.manager import embedding_models_manager
16+
from ktem.llms.manager import llms
1617
from llama_index.vector_stores import (
1718
FilterCondition,
1819
FilterOperator,
@@ -28,7 +29,7 @@
2829
from kotaemon.base import RetrievedDocument
2930
from kotaemon.indices import VectorIndexing, VectorRetrieval
3031
from kotaemon.indices.ingests import DocumentIngestor
31-
from kotaemon.indices.rankings import BaseReranking
32+
from kotaemon.indices.rankings import BaseReranking, LLMReranking
3233

3334
from .base import BaseFileIndexIndexing, BaseFileIndexRetriever
3435

@@ -72,7 +73,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):
7273
"""
7374

7475
vector_retrieval: VectorRetrieval = VectorRetrieval.withx()
75-
reranker: BaseReranking
76+
reranker: BaseReranking = LLMReranking.withx()
7677
get_extra_table: bool = False
7778
mmr: bool = False
7879
top_k: int = 5
@@ -225,12 +226,15 @@ def get_pipeline(cls, user_settings, index_settings, selected):
225226
"""
226227
retriever = cls(
227228
get_extra_table=user_settings["prioritize_table"],
228-
reranker=user_settings["reranking_llm"],
229229
top_k=user_settings["num_retrieval"],
230230
mmr=user_settings["mmr"],
231231
)
232232
if not user_settings["use_reranking"]:
233233
retriever.reranker = None # type: ignore
234+
else:
235+
retriever.reranker.llm = llms.get(
236+
user_settings["reranking_llm"], llms.get_default()
237+
)
234238

235239
retriever.vector_retrieval.embedding = embedding_models_manager[
236240
index_settings.get("embedding", embedding_models_manager.get_default_name())
@@ -342,6 +346,7 @@ def run(
342346
name=Path(file_path).name,
343347
path=file_hash,
344348
size=Path(file_path).stat().st_size,
349+
user=self._user_id, # type: ignore
345350
)
346351
file_to_source[file_path] = source
347352

0 commit comments

Comments
 (0)