From e0515e13ddeb050b8c9fad96db65bfcc1a7fc671 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Fri, 13 Dec 2024 16:48:12 +0900 Subject: [PATCH 01/11] feat: add field to vector item --- autorag/nodes/retrieval/base.py | 9 ++++++--- autorag/nodes/retrieval/vectordb.py | 29 +++++++++++++++++------------ autorag/vectordb/base.py | 2 +- autorag/vectordb/milvus.py | 13 +++++++++---- 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/autorag/nodes/retrieval/base.py b/autorag/nodes/retrieval/base.py index 7556c9a9b..9da0e6f4a 100644 --- a/autorag/nodes/retrieval/base.py +++ b/autorag/nodes/retrieval/base.py @@ -102,8 +102,8 @@ def cast_queries(queries: Union[str, List[str]]) -> List[str]: def evenly_distribute_passages( - ids: List[List[str]], scores: List[List[float]], top_k: int -) -> Tuple[List[str], List[float]]: + ids: List[List[str]], scores: List[List[float]], contents: [List[List[str]]], top_k: int +) -> Tuple[List[str], List[float], List[str]]: assert len(ids) == len(scores), "ids and scores must have same length." 
query_cnt = len(ids) avg_len = top_k // query_cnt @@ -111,15 +111,18 @@ def evenly_distribute_passages( new_ids = [] new_scores = [] + new_contents = [] for i in range(query_cnt): if i < remainder: new_ids.extend(ids[i][: avg_len + 1]) new_scores.extend(scores[i][: avg_len + 1]) + new_contents.extend(contents[i][: avg_len + 1]) else: new_ids.extend(ids[i][:avg_len]) new_scores.extend(scores[i][:avg_len]) + new_contents.extend(contents[i][:avg_len]) - return new_ids, new_scores + return new_ids, new_scores, new_contents def get_bm25_pkl_name(bm25_tokenizer: str): diff --git a/autorag/nodes/retrieval/vectordb.py b/autorag/nodes/retrieval/vectordb.py index ba0cc2c13..a02bca4bf 100644 --- a/autorag/nodes/retrieval/vectordb.py +++ b/autorag/nodes/retrieval/vectordb.py @@ -69,8 +69,11 @@ def __del__(self): def pure(self, previous_result: pd.DataFrame, *args, **kwargs): queries = self.cast_to_run(previous_result) pure_params = pop_params(self._pure, kwargs) - ids, scores = self._pure(queries, **pure_params) - contents = fetch_contents(self.corpus_df, ids) + ids, scores, contents = self._pure(queries, **pure_params) + # contents = fetch_contents(self.corpus_df, ids) + ids = [[_id[0]] for _id in ids] + scores = [[score[0]] for score in scores] + contents = [[content[0]] for content in contents] return contents, ids, scores def _pure( @@ -79,7 +82,7 @@ def _pure( top_k: int, embedding_batch: int = 128, ids: Optional[List[List[str]]] = None, - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: """ VectorDB retrieval function. You have to get a chroma collection that is already ingested. 
@@ -113,7 +116,8 @@ def _pure( ) id_result = list(map(lambda x: x[0], results)) score_result = list(map(lambda x: x[1], results)) - return id_result, score_result + content_result = list(map(lambda x: x[2], results)) + return id_result, score_result, content_result def __get_ids_scores(self, queries, ids, embedding_batch: int): # truncate queries and embedding execution here. @@ -167,7 +171,7 @@ async def run_fetch(ids): async def vectordb_pure( queries: List[str], top_k: int, vectordb: BaseVectorStore -) -> Tuple[List[str], List[float]]: +) -> Tuple[List[str], List[float], List[str]]: """ Async VectorDB retrieval function. Its usage is for async retrieval of vector_db row by row. @@ -177,19 +181,20 @@ async def vectordb_pure( :param vectordb: The vector store instance. :return: The tuple contains a list of passage ids that are retrieved from vectordb and a list of its scores. """ - id_result, score_result = await vectordb.query(queries=queries, top_k=top_k) + id_result, score_result, content_result = await vectordb.query(queries=queries, top_k=top_k) # Distribute passages evenly - id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k) + id_result, score_result, content_result = evenly_distribute_passages(id_result, score_result, content_result, top_k) # sort id_result and score_result by score result = [ - (_id, score) - for score, _id in sorted( - zip(score_result, id_result), key=lambda pair: pair[0], reverse=True + (_id, score, content) + for score, _id, content in sorted( + zip(score_result, id_result, content_result), key=lambda pair: pair[0], reverse=True ) ] - id_result, score_result = zip(*result) - return list(id_result), list(score_result) + id_result, score_result, content_result = zip(*result) + return list(id_result), list(score_result), list(content_result) + async def filter_exist_ids( diff --git a/autorag/vectordb/base.py b/autorag/vectordb/base.py index e7cd15101..fd456f80a 100644 --- a/autorag/vectordb/base.py +++ 
b/autorag/vectordb/base.py @@ -35,7 +35,7 @@ async def add( @abstractmethod async def query( self, queries: List[str], top_k: int, **kwargs - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: pass @abstractmethod diff --git a/autorag/vectordb/milvus.py b/autorag/vectordb/milvus.py index 7a4a1c9f0..cea2ed3bb 100644 --- a/autorag/vectordb/milvus.py +++ b/autorag/vectordb/milvus.py @@ -68,7 +68,10 @@ def __init__( field = FieldSchema( name="vector", dtype=DataType.FLOAT_VECTOR, dim=dimension ) - schema = CollectionSchema(fields=[pk, field]) + content = FieldSchema( + name="content", dtype=DataType.VARCHAR, max_length=65535 + ) + schema = CollectionSchema(fields=[pk, field, content]) self.collection = Collection(name=self.collection_name, schema=schema) index_params = { @@ -90,7 +93,7 @@ async def add(self, ids: List[str], texts: List[str]): # make data for insertion data = list( - map(lambda _id, vector: {"id": _id, "vector": vector}, ids, text_embeddings) + map(lambda _id, vector, text: {"id": _id, "vector": vector, "content":text}, ids, text_embeddings, texts) ) # Insert data into the collection @@ -103,7 +106,7 @@ async def add(self, ids: List[str], texts: List[str]): async def query( self, queries: List[str], top_k: int, **kwargs - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: queries = self.truncated_inputs(queries) query_embeddings: List[ List[float] @@ -117,6 +120,7 @@ async def query( limit=top_k, anns_field="vector", param={"metric_type": self.similarity_metric.upper()}, + output_fields=["vector","content"], timeout=self.timeout, **kwargs, ) @@ -124,11 +128,12 @@ async def query( # Extract IDs and distances ids = [[str(hit.id) for hit in result] for result in results] distances = [[hit.distance for hit in result] for result in results] + contents = [[str(hit.fields["content"]) for hit in result] for result in results] if 
self.similarity_metric in ["l2"]: distances = apply_recursive(lambda x: -x, distances) - return ids, distances + return ids, distances, contents async def fetch(self, ids: List[str]) -> List[List[float]]: try: From 7dc7528e82a3da7d69ae702c59e9af16f08dcb60 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Fri, 13 Dec 2024 16:48:12 +0900 Subject: [PATCH 02/11] feat: add field to vector item --- autorag/nodes/retrieval/base.py | 9 ++++++--- autorag/nodes/retrieval/vectordb.py | 29 +++++++++++++++++------------ autorag/vectordb/base.py | 2 +- autorag/vectordb/milvus.py | 13 +++++++++---- 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/autorag/nodes/retrieval/base.py b/autorag/nodes/retrieval/base.py index 7556c9a9b..9da0e6f4a 100644 --- a/autorag/nodes/retrieval/base.py +++ b/autorag/nodes/retrieval/base.py @@ -102,8 +102,8 @@ def cast_queries(queries: Union[str, List[str]]) -> List[str]: def evenly_distribute_passages( - ids: List[List[str]], scores: List[List[float]], top_k: int -) -> Tuple[List[str], List[float]]: + ids: List[List[str]], scores: List[List[float]], contents: [List[List[str]]], top_k: int +) -> Tuple[List[str], List[float], List[str]]: assert len(ids) == len(scores), "ids and scores must have same length." 
query_cnt = len(ids) avg_len = top_k // query_cnt @@ -111,15 +111,18 @@ def evenly_distribute_passages( new_ids = [] new_scores = [] + new_contents = [] for i in range(query_cnt): if i < remainder: new_ids.extend(ids[i][: avg_len + 1]) new_scores.extend(scores[i][: avg_len + 1]) + new_contents.extend(contents[i][: avg_len + 1]) else: new_ids.extend(ids[i][:avg_len]) new_scores.extend(scores[i][:avg_len]) + new_contents.extend(contents[i][:avg_len]) - return new_ids, new_scores + return new_ids, new_scores, new_contents def get_bm25_pkl_name(bm25_tokenizer: str): diff --git a/autorag/nodes/retrieval/vectordb.py b/autorag/nodes/retrieval/vectordb.py index ba0cc2c13..a02bca4bf 100644 --- a/autorag/nodes/retrieval/vectordb.py +++ b/autorag/nodes/retrieval/vectordb.py @@ -69,8 +69,11 @@ def __del__(self): def pure(self, previous_result: pd.DataFrame, *args, **kwargs): queries = self.cast_to_run(previous_result) pure_params = pop_params(self._pure, kwargs) - ids, scores = self._pure(queries, **pure_params) - contents = fetch_contents(self.corpus_df, ids) + ids, scores, contents = self._pure(queries, **pure_params) + # contents = fetch_contents(self.corpus_df, ids) + ids = [[_id[0]] for _id in ids] + scores = [[score[0]] for score in scores] + contents = [[content[0]] for content in contents] return contents, ids, scores def _pure( @@ -79,7 +82,7 @@ def _pure( top_k: int, embedding_batch: int = 128, ids: Optional[List[List[str]]] = None, - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: """ VectorDB retrieval function. You have to get a chroma collection that is already ingested. 
@@ -113,7 +116,8 @@ def _pure( ) id_result = list(map(lambda x: x[0], results)) score_result = list(map(lambda x: x[1], results)) - return id_result, score_result + content_result = list(map(lambda x: x[2], results)) + return id_result, score_result, content_result def __get_ids_scores(self, queries, ids, embedding_batch: int): # truncate queries and embedding execution here. @@ -167,7 +171,7 @@ async def run_fetch(ids): async def vectordb_pure( queries: List[str], top_k: int, vectordb: BaseVectorStore -) -> Tuple[List[str], List[float]]: +) -> Tuple[List[str], List[float], List[str]]: """ Async VectorDB retrieval function. Its usage is for async retrieval of vector_db row by row. @@ -177,19 +181,20 @@ async def vectordb_pure( :param vectordb: The vector store instance. :return: The tuple contains a list of passage ids that are retrieved from vectordb and a list of its scores. """ - id_result, score_result = await vectordb.query(queries=queries, top_k=top_k) + id_result, score_result, content_result = await vectordb.query(queries=queries, top_k=top_k) # Distribute passages evenly - id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k) + id_result, score_result, content_result = evenly_distribute_passages(id_result, score_result, content_result, top_k) # sort id_result and score_result by score result = [ - (_id, score) - for score, _id in sorted( - zip(score_result, id_result), key=lambda pair: pair[0], reverse=True + (_id, score, content) + for score, _id, content in sorted( + zip(score_result, id_result, content_result), key=lambda pair: pair[0], reverse=True ) ] - id_result, score_result = zip(*result) - return list(id_result), list(score_result) + id_result, score_result, content_result = zip(*result) + return list(id_result), list(score_result), list(content_result) + async def filter_exist_ids( diff --git a/autorag/vectordb/base.py b/autorag/vectordb/base.py index e7cd15101..fd456f80a 100644 --- a/autorag/vectordb/base.py +++ 
b/autorag/vectordb/base.py @@ -35,7 +35,7 @@ async def add( @abstractmethod async def query( self, queries: List[str], top_k: int, **kwargs - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: pass @abstractmethod diff --git a/autorag/vectordb/milvus.py b/autorag/vectordb/milvus.py index 0bc4103bc..4be88d3c7 100644 --- a/autorag/vectordb/milvus.py +++ b/autorag/vectordb/milvus.py @@ -68,7 +68,10 @@ def __init__( field = FieldSchema( name="vector", dtype=DataType.FLOAT_VECTOR, dim=dimension ) - schema = CollectionSchema(fields=[pk, field]) + content = FieldSchema( + name="content", dtype=DataType.VARCHAR, max_length=65535 + ) + schema = CollectionSchema(fields=[pk, field, content]) self.collection = Collection(name=self.collection_name, schema=schema) index_params = { @@ -90,7 +93,7 @@ async def add(self, ids: List[str], texts: List[str]): # make data for insertion data = list( - map(lambda _id, vector: {"id": _id, "vector": vector}, ids, text_embeddings) + map(lambda _id, vector, text: {"id": _id, "vector": vector, "content":text}, ids, text_embeddings, texts) ) # Insert data into the collection @@ -103,7 +106,7 @@ async def add(self, ids: List[str], texts: List[str]): async def query( self, queries: List[str], top_k: int, **kwargs - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: queries = self.truncated_inputs(queries) query_embeddings: List[ List[float] @@ -117,6 +120,7 @@ async def query( limit=top_k, anns_field="vector", param={"metric_type": self.similarity_metric.upper()}, + output_fields=["vector","content"], timeout=self.timeout, **kwargs, ) @@ -124,11 +128,12 @@ async def query( # Extract IDs and distances ids = [[str(hit.id) for hit in result] for result in results] distances = [[hit.distance for hit in result] for result in results] + contents = [[str(hit.fields["content"]) for hit in result] for result in results] if 
self.similarity_metric in ["l2"]: distances = apply_recursive(lambda x: -x, distances) - return ids, distances + return ids, distances, contents async def fetch(self, ids: List[str]) -> List[List[float]]: try: From 329e0cf420500a450caae5b006883f4e90f71fde Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Mon, 16 Dec 2024 10:28:01 +0900 Subject: [PATCH 03/11] feat: update chroma --- autorag/vectordb/chroma.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/autorag/vectordb/chroma.py b/autorag/vectordb/chroma.py index 3b198b780..37a73f21c 100644 --- a/autorag/vectordb/chroma.py +++ b/autorag/vectordb/chroma.py @@ -68,9 +68,9 @@ async def add(self, ids: List[str], texts: List[str]): texts = self.truncated_inputs(texts) text_embeddings = await self.embedding.aget_text_embedding_batch(texts) if isinstance(self.collection, AsyncCollection): - await self.collection.add(ids=ids, embeddings=text_embeddings) + await self.collection.add(ids=ids, embeddings=text_embeddings, documents=texts) else: - self.collection.add(ids=ids, embeddings=text_embeddings) + self.collection.add(ids=ids, embeddings=text_embeddings, documents=texts) async def fetch(self, ids: List[str]) -> List[List[float]]: if isinstance(self.collection, AsyncCollection): @@ -92,7 +92,7 @@ async def is_exist(self, ids: List[str]) -> List[bool]: async def query( self, queries: List[str], top_k: int, **kwargs - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: queries = self.truncated_inputs(queries) query_embeddings: List[ List[float] @@ -107,8 +107,9 @@ async def query( ) ids = query_result["ids"] scores = query_result["distances"] + contents = query_result["documents"] scores = apply_recursive(lambda x: 1 - x, scores) - return ids, scores + return ids, scores, contents async def delete(self, ids: List[str]): if isinstance(self.collection, AsyncCollection): From 28efa65883a495691d45e5044e78a84a08b11eb7 Mon Sep 17 00:00:00 
2001 From: Um Changyong Date: Mon, 16 Dec 2024 10:28:29 +0900 Subject: [PATCH 04/11] feat: update couchbase --- autorag/vectordb/couchbase.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/autorag/vectordb/couchbase.py b/autorag/vectordb/couchbase.py index b69125885..5c7116117 100644 --- a/autorag/vectordb/couchbase.py +++ b/autorag/vectordb/couchbase.py @@ -133,7 +133,7 @@ async def query( List[float] ] = await self.embedding.aget_text_embedding_batch(queries) - ids, scores = [], [] + ids, scores, texts = [], [], [] for query_embedding in query_embeddings: # Create Search Request search_req = search.SearchRequest.create( @@ -151,27 +151,29 @@ async def query( search_iter = self.scope.search( self.index_name, search_req, - SearchOptions(limit=top_k), + SearchOptions(limit=top_k, fields=[self.text_key]), ) else: search_iter = self.cluster.search( self.index_name, search_req, - SearchOptions(limit=top_k), + SearchOptions(limit=top_k, fields=[self.text_key]), ) # Parse the search results # search_iter.rows() can only be iterated once. 
- id_list, score_list = [], [] + id_list, score_list, text_list = [], [], [] for result in search_iter.rows(): id_list.append(result.id) score_list.append(result.score) + text_list.append(result.fields[self.text_key]) ids.append(id_list) scores.append(score_list) + texts.append(text_list) - return ids, scores + return ids, scores, texts async def delete(self, ids: List[str]): self.collection.remove_multi(ids) From 2e9e3436dfd05d43230e1422ec9abb5f08964656 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Mon, 16 Dec 2024 10:48:22 +0900 Subject: [PATCH 05/11] fix: update return object of couchbase.query() --- autorag/vectordb/couchbase.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autorag/vectordb/couchbase.py b/autorag/vectordb/couchbase.py index 5c7116117..900590133 100644 --- a/autorag/vectordb/couchbase.py +++ b/autorag/vectordb/couchbase.py @@ -27,7 +27,7 @@ def __init__( username: str = "", password: str = "", ingest_batch: int = 100, - text_key: Optional[str] = "text", + text_key: Optional[str] = "content", embedding_key: Optional[str] = "embedding", scoped_index: bool = True, ): @@ -123,7 +123,7 @@ async def is_exist(self, ids: List[str]) -> List[bool]: async def query( self, queries: List[str], top_k: int, **kwargs - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: import couchbase.search as search from couchbase.options import SearchOptions from couchbase.vector_search import VectorQuery, VectorSearch From d993a25ebb87a9ecd1e9b6a93585a9da22f68bb0 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Mon, 16 Dec 2024 10:53:51 +0900 Subject: [PATCH 06/11] feat: update pinecone --- autorag/vectordb/pinecone.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/autorag/vectordb/pinecone.py b/autorag/vectordb/pinecone.py index 4f8f04a25..cf78cc9f6 100644 --- a/autorag/vectordb/pinecone.py +++ b/autorag/vectordb/pinecone.py @@ -22,6 +22,7 @@ 
def __init__( cloud: Optional[str] = "aws", region: Optional[str] = "us-east-1", api_key: Optional[str] = None, + text_key: Optional[str] = "content", deletion_protection: Optional[str] = "disabled", # "enabled" or "disabled" namespace: Optional[str] = "default", ingest_batch: int = 200, @@ -31,6 +32,7 @@ def __init__( self.index_name = index_name self.namespace = namespace self.ingest_batch = ingest_batch + self.text_key = text_key self.client = Pinecone_client(api_key=api_key) @@ -58,7 +60,11 @@ async def add(self, ids: List[str], texts: List[str]): List[float] ] = await self.embedding.aget_text_embedding_batch(texts) - vector_tuples = list(zip(ids, text_embeddings)) + metadatas = [{} for _ in texts] + for metadata, text in zip(metadatas, texts): + metadata[self.text_key] = text + + vector_tuples = list(zip(ids, text_embeddings, metadatas)) batch_vectors = make_batch(vector_tuples, self.ingest_batch) async_res = [ @@ -87,28 +93,30 @@ async def query( self, queries: List[str], top_k: int, **kwargs - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: queries = self.truncated_inputs(queries) query_embeddings: List[ List[float] ] = await self.embedding.aget_text_embedding_batch(queries) - ids, scores = [], [] + ids, scores, texts = [], [], [] for query_embedding in query_embeddings: response = self.index.query( vector=query_embedding, top_k=top_k, include_values=True, + include_metadata=True, namespace=self.namespace, ) ids.append([o.id for o in response.matches]) scores.append([o.score for o in response.matches]) + texts.append([o.metadata[self.text_key] for o in response.matches]) if self.similarity_metric in ["l2"]: scores = apply_recursive(lambda x: -x, scores) - return ids, scores + return ids, scores, texts async def delete(self, ids: List[str]): # Delete entries by IDs From 6cacb4ea90f5090674d031efe67365ef36ba12c9 Mon Sep 17 00:00:00 2001 From: 
Um Changyong Date: Mon, 16 Dec 2024 11:13:39 +0900 Subject: [PATCH 07/11] feat: update qdrant --- autorag/vectordb/qdrant.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/autorag/vectordb/qdrant.py b/autorag/vectordb/qdrant.py index fa80d1004..a18f171a4 100644 --- a/autorag/vectordb/qdrant.py +++ b/autorag/vectordb/qdrant.py @@ -11,7 +11,7 @@ SearchRequest, ) -from typing import List, Tuple +from typing import List, Optional, Tuple from autorag.vectordb import BaseVectorStore @@ -33,6 +33,7 @@ def __init__( ingest_batch: int = 64, parallel: int = 1, max_retries: int = 3, + text_key: Optional[str] = "content", ): super().__init__(embedding_model, similarity_metric, embedding_batch) @@ -40,6 +41,7 @@ def __init__( self.ingest_batch = ingest_batch self.parallel = parallel self.max_retries = max_retries + self.text_key = text_key if similarity_metric == "cosine": distance = Distance.COSINE @@ -82,8 +84,12 @@ async def add(self, ids: List[str], texts: List[str]): texts = self.truncated_inputs(texts) text_embeddings = await self.embedding.aget_text_embedding_batch(texts) + metadatas = [{} for _ in texts] + for metadata, text in zip(metadatas, texts): + metadata[self.text_key] = text + points = list( - map(lambda x: PointStruct(id=x[0], vector=x[1]), zip(ids, text_embeddings)) + map(lambda x: PointStruct(id=x[0], vector=x[1], payload=x[2]), zip(ids, text_embeddings, metadatas)) ) self.client.upload_points( @@ -119,7 +125,7 @@ async def is_exist(self, ids: List[str]) -> List[bool]: async def query( self, queries: List[str], top_k: int, **kwargs - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: queries = self.truncated_inputs(queries) query_embeddings: List[ List[float] @@ -127,7 +133,7 @@ async def query( search_queries = list( map( - lambda x: SearchRequest(vector=x, limit=top_k, with_vector=True), + lambda x: SearchRequest(vector=x, limit=top_k, with_vector=True, 
with_payload=True), query_embeddings, ) ) @@ -139,8 +145,9 @@ async def query( # Extract IDs and distances ids = [[str(hit.id) for hit in result] for result in search_result] scores = [[hit.score for hit in result] for result in search_result] + contents = [[hit.payload.get(self.text_key) for hit in result] for result in search_result] - return ids, scores + return ids, scores, contents async def delete(self, ids: List[str]): self.client.delete( From 1dc236632fb8f6bb4c9def6aae0f29e1a2db3198 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Mon, 16 Dec 2024 11:21:43 +0900 Subject: [PATCH 08/11] feat: update vectordb(weaviate) --- autorag/vectordb/weaviate.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/autorag/vectordb/weaviate.py b/autorag/vectordb/weaviate.py index 6b8b333fd..88444c3c7 100644 --- a/autorag/vectordb/weaviate.py +++ b/autorag/vectordb/weaviate.py @@ -120,13 +120,13 @@ async def is_exist(self, ids: List[str]) -> List[bool]: async def query( self, queries: List[str], top_k: int, **kwargs - ) -> Tuple[List[List[str]], List[List[float]]]: + ) -> Tuple[List[List[str]], List[List[float]], List[List[str]]]: queries = self.truncated_inputs(queries) query_embeddings: List[ List[float] ] = await self.embedding.aget_text_embedding_batch(queries) - ids, scores = [], [] + ids, scores, contents = [], [], [] for query_embedding in query_embeddings: response = self.collection.query.near_vector( near_vector=query_embedding, @@ -141,8 +141,9 @@ async def query( for o in response.objects ] ) + contents.append([o.properties[self.text_key] for o in response.objects]) - return ids, scores + return ids, scores, contents async def delete(self, ids: List[str]): filter = wvc.query.Filter.by_id().contains_any(ids) From 8fb2be1f8bf19be4628159a86ff1c146eff30094 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Mon, 16 Dec 2024 11:52:00 +0900 Subject: [PATCH 09/11] fix: reformatting... 
--- autorag/nodes/retrieval/base.py | 5 ++++- autorag/nodes/retrieval/vectordb.py | 2 +- autorag/vectordb/chroma.py | 4 +++- autorag/vectordb/milvus.py | 2 +- autorag/vectordb/pinecone.py | 4 ++-- autorag/vectordb/qdrant.py | 16 ++++++++++++---- 6 files changed, 23 insertions(+), 10 deletions(-) diff --git a/autorag/nodes/retrieval/base.py b/autorag/nodes/retrieval/base.py index 9da0e6f4a..16fb8335d 100644 --- a/autorag/nodes/retrieval/base.py +++ b/autorag/nodes/retrieval/base.py @@ -102,7 +102,10 @@ def cast_queries(queries: Union[str, List[str]]) -> List[str]: def evenly_distribute_passages( - ids: List[List[str]], scores: List[List[float]], contents: [List[List[str]]], top_k: int + ids: List[List[str]], + scores: List[List[float]], + contents: [List[List[str]]], + top_k: int, ) -> Tuple[List[str], List[float], List[str]]: assert len(ids) == len(scores), "ids and scores must have same length." query_cnt = len(ids) diff --git a/autorag/nodes/retrieval/vectordb.py b/autorag/nodes/retrieval/vectordb.py index a02bca4bf..fcaa1cb77 100644 --- a/autorag/nodes/retrieval/vectordb.py +++ b/autorag/nodes/retrieval/vectordb.py @@ -27,7 +27,7 @@ flatten_apply, result_to_dataframe, pop_params, - fetch_contents, + # fetch_contents, empty_cuda_cache, convert_inputs_to_list, make_batch, diff --git a/autorag/vectordb/chroma.py b/autorag/vectordb/chroma.py index 37a73f21c..c7375fef2 100644 --- a/autorag/vectordb/chroma.py +++ b/autorag/vectordb/chroma.py @@ -68,7 +68,9 @@ async def add(self, ids: List[str], texts: List[str]): texts = self.truncated_inputs(texts) text_embeddings = await self.embedding.aget_text_embedding_batch(texts) if isinstance(self.collection, AsyncCollection): - await self.collection.add(ids=ids, embeddings=text_embeddings, documents=texts) + await self.collection.add( + ids=ids, embeddings=text_embeddings, documents=texts + ) else: self.collection.add(ids=ids, embeddings=text_embeddings, documents=texts) diff --git a/autorag/vectordb/milvus.py 
b/autorag/vectordb/milvus.py index 4be88d3c7..bd2e7638c 100644 --- a/autorag/vectordb/milvus.py +++ b/autorag/vectordb/milvus.py @@ -68,7 +68,7 @@ def __init__( field = FieldSchema( name="vector", dtype=DataType.FLOAT_VECTOR, dim=dimension ) - content = FieldSchema( + content = FieldSchema( name="content", dtype=DataType.VARCHAR, max_length=65535 ) schema = CollectionSchema(fields=[pk, field, content]) diff --git a/autorag/vectordb/pinecone.py b/autorag/vectordb/pinecone.py index cf78cc9f6..28e16304e 100644 --- a/autorag/vectordb/pinecone.py +++ b/autorag/vectordb/pinecone.py @@ -62,8 +62,8 @@ async def add(self, ids: List[str], texts: List[str]): metadatas = [{} for _ in texts] for metadata, text in zip(metadatas, texts): - metadata[self.text_key] = text - + metadata[self.text_key] = text + vector_tuples = list(zip(ids, text_embeddings, metadatas)) batch_vectors = make_batch(vector_tuples, self.ingest_batch) diff --git a/autorag/vectordb/qdrant.py b/autorag/vectordb/qdrant.py index a18f171a4..680bb810c 100644 --- a/autorag/vectordb/qdrant.py +++ b/autorag/vectordb/qdrant.py @@ -86,10 +86,13 @@ async def add(self, ids: List[str], texts: List[str]): metadatas = [{} for _ in texts] for metadata, text in zip(metadatas, texts): - metadata[self.text_key] = text + metadata[self.text_key] = text points = list( - map(lambda x: PointStruct(id=x[0], vector=x[1], payload=x[2]), zip(ids, text_embeddings, metadatas)) + map( + lambda x: PointStruct(id=x[0], vector=x[1], payload=x[2]), + zip(ids, text_embeddings, metadatas), + ) ) self.client.upload_points( @@ -133,7 +136,9 @@ async def query( search_queries = list( map( - lambda x: SearchRequest(vector=x, limit=top_k, with_vector=True, with_payload=True), + lambda x: SearchRequest( + vector=x, limit=top_k, with_vector=True, with_payload=True + ), query_embeddings, ) ) @@ -145,7 +150,10 @@ async def query( # Extract IDs and distances ids = [[str(hit.id) for hit in result] for result in search_result] scores = [[hit.score for hit 
in result] for result in search_result] - contents = [[hit.payload.get(self.text_key) for hit in result] for result in search_result] + contents = [ + [hit.payload.get(self.text_key) for hit in result] + for result in search_result + ] return ids, scores, contents From 081110ef1e5fb877c0148807812a0b57a10fa036 Mon Sep 17 00:00:00 2001 From: e7217 Date: Mon, 16 Dec 2024 22:26:30 +0900 Subject: [PATCH 10/11] fix: update some code and test code. --- autorag/nodes/retrieval/base.py | 2 +- autorag/nodes/retrieval/bm25.py | 5 ++++- autorag/nodes/retrieval/vectordb.py | 21 ++++++++++++------- .../nodes/retrieval/test_retrieval_base.py | 4 +++- .../autorag/nodes/retrieval/test_vectordb.py | 6 +++--- tests/autorag/vectordb/test_chroma.py | 7 +++++-- tests/autorag/vectordb/test_couchbase.py | 7 +++++-- tests/autorag/vectordb/test_milvus.py | 6 ++++-- tests/autorag/vectordb/test_pinecone.py | 7 +++++-- tests/autorag/vectordb/test_qdrant.py | 7 +++++-- 10 files changed, 49 insertions(+), 23 deletions(-) diff --git a/autorag/nodes/retrieval/base.py b/autorag/nodes/retrieval/base.py index 16fb8335d..8e4a747fc 100644 --- a/autorag/nodes/retrieval/base.py +++ b/autorag/nodes/retrieval/base.py @@ -104,7 +104,7 @@ def cast_queries(queries: Union[str, List[str]]) -> List[str]: def evenly_distribute_passages( ids: List[List[str]], scores: List[List[float]], - contents: [List[List[str]]], + contents: List[List[str]], top_k: int, ) -> Tuple[List[str], List[float], List[str]]: assert len(ids) == len(scores), "ids and scores must have same length." 
diff --git a/autorag/nodes/retrieval/bm25.py b/autorag/nodes/retrieval/bm25.py index 80ac44ded..b03367fd2 100644 --- a/autorag/nodes/retrieval/bm25.py +++ b/autorag/nodes/retrieval/bm25.py @@ -265,8 +265,11 @@ async def bm25_pure( id_result.append(ids) score_result.append(sorted_scores[:top_k]) + # dummy contents + dummy_contents = [["" for _ in _id_result] for _id_result in id_result] + # make a total result to top_k - id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k) + id_result, score_result, content_result = evenly_distribute_passages(id_result, score_result, dummy_contents, top_k) # sort id_result and score_result by score result = [ (_id, score) diff --git a/autorag/nodes/retrieval/vectordb.py b/autorag/nodes/retrieval/vectordb.py index fcaa1cb77..733a97448 100644 --- a/autorag/nodes/retrieval/vectordb.py +++ b/autorag/nodes/retrieval/vectordb.py @@ -27,7 +27,7 @@ flatten_apply, result_to_dataframe, pop_params, - # fetch_contents, + fetch_contents, empty_cuda_cache, convert_inputs_to_list, make_batch, @@ -69,11 +69,18 @@ def __del__(self): def pure(self, previous_result: pd.DataFrame, *args, **kwargs): queries = self.cast_to_run(previous_result) pure_params = pop_params(self._pure, kwargs) - ids, scores, contents = self._pure(queries, **pure_params) - # contents = fetch_contents(self.corpus_df, ids) - ids = [[_id[0]] for _id in ids] - scores = [[score[0]] for score in scores] - contents = [[content[0]] for content in contents] + ids, scores, contents = self._pure(queries, **pure_params) + + ids = [[_ for _ in _id] for _id in ids] + scores = [[_ for _ in score] for score in scores] + + # TODO: Refactor to a single logic that can handle all situations. 
+ if pure_params.get("ids", None) is not None: + contents = [] + contents = fetch_contents(self.corpus_df, ids) + else: + contents = [[_ for _ in content] for content in contents] + return contents, ids, scores def _pure( @@ -166,7 +173,7 @@ async def run_fetch(ids): content_embeddings, ) ) - return ids, score_result + return ids, score_result, queries async def vectordb_pure( diff --git a/tests/autorag/nodes/retrieval/test_retrieval_base.py b/tests/autorag/nodes/retrieval/test_retrieval_base.py index bb64689a2..975641fcd 100644 --- a/tests/autorag/nodes/retrieval/test_retrieval_base.py +++ b/tests/autorag/nodes/retrieval/test_retrieval_base.py @@ -83,8 +83,10 @@ def base_retrieval_node_test(result_df): def test_evenly_distribute_passages(): ids = [[f"test-{i}-{j}" for i in range(10)] for j in range(3)] scores = [[i for i in range(10)] for _ in range(3)] + contents = [[f"test-{i}-{j}" for i in range(10)] for j in range(3)] top_k = 10 - new_ids, new_scores = evenly_distribute_passages(ids, scores, top_k) + new_ids, new_scores, new_content = evenly_distribute_passages(ids, scores, contents, top_k) assert len(new_ids) == top_k assert len(new_scores) == top_k assert new_scores == [0, 1, 2, 3, 0, 1, 2, 0, 1, 2] + assert len(new_content) == top_k diff --git a/tests/autorag/nodes/retrieval/test_vectordb.py b/tests/autorag/nodes/retrieval/test_vectordb.py index 2c9ee6b2f..9c602f92e 100644 --- a/tests/autorag/nodes/retrieval/test_vectordb.py +++ b/tests/autorag/nodes/retrieval/test_vectordb.py @@ -139,7 +139,7 @@ def vectordb_instance(project_dir_for_vectordb_node): def test_vectordb_retrieval(vectordb_instance): top_k = 4 - id_result, score_result = vectordb_instance._pure( + id_result, score_result, content_result = vectordb_instance._pure( queries, top_k=top_k, ) @@ -148,7 +148,7 @@ def test_vectordb_retrieval(vectordb_instance): def test_vectordb_retrieval_ids(vectordb_instance): ids = [["doc2", "doc3"], ["doc1", "doc2"], ["doc4", "doc5"]] - id_result, score_result = 
vectordb_instance._pure( + id_result, score_result, content_result = vectordb_instance._pure( queries, top_k=4, ids=ids, @@ -160,7 +160,7 @@ def test_vectordb_retrieval_ids_empty(vectordb_instance): ids = [["doc2", "doc3"], [], ["doc4"]] - id_result, score_result = vectordb_instance._pure( + id_result, score_result, content_result = vectordb_instance._pure( queries, top_k=4, ids=ids, diff --git a/tests/autorag/vectordb/test_chroma.py b/tests/autorag/vectordb/test_chroma.py index 36edf3c41..44b424cbd 100644 --- a/tests/autorag/vectordb/test_chroma.py +++ b/tests/autorag/vectordb/test_chroma.py @@ -20,10 +20,12 @@ async def test_add_and_query_documents(chroma_ephemeral): # Query documents queries = ["test document"] - contents, scores = await chroma_ephemeral.query(queries, top_k=2) + result_ids, scores, contents = await chroma_ephemeral.query(queries, top_k=2) + assert len(result_ids) == 1 assert len(contents) == 1 assert len(scores) == 1 + assert len(result_ids[0]) == 2 assert len(contents[0]) == 2 assert len(scores[0]) == 2 assert scores[0][0] > scores[0][1] @@ -50,7 +52,8 @@ async def test_delete_documents(chroma_ephemeral): # Query documents to ensure they are deleted queries = ["test document"] - contents, scores = await chroma_ephemeral.query(queries, top_k=2) + result_ids, scores, contents = await chroma_ephemeral.query(queries, top_k=2) + assert len(result_ids[0]) == 1 assert len(contents[0]) == 1 assert len(scores[0]) == 1 diff --git a/tests/autorag/vectordb/test_couchbase.py b/tests/autorag/vectordb/test_couchbase.py index a41bfebed..7562636ee 100644 --- a/tests/autorag/vectordb/test_couchbase.py +++ b/tests/autorag/vectordb/test_couchbase.py @@ -41,10 +41,12 @@ async def test_add_and_query_documents(couchbase_instance): # Query documents queries = ["test document"] - contents, scores = await couchbase_instance.query(queries,
top_k=2) + assert len(result_ids) == 1 assert len(contents) == 1 assert len(scores) == 1 + assert len(result_ids[0]) == 2 assert len(contents[0]) == 2 assert len(scores[0]) == 2 assert scores[0][0] > scores[0][1] @@ -77,7 +79,8 @@ async def test_delete_documents(couchbase_instance): # Query documents to ensure they are deleted queries = ["test document"] - contents, scores = await couchbase_instance.query(queries, top_k=2) + result_ids, scores, contents = await couchbase_instance.query(queries, top_k=2) + assert len(result_ids[0]) == 1 assert len(contents[0]) == 1 assert len(scores[0]) == 1 diff --git a/tests/autorag/vectordb/test_milvus.py b/tests/autorag/vectordb/test_milvus.py index 7d6247f8b..f388f2479 100644 --- a/tests/autorag/vectordb/test_milvus.py +++ b/tests/autorag/vectordb/test_milvus.py @@ -39,10 +39,12 @@ async def test_add_and_query_documents(milvus_instance): # Query documents queries = ["test document"] - contents, scores = await milvus_instance.query(queries, top_k=2) + result_ids, contents, scores = await milvus_instance.query(queries, top_k=2) + assert len(result_ids) == 1 assert len(contents) == 1 assert len(scores) == 1 + assert len(result_ids[0]) == 2 assert len(contents[0]) == 2 assert len(scores[0]) == 2 assert scores[0][0] > scores[0][1] @@ -75,7 +77,7 @@ async def test_delete_documents(milvus_instance): # Query documents to ensure they are deleted queries = ["test document"] - contents, scores = await milvus_instance.query(queries, top_k=2) + result_ids, contents, scores = await milvus_instance.query(queries, top_k=2) assert len(contents[0]) == 1 assert len(scores[0]) == 1 diff --git a/tests/autorag/vectordb/test_pinecone.py b/tests/autorag/vectordb/test_pinecone.py index d53e4cb04..55580cb3c 100644 --- a/tests/autorag/vectordb/test_pinecone.py +++ b/tests/autorag/vectordb/test_pinecone.py @@ -38,10 +38,12 @@ async def test_add_and_query_documents(pinecone_instance): # Query documents queries = ["test document"] - contents, scores = await 
pinecone_instance.query(queries, top_k=2) + result_ids, scores, contents = await pinecone_instance.query(queries, top_k=2) + assert len(result_ids) == 1 assert len(contents) == 1 assert len(scores) == 1 + assert len(result_ids[0]) == 2 assert len(contents[0]) == 2 assert len(scores[0]) == 2 assert scores[0][0] > scores[0][1] @@ -74,7 +76,8 @@ async def test_delete_documents(pinecone_instance): # Query documents to ensure they are deleted queries = ["test document"] - contents, scores = await pinecone_instance.query(queries, top_k=2) + result_ids, scores, contents = await pinecone_instance.query(queries, top_k=2) + assert len(result_ids[0]) == 1 assert len(contents[0]) == 1 assert len(scores[0]) == 1 diff --git a/tests/autorag/vectordb/test_qdrant.py b/tests/autorag/vectordb/test_qdrant.py index 7770610ff..c725e3edd 100644 --- a/tests/autorag/vectordb/test_qdrant.py +++ b/tests/autorag/vectordb/test_qdrant.py @@ -39,10 +39,12 @@ async def test_add_and_query_documents(qdrant_instance): # Query documents queries = ["test document"] - contents, scores = await qdrant_instance.query(queries, top_k=2) + result_ids, scores, contents = await qdrant_instance.query(queries, top_k=2) + assert len(result_ids) == 1 assert len(contents) == 1 assert len(scores) == 1 + assert len(result_ids[0]) == 2 assert len(contents[0]) == 2 assert len(scores[0]) == 2 assert scores[0][0] > scores[0][1] @@ -75,7 +77,8 @@ async def test_delete_documents(qdrant_instance): # Query documents to ensure they are deleted queries = ["test document"] - contents, scores = await qdrant_instance.query(queries, top_k=2) + result_ids, scores, contents = await qdrant_instance.query(queries, top_k=2) + assert len(result_ids[0]) == 1 assert len(contents[0]) == 1 assert len(scores[0]) == 1 From 3e2c634aa61173fc0efe891242303f473abb60ca Mon Sep 17 00:00:00 2001 From: e7217 Date: Mon, 16 Dec 2024 23:07:20 +0900 Subject: [PATCH 11/11] feat: add and change in test_vectordbs --- tests/autorag/vectordb/test_milvus.py | 5 
+++-- tests/autorag/vectordb/test_weaviate.py | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/autorag/vectordb/test_milvus.py b/tests/autorag/vectordb/test_milvus.py index f388f2479..4d83c0da2 100644 --- a/tests/autorag/vectordb/test_milvus.py +++ b/tests/autorag/vectordb/test_milvus.py @@ -39,7 +39,7 @@ async def test_add_and_query_documents(milvus_instance): # Query documents queries = ["test document"] - result_ids, contents, scores = await milvus_instance.query(queries, top_k=2) + result_ids, scores, contents = await milvus_instance.query(queries, top_k=2) assert len(result_ids) == 1 assert len(contents) == 1 @@ -77,7 +77,8 @@ async def test_delete_documents(milvus_instance): # Query documents to ensure they are deleted queries = ["test document"] - result_ids, contents, scores = await milvus_instance.query(queries, top_k=2) + result_ids, scores, contents = await milvus_instance.query(queries, top_k=2) + assert len(result_ids[0]) == 1 assert len(contents[0]) == 1 assert len(scores[0]) == 1 diff --git a/tests/autorag/vectordb/test_weaviate.py b/tests/autorag/vectordb/test_weaviate.py index e86e58d99..238154009 100644 --- a/tests/autorag/vectordb/test_weaviate.py +++ b/tests/autorag/vectordb/test_weaviate.py @@ -37,10 +37,12 @@ async def test_add_and_query_documents(weaviate_instance): # Query documents queries = ["test document"] - contents, scores = await weaviate_instance.query(queries, top_k=2) + result_ids, scores, contents = await weaviate_instance.query(queries, top_k=2) + assert len(result_ids) == 1 assert len(contents) == 1 assert len(scores) == 1 + assert len(result_ids[0]) == 2 assert len(contents[0]) == 2 assert len(scores[0]) == 2 assert scores[0][0] > scores[0][1] @@ -73,7 +75,8 @@ async def test_delete_documents(weaviate_instance): # Query documents to ensure they are deleted queries = ["test document"] - contents, scores = await weaviate_instance.query(queries, top_k=2) + result_ids, scores, contents = await 
weaviate_instance.query(queries, top_k=2) + assert len(result_ids[0]) == 1 assert len(contents[0]) == 1 assert len(scores[0]) == 1