@@ -52,7 +52,11 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
52
52
53
53
54
54
def cosine_similarity (X : Matrix , Y : Matrix ) -> np .ndarray :
55
- """Row-wise cosine similarity between two equal-width matrices."""
55
+ """Row-wise cosine similarity between two equal-width matrices.
56
+
57
+ Raises:
58
+ ValueError: If X and Y do not have the same number of columns.
59
+ """
56
60
if len (X ) == 0 or len (Y ) == 0 :
57
61
return np .array ([])
58
62
@@ -80,7 +84,21 @@ def maximal_marginal_relevance(
80
84
lambda_mult : float = 0.5 ,
81
85
k : int = 4 ,
82
86
) -> List [int ]:
83
- """Calculate maximal marginal relevance."""
87
+ """Calculate maximal marginal relevance.
88
+
89
+ Args:
90
+ query_embedding: Query embedding.
91
+ embedding_list: List of embeddings to select from.
92
+ lambda_mult: Number between 0 and 1 that determines the degree
93
+ of diversity among the results with 0 corresponding
94
+ to maximum diversity and 1 to minimum diversity.
95
+ Defaults to 0.5.
96
+ k: Number of Documents to return. Defaults to 4.
97
+
98
+ Returns:
99
+ List of indices of embeddings selected by maximal marginal relevance.
100
+ """
101
+
84
102
if min (k , len (embedding_list )) <= 0 :
85
103
return []
86
104
if query_embedding .ndim == 1 :
@@ -136,8 +154,21 @@ def __init__(
136
154
relevance_score_fn : Optional [Callable [[float ], float ]] = None ,
137
155
create_collection_if_not_exists : Optional [bool ] = True ,
138
156
) -> None :
139
- """Initialize with a Chroma client."""
157
+ """Initialize with a Chroma client.
140
158
159
+ Args:
160
+ collection_name: Name of the collection to create.
161
+ embedding_function: Embedding class object. Used to embed texts.
162
+ persist_directory: Directory to persist the collection.
163
+ client_settings: Chroma client settings
164
+ collection_metadata: Collection configurations.
165
+ client: Chroma client. Documentation:
166
+ https://docs.trychroma.com/reference/js-client#class:-chromaclient
167
+ relevance_score_fn: Function to calculate relevance score from distance.
168
+ Used only in `similarity_search_with_relevance_scores`
169
+ create_collection_if_not_exists: Whether to create collection
170
+ if it doesn't exist. Defaults to True.
171
+ """
141
172
if client is not None :
142
173
self ._client_settings = client_settings
143
174
self ._client = client
@@ -204,7 +235,23 @@ def __query_collection(
204
235
where_document : Optional [Dict [str , str ]] = None ,
205
236
** kwargs : Any ,
206
237
) -> Union [List [Document ], chromadb .QueryResult ]:
207
- """Query the chroma collection."""
238
+ """Query the chroma collection.
239
+
240
+ Args:
241
+ query_texts: List of query texts.
242
+ query_embeddings: List of query embeddings.
243
+ n_results: Number of results to return. Defaults to 4.
244
+ where: dict used to filter results by metadata,
245
+ e.g. {"color" : "red", "price": 4.20}.
246
+ where_document: dict used to filter by the documents.
247
+ E.g. {"$contains": "hello"}.
248
+
249
+ Returns:
250
+ List of `n_results` nearest neighbor embeddings for provided
251
+ query_embeddings or query_texts.
252
+
253
+ See more: https://docs.trychroma.com/reference/py-collection#query
254
+ """
208
255
return self ._collection .query (
209
256
query_texts = query_texts ,
210
257
query_embeddings = query_embeddings , # type: ignore
@@ -229,12 +276,16 @@ def add_images(
229
276
"""Run more images through the embeddings and add to the vectorstore.
230
277
231
278
Args:
232
- uris List[str]: File path to the image.
233
- metadatas (Optional[List[dict]], optional): Optional list of metadatas.
234
- ids (Optional[List[str]], optional): Optional list of IDs.
279
+ uris: File path to the image.
280
+ metadatas: Optional list of metadatas.
281
+ When querying, you can filter on this metadata.
282
+ ids: Optional list of IDs.
235
283
236
284
Returns:
237
- List[str]: List of IDs of the added images.
285
+ List of IDs of the added images.
286
+
287
+ Raises:
288
+ ValueError: When metadata is incorrect.
238
289
"""
239
290
# Map from uris to b64 encoded strings
240
291
b64_texts = [self .encode_image (uri = uri ) for uri in uris ]
@@ -312,14 +363,18 @@ def add_texts(
312
363
"""Run more texts through the embeddings and add to the vectorstore.
313
364
314
365
Args:
315
- texts (Iterable[str]): Texts to add to the vectorstore.
316
- metadatas (Optional[List[dict]], optional): Optional list of metadatas.
317
- ids (Optional[List[str]], optional): Optional list of IDs.
366
+ texts: Texts to add to the vectorstore.
367
+ metadatas: Optional list of metadatas.
368
+ When querying, you can filter on this metadata.
369
+ ids: Optional list of IDs.
318
370
319
371
Returns:
320
- List[str]: List of IDs of the added texts.
372
+ List of IDs of the added texts.
373
+
374
+ Raises:
375
+ ValueError: When metadata is incorrect.
321
376
"""
322
- # TODO: Handle the case where the user doesn't provide ids on the Collection
377
+
323
378
if ids is None :
324
379
ids = [str (uuid .uuid4 ()) for _ in texts ]
325
380
embeddings = None
@@ -391,12 +446,12 @@ def similarity_search(
391
446
"""Run similarity search with Chroma.
392
447
393
448
Args:
394
- query (str) : Query text to search for.
395
- k (int) : Number of results to return. Defaults to 4.
396
- filter (Optional[Dict[str, str]]) : Filter by metadata. Defaults to None.
449
+ query: Query text to search for.
450
+ k: Number of results to return. Defaults to 4.
451
+ filter: Filter by metadata. Defaults to None.
397
452
398
453
Returns:
399
- List[Document]: List of documents most similar to the query text.
454
+ List of documents most similar to the query text.
400
455
"""
401
456
docs_and_scores = self .similarity_search_with_score (
402
457
query , k , filter = filter , ** kwargs
@@ -412,10 +467,14 @@ def similarity_search_by_vector(
412
467
** kwargs : Any ,
413
468
) -> List [Document ]:
414
469
"""Return docs most similar to embedding vector.
470
+
415
471
Args:
416
- embedding (List[float]): Embedding to look up documents similar to.
417
- k (int): Number of Documents to return. Defaults to 4.
418
- filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
472
+ embedding: Embedding to look up documents similar to.
473
+ k: Number of Documents to return. Defaults to 4.
474
+ filter: Filter by metadata. Defaults to None.
475
+ where_document: dict used to filter by the documents.
476
+ E.g. {"$contains": "hello"}.
477
+
419
478
Returns:
420
479
List of Documents most similar to the query vector.
421
480
"""
@@ -441,13 +500,14 @@ def similarity_search_by_vector_with_relevance_scores(
441
500
442
501
Args:
443
502
embedding (List[float]): Embedding to look up documents similar to.
444
- k (int): Number of Documents to return. Defaults to 4.
445
- filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
503
+ k: Number of Documents to return. Defaults to 4.
504
+ filter: Filter by metadata. Defaults to None.
505
+ where_document: dict used to filter by the documents.
506
+ E.g. {"$contains": "hello"}.
446
507
447
508
Returns:
448
- List[Tuple[Document, float]]: List of documents most similar to
449
- the query text and cosine distance in float for each.
450
- Lower score represents more similarity.
509
+ List of documents most similar to the query text and relevance score
510
+ in float for each. Lower score represents more similarity.
451
511
"""
452
512
results = self .__query_collection (
453
513
query_embeddings = embedding ,
@@ -469,14 +529,15 @@ def similarity_search_with_score(
469
529
"""Run similarity search with Chroma with distance.
470
530
471
531
Args:
472
- query (str): Query text to search for.
473
- k (int): Number of results to return. Defaults to 4.
474
- filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
532
+ query: Query text to search for.
533
+ k: Number of results to return. Defaults to 4.
534
+ filter: Filter by metadata. Defaults to None.
535
+ where_document: dict used to filter by the documents.
536
+ E.g. {"$contains": "hello"}.
475
537
476
538
Returns:
477
- List[Tuple[Document, float]]: List of documents most similar to
478
- the query text and cosine distance in float for each.
479
- Lower score represents more similarity.
539
+ List of documents most similar to the query text and
540
+ distance in float for each. Lower score represents more similarity.
480
541
"""
481
542
if self ._embedding_function is None :
482
543
results = self .__query_collection (
@@ -499,14 +560,21 @@ def similarity_search_with_score(
499
560
return _results_to_docs_and_scores (results )
500
561
501
562
def _select_relevance_score_fn (self ) -> Callable [[float ], float ]:
563
+ """Select the relevance score function based on the collection's distance metric.
564
+
565
+ The most similar documents will have the lowest relevance score. Default
566
+ relevance score function is euclidean distance. Distance metric must be
567
+ provided in `collection_metadata` during initialization of Chroma object.
568
+ Example: collection_metadata={"hnsw:space": "cosine"}. Available distance
569
+ metrics are: 'cosine', 'l2' and 'ip'.
570
+
571
+ Returns:
572
+ The relevance score function.
573
+
574
+ Raises:
575
+ ValueError: If the distance metric is not supported.
502
576
"""
503
- The 'correct' relevance function
504
- may differ depending on a few things, including:
505
- - the distance / similarity metric used by the VectorStore
506
- - the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
507
- - embedding dimensionality
508
- - etc.
509
- """
577
+
510
578
if self .override_relevance_score_fn :
511
579
return self .override_relevance_score_fn
512
580
@@ -541,18 +609,20 @@ def max_marginal_relevance_search_by_vector(
541
609
** kwargs : Any ,
542
610
) -> List [Document ]:
543
611
"""Return docs selected using the maximal marginal relevance.
612
+
544
613
Maximal marginal relevance optimizes for similarity to query AND diversity
545
614
among selected documents.
546
615
547
616
Args:
548
617
embedding: Embedding to look up documents similar to.
549
618
k: Number of Documents to return. Defaults to 4.
550
- fetch_k: Number of Documents to fetch to pass to MMR algorithm.
619
+ fetch_k: Number of Documents to fetch to pass to MMR algorithm. Defaults to
620
+ 20.
551
621
lambda_mult: Number between 0 and 1 that determines the degree
552
- of diversity among the results with 0 corresponding
553
- to maximum diversity and 1 to minimum diversity.
554
- Defaults to 0.5.
555
- filter (Optional[Dict[str, str]]) : Filter by metadata. Defaults to None.
622
+ of diversity among the results with 0 corresponding
623
+ to maximum diversity and 1 to minimum diversity.
624
+ Defaults to 0.5.
625
+ filter: Filter by metadata. Defaults to None.
556
626
557
627
Returns:
558
628
List of Documents selected by maximal marginal relevance.
@@ -600,26 +670,30 @@ def max_marginal_relevance_search(
600
670
of diversity among the results with 0 corresponding
601
671
to maximum diversity and 1 to minimum diversity.
602
672
Defaults to 0.5.
603
- filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
673
+ filter: Filter by metadata. Defaults to None.
674
+ where_document: dict used to filter by the documents.
675
+ E.g. {"$contains": "hello"}.
604
676
605
677
Returns:
606
678
List of Documents selected by maximal marginal relevance.
679
+
680
+ Raises:
681
+ ValueError: If the embedding function is not provided.
607
682
"""
608
683
if self ._embedding_function is None :
609
684
raise ValueError (
610
685
"For MMR search, you must specify an embedding function on" "creation."
611
686
)
612
687
613
688
embedding = self ._embedding_function .embed_query (query )
614
- docs = self .max_marginal_relevance_search_by_vector (
689
+ return self .max_marginal_relevance_search_by_vector (
615
690
embedding ,
616
691
k ,
617
692
fetch_k ,
618
693
lambda_mult = lambda_mult ,
619
694
filter = filter ,
620
695
where_document = where_document ,
621
696
)
622
- return docs
623
697
624
698
def delete_collection (self ) -> None :
625
699
"""Delete the collection."""
@@ -656,6 +730,9 @@ def get(
656
730
Can contain `"embeddings"`, `"metadatas"`, `"documents"`.
657
731
Ids are always included.
658
732
Defaults to `["metadatas", "documents"]`. Optional.
733
+
734
+ Returns:
735
+ A dict with the keys `"ids"`, `"embeddings"`, `"metadatas"`, `"documents"`.
659
736
"""
660
737
kwargs = {
661
738
"ids" : ids ,
@@ -674,8 +751,8 @@ def update_document(self, document_id: str, document: Document) -> None:
674
751
"""Update a document in the collection.
675
752
676
753
Args:
677
- document_id (str) : ID of the document to update.
678
- document (Document) : Document to update.
754
+ document_id: ID of the document to update.
755
+ document: Document to update.
679
756
"""
680
757
return self .update_documents ([document_id ], [document ])
681
758
@@ -684,8 +761,11 @@ def update_documents(self, ids: List[str], documents: List[Document]) -> None:
684
761
"""Update a document in the collection.
685
762
686
763
Args:
687
- ids (List[str]): List of ids of the document to update.
688
- documents (List[Document]): List of documents to update.
764
+ ids: List of ids of the document to update.
765
+ documents: List of documents to update.
766
+
767
+ Raises:
768
+ ValueError: If the embedding function is not provided.
689
769
"""
690
770
text = [document .page_content for document in documents ]
691
771
metadata = [document .metadata for document in documents ]
@@ -741,14 +821,14 @@ def from_texts(
741
821
Otherwise, the data will be ephemeral in-memory.
742
822
743
823
Args:
744
- texts (List[str]) : List of texts to add to the collection.
745
- collection_name (str) : Name of the collection to create.
746
- persist_directory (Optional[str]) : Directory to persist the collection.
747
- embedding (Optional[Embeddings]) : Embedding function. Defaults to None.
748
- metadatas (Optional[List[dict]]) : List of metadatas. Defaults to None.
749
- ids (Optional[List[str]]) : List of document IDs. Defaults to None.
750
- client_settings (Optional[chromadb.config.Settings]) : Chroma client settings
751
- collection_metadata (Optional[Dict]) : Collection configurations.
824
+ texts: List of texts to add to the collection.
825
+ collection_name: Name of the collection to create.
826
+ persist_directory: Directory to persist the collection.
827
+ embedding: Embedding function. Defaults to None.
828
+ metadatas: List of metadatas. Defaults to None.
829
+ ids: List of document IDs. Defaults to None.
830
+ client_settings: Chroma client settings
831
+ collection_metadata: Collection configurations.
752
832
Defaults to None.
753
833
754
834
Returns:
@@ -804,13 +884,13 @@ def from_documents(
804
884
Otherwise, the data will be ephemeral in-memory.
805
885
806
886
Args:
807
- collection_name (str) : Name of the collection to create.
808
- persist_directory (Optional[str]) : Directory to persist the collection.
809
- ids (Optional[List[str]]) : List of document IDs. Defaults to None.
810
- documents (List[Document]) : List of documents to add to the vectorstore.
811
- embedding (Optional[Embeddings]) : Embedding function. Defaults to None.
812
- client_settings (Optional[chromadb.config.Settings]) : Chroma client settings
813
- collection_metadata (Optional[Dict]) : Collection configurations.
887
+ collection_name: Name of the collection to create.
888
+ persist_directory: Directory to persist the collection.
889
+ ids: List of document IDs. Defaults to None.
890
+ documents: List of documents to add to the vectorstore.
891
+ embedding: Embedding function. Defaults to None.
892
+ client_settings: Chroma client settings
893
+ collection_metadata: Collection configurations.
814
894
Defaults to None.
815
895
816
896
Returns:
0 commit comments