Skip to content

Commit 40c81ce

Browse files
klaudialemiechinthornw
authored and committed
docs: Chroma docstrings update (#22001)
Thank you for contributing to LangChain! - [X] **PR title**: "docs: Chroma docstrings update" - Where "package" is whichever of langchain, community, core, experimental, etc. is being modified. Use "docs: ..." for purely docs changes, "templates: ..." for template changes, "infra: ..." for CI changes. - Example: "community: add foobar LLM" - [X] **PR message**: - **Description:** Added and updated Chroma docstrings - **Issue:** #21983 - [X] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - only docs - [X] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
1 parent a4a59f7 commit 40c81ce

File tree

1 file changed

+145
-65
lines changed

1 file changed

+145
-65
lines changed

libs/partners/chroma/langchain_chroma/vectorstores.py

Lines changed: 145 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,11 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
5252

5353

5454
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
55-
"""Row-wise cosine similarity between two equal-width matrices."""
55+
"""Row-wise cosine similarity between two equal-width matrices.
56+
57+
Raises:
58+
ValueError: If X and Y do not have the same number of columns.
59+
"""
5660
if len(X) == 0 or len(Y) == 0:
5761
return np.array([])
5862

@@ -80,7 +84,21 @@ def maximal_marginal_relevance(
8084
lambda_mult: float = 0.5,
8185
k: int = 4,
8286
) -> List[int]:
83-
"""Calculate maximal marginal relevance."""
87+
"""Calculate maximal marginal relevance.
88+
89+
Args:
90+
query_embedding: Query embedding.
91+
embedding_list: List of embeddings to select from.
92+
lambda_mult: Number between 0 and 1 that determines the degree
93+
of diversity among the results with 0 corresponding
94+
to maximum diversity and 1 to minimum diversity.
95+
Defaults to 0.5.
96+
k: Number of Documents to return. Defaults to 4.
97+
98+
Returns:
99+
List of indices of embeddings selected by maximal marginal relevance.
100+
"""
101+
84102
if min(k, len(embedding_list)) <= 0:
85103
return []
86104
if query_embedding.ndim == 1:
@@ -136,8 +154,21 @@ def __init__(
136154
relevance_score_fn: Optional[Callable[[float], float]] = None,
137155
create_collection_if_not_exists: Optional[bool] = True,
138156
) -> None:
139-
"""Initialize with a Chroma client."""
157+
"""Initialize with a Chroma client.
140158
159+
Args:
160+
collection_name: Name of the collection to create.
161+
embedding_function: Embedding class object. Used to embed texts.
162+
persist_directory: Directory to persist the collection.
163+
client_settings: Chroma client settings
164+
collection_metadata: Collection configurations.
165+
client: Chroma client. Documentation:
166+
https://docs.trychroma.com/reference/js-client#class:-chromaclient
167+
relevance_score_fn: Function to calculate relevance score from distance.
168+
Used only in `similarity_search_with_relevance_scores`
169+
create_collection_if_not_exists: Whether to create collection
170+
if it doesn't exist. Defaults to True.
171+
"""
141172
if client is not None:
142173
self._client_settings = client_settings
143174
self._client = client
@@ -204,7 +235,23 @@ def __query_collection(
204235
where_document: Optional[Dict[str, str]] = None,
205236
**kwargs: Any,
206237
) -> Union[List[Document], chromadb.QueryResult]:
207-
"""Query the chroma collection."""
238+
"""Query the chroma collection.
239+
240+
Args:
241+
query_texts: List of query texts.
242+
query_embeddings: List of query embeddings.
243+
n_results: Number of results to return. Defaults to 4.
244+
where: dict used to filter results by
245+
e.g. {"color" : "red", "price": 4.20}.
246+
where_document: dict used to filter by the documents.
247+
E.g. {$contains: {"text": "hello"}}.
248+
249+
Returns:
250+
List of `n_results` nearest neighbor embeddings for provided
251+
query_embeddings or query_texts.
252+
253+
See more: https://docs.trychroma.com/reference/py-collection#query
254+
"""
208255
return self._collection.query(
209256
query_texts=query_texts,
210257
query_embeddings=query_embeddings, # type: ignore
@@ -229,12 +276,16 @@ def add_images(
229276
"""Run more images through the embeddings and add to the vectorstore.
230277
231278
Args:
232-
uris List[str]: File path to the image.
233-
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
234-
ids (Optional[List[str]], optional): Optional list of IDs.
279+
uris: File path to the image.
280+
metadatas: Optional list of metadatas.
281+
When querying, you can filter on this metadata.
282+
ids: Optional list of IDs.
235283
236284
Returns:
237-
List[str]: List of IDs of the added images.
285+
List of IDs of the added images.
286+
287+
Raises:
288+
ValueError: When metadata is incorrect.
238289
"""
239290
# Map from uris to b64 encoded strings
240291
b64_texts = [self.encode_image(uri=uri) for uri in uris]
@@ -312,14 +363,18 @@ def add_texts(
312363
"""Run more texts through the embeddings and add to the vectorstore.
313364
314365
Args:
315-
texts (Iterable[str]): Texts to add to the vectorstore.
316-
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
317-
ids (Optional[List[str]], optional): Optional list of IDs.
366+
texts: Texts to add to the vectorstore.
367+
metadatas: Optional list of metadatas.
368+
When querying, you can filter on this metadata.
369+
ids: Optional list of IDs.
318370
319371
Returns:
320-
List[str]: List of IDs of the added texts.
372+
List of IDs of the added texts.
373+
374+
Raises:
375+
ValueError: When metadata is incorrect.
321376
"""
322-
# TODO: Handle the case where the user doesn't provide ids on the Collection
377+
323378
if ids is None:
324379
ids = [str(uuid.uuid4()) for _ in texts]
325380
embeddings = None
@@ -391,12 +446,12 @@ def similarity_search(
391446
"""Run similarity search with Chroma.
392447
393448
Args:
394-
query (str): Query text to search for.
395-
k (int): Number of results to return. Defaults to 4.
396-
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
449+
query: Query text to search for.
450+
k: Number of results to return. Defaults to 4.
451+
filter: Filter by metadata. Defaults to None.
397452
398453
Returns:
399-
List[Document]: List of documents most similar to the query text.
454+
List of documents most similar to the query text.
400455
"""
401456
docs_and_scores = self.similarity_search_with_score(
402457
query, k, filter=filter, **kwargs
@@ -412,10 +467,14 @@ def similarity_search_by_vector(
412467
**kwargs: Any,
413468
) -> List[Document]:
414469
"""Return docs most similar to embedding vector.
470+
415471
Args:
416-
embedding (List[float]): Embedding to look up documents similar to.
417-
k (int): Number of Documents to return. Defaults to 4.
418-
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
472+
embedding: Embedding to look up documents similar to.
473+
k: Number of Documents to return. Defaults to 4.
474+
filter: Filter by metadata. Defaults to None.
475+
where_document: dict used to filter by the documents.
476+
E.g. {$contains: {"text": "hello"}}.
477+
419478
Returns:
420479
List of Documents most similar to the query vector.
421480
"""
@@ -441,13 +500,14 @@ def similarity_search_by_vector_with_relevance_scores(
441500
442501
Args:
443502
embedding: Embedding to look up documents similar to.
444-
k (int): Number of Documents to return. Defaults to 4.
445-
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
503+
k: Number of Documents to return. Defaults to 4.
504+
filter: Filter by metadata. Defaults to None.
505+
where_document: dict used to filter by the documents.
506+
E.g. {$contains: {"text": "hello"}}.
446507
447508
Returns:
448-
List[Tuple[Document, float]]: List of documents most similar to
449-
the query text and cosine distance in float for each.
450-
Lower score represents more similarity.
509+
List of documents most similar to the query text and relevance score
510+
in float for each. Lower score represents more similarity.
451511
"""
452512
results = self.__query_collection(
453513
query_embeddings=embedding,
@@ -469,14 +529,15 @@ def similarity_search_with_score(
469529
"""Run similarity search with Chroma with distance.
470530
471531
Args:
472-
query (str): Query text to search for.
473-
k (int): Number of results to return. Defaults to 4.
474-
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
532+
query: Query text to search for.
533+
k: Number of results to return. Defaults to 4.
534+
filter: Filter by metadata. Defaults to None.
535+
where_document: dict used to filter by the documents.
536+
E.g. {$contains: {"text": "hello"}}.
475537
476538
Returns:
477-
List[Tuple[Document, float]]: List of documents most similar to
478-
the query text and cosine distance in float for each.
479-
Lower score represents more similarity.
539+
List of documents most similar to the query text and
540+
distance in float for each. Lower score represents more similarity.
480541
"""
481542
if self._embedding_function is None:
482543
results = self.__query_collection(
@@ -499,14 +560,21 @@ def similarity_search_with_score(
499560
return _results_to_docs_and_scores(results)
500561

501562
def _select_relevance_score_fn(self) -> Callable[[float], float]:
563+
"""Select the relevance score function based on collections distance metric.
564+
565+
The most similar documents will have the lowest relevance score. Default
566+
relevance score function is euclidean distance. Distance metric must be
567+
provided in `collection_metadata` during initialization of the Chroma object.
568+
Example: collection_metadata={"hnsw:space": "cosine"}. Available distance
569+
metrics are: 'cosine', 'l2' and 'ip'.
570+
571+
Returns:
572+
The relevance score function.
573+
574+
Raises:
575+
ValueError: If the distance metric is not supported.
502576
"""
503-
The 'correct' relevance function
504-
may differ depending on a few things, including:
505-
- the distance / similarity metric used by the VectorStore
506-
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
507-
- embedding dimensionality
508-
- etc.
509-
"""
577+
510578
if self.override_relevance_score_fn:
511579
return self.override_relevance_score_fn
512580

@@ -541,18 +609,20 @@ def max_marginal_relevance_search_by_vector(
541609
**kwargs: Any,
542610
) -> List[Document]:
543611
"""Return docs selected using the maximal marginal relevance.
612+
544613
Maximal marginal relevance optimizes for similarity to query AND diversity
545614
among selected documents.
546615
547616
Args:
548617
embedding: Embedding to look up documents similar to.
549618
k: Number of Documents to return. Defaults to 4.
550-
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
619+
fetch_k: Number of Documents to fetch to pass to MMR algorithm. Defaults to
620+
20.
551621
lambda_mult: Number between 0 and 1 that determines the degree
552-
of diversity among the results with 0 corresponding
553-
to maximum diversity and 1 to minimum diversity.
554-
Defaults to 0.5.
555-
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
622+
of diversity among the results with 0 corresponding
623+
to maximum diversity and 1 to minimum diversity.
624+
Defaults to 0.5.
625+
filter: Filter by metadata. Defaults to None.
556626
557627
Returns:
558628
List of Documents selected by maximal marginal relevance.
@@ -600,26 +670,30 @@ def max_marginal_relevance_search(
600670
of diversity among the results with 0 corresponding
601671
to maximum diversity and 1 to minimum diversity.
602672
Defaults to 0.5.
603-
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
673+
filter: Filter by metadata. Defaults to None.
674+
where_document: dict used to filter by the documents.
675+
E.g. {$contains: {"text": "hello"}}.
604676
605677
Returns:
606678
List of Documents selected by maximal marginal relevance.
679+
680+
Raises:
681+
ValueError: If the embedding function is not provided.
607682
"""
608683
if self._embedding_function is None:
609684
raise ValueError(
610685
"For MMR search, you must specify an embedding function on" "creation."
611686
)
612687

613688
embedding = self._embedding_function.embed_query(query)
614-
docs = self.max_marginal_relevance_search_by_vector(
689+
return self.max_marginal_relevance_search_by_vector(
615690
embedding,
616691
k,
617692
fetch_k,
618693
lambda_mult=lambda_mult,
619694
filter=filter,
620695
where_document=where_document,
621696
)
622-
return docs
623697

624698
def delete_collection(self) -> None:
625699
"""Delete the collection."""
@@ -656,6 +730,9 @@ def get(
656730
Can contain `"embeddings"`, `"metadatas"`, `"documents"`.
657731
Ids are always included.
658732
Defaults to `["metadatas", "documents"]`. Optional.
733+
734+
Returns:
735+
A dict with the keys `"ids"`, `"embeddings"`, `"metadatas"`, `"documents"`.
659736
"""
660737
kwargs = {
661738
"ids": ids,
@@ -674,8 +751,8 @@ def update_document(self, document_id: str, document: Document) -> None:
674751
"""Update a document in the collection.
675752
676753
Args:
677-
document_id (str): ID of the document to update.
678-
document (Document): Document to update.
754+
document_id: ID of the document to update.
755+
document: Document to update.
679756
"""
680757
return self.update_documents([document_id], [document])
681758

@@ -684,8 +761,11 @@ def update_documents(self, ids: List[str], documents: List[Document]) -> None:
684761
"""Update a document in the collection.
685762
686763
Args:
687-
ids (List[str]): List of ids of the document to update.
688-
documents (List[Document]): List of documents to update.
764+
ids: List of ids of the document to update.
765+
documents: List of documents to update.
766+
767+
Raises:
768+
ValueError: If the embedding function is not provided.
689769
"""
690770
text = [document.page_content for document in documents]
691771
metadata = [document.metadata for document in documents]
@@ -741,14 +821,14 @@ def from_texts(
741821
Otherwise, the data will be ephemeral in-memory.
742822
743823
Args:
744-
texts (List[str]): List of texts to add to the collection.
745-
collection_name (str): Name of the collection to create.
746-
persist_directory (Optional[str]): Directory to persist the collection.
747-
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
748-
metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
749-
ids (Optional[List[str]]): List of document IDs. Defaults to None.
750-
client_settings (Optional[chromadb.config.Settings]): Chroma client settings
751-
collection_metadata (Optional[Dict]): Collection configurations.
824+
texts: List of texts to add to the collection.
825+
collection_name: Name of the collection to create.
826+
persist_directory: Directory to persist the collection.
827+
embedding: Embedding function. Defaults to None.
828+
metadatas: List of metadatas. Defaults to None.
829+
ids: List of document IDs. Defaults to None.
830+
client_settings: Chroma client settings
831+
collection_metadata: Collection configurations.
752832
Defaults to None.
753833
754834
Returns:
@@ -804,13 +884,13 @@ def from_documents(
804884
Otherwise, the data will be ephemeral in-memory.
805885
806886
Args:
807-
collection_name (str): Name of the collection to create.
808-
persist_directory (Optional[str]): Directory to persist the collection.
809-
ids (Optional[List[str]]): List of document IDs. Defaults to None.
810-
documents (List[Document]): List of documents to add to the vectorstore.
811-
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
812-
client_settings (Optional[chromadb.config.Settings]): Chroma client settings
813-
collection_metadata (Optional[Dict]): Collection configurations.
887+
collection_name: Name of the collection to create.
888+
persist_directory: Directory to persist the collection.
889+
ids: List of document IDs. Defaults to None.
890+
documents: List of documents to add to the vectorstore.
891+
embedding: Embedding function. Defaults to None.
892+
client_settings: Chroma client settings
893+
collection_metadata: Collection configurations.
814894
Defaults to None.
815895
816896
Returns:

0 commit comments

Comments
 (0)