diff --git a/src/openparse/processing/__init__.py b/src/openparse/processing/__init__.py
index 8375714..21add97 100644
--- a/src/openparse/processing/__init__.py
+++ b/src/openparse/processing/__init__.py
@@ -15,7 +15,11 @@
     NoOpIngestionPipeline,
     SemanticIngestionPipeline,
 )
-from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings
+from .semantic_transforms import (
+    AzureOpenAIEmbeddings,
+    CombineNodesSemantically,
+    OpenAIEmbeddings,
+)
 
 __all__ = [
     "ProcessingStep",
@@ -33,4 +37,5 @@
     "RemoveNodesBelowNTokens",
     "CombineNodesSemantically",
     "OpenAIEmbeddings",
+    "AzureOpenAIEmbeddings",
 ]
diff --git a/src/openparse/processing/ingest.py b/src/openparse/processing/ingest.py
index a08f84b..2e28161 100644
--- a/src/openparse/processing/ingest.py
+++ b/src/openparse/processing/ingest.py
@@ -17,6 +17,7 @@
     CombineNodesSemantically,
     EmbeddingModel,
     OpenAIEmbeddings,
+    AzureOpenAIEmbeddings,
 )
 from openparse.schemas import Node
 
@@ -97,12 +98,32 @@ class SemanticIngestionPipeline(IngestionPipeline):
 
     def __init__(
         self,
         openai_api_key: str,
         model: EmbeddingModel = "text-embedding-3-large",
         min_tokens: int = consts.TOKENIZATION_LOWER_LIMIT,
         max_tokens: int = consts.TOKENIZATION_UPPER_LIMIT,
+        api_endpoint: "str | None" = None,
+        deployment: "str | None" = None,
+        api_version: str = "2024-02-15-preview",
     ) -> None:
-        embedding_client = OpenAIEmbeddings(api_key=openai_api_key, model=model)
+        # Azure OpenAI is used when `api_endpoint` is given; `openai_api_key`
+        # then carries the Azure key and `deployment` replaces `model`. The Azure
+        # parameters are optional and come last so existing OpenAI-only callers
+        # keep working; their annotations are quoted so that no extra typing
+        # import is needed.
+        if api_endpoint is not None:
+            if deployment is None:
+                raise ValueError(
+                    "deployment is required when api_endpoint is provided"
+                )
+            embedding_client = AzureOpenAIEmbeddings(
+                api_key=openai_api_key,
+                api_endpoint=api_endpoint,
+                deployment=deployment,
+                api_version=api_version,
+            )
+        else:
+            embedding_client = OpenAIEmbeddings(api_key=openai_api_key, model=model)
 
         self.transformations = [
             RemoveTextInsideTables(),
diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py
index 8369035..d66b7ee 100644
--- a/src/openparse/processing/semantic_transforms.py
+++ b/src/openparse/processing/semantic_transforms.py
@@ -1,3 +1,4 @@
+from abc import ABC, abstractmethod
 from typing import List, Literal, Union
 
 import numpy as np
@@ -14,10 +15,36 @@
 def cosine_similarity(
     a: Union[np.ndarray, List[float]], b: Union[np.ndarray, List[float]]
 ) -> float:
+    """
+    Calculate the cosine similarity between two vectors.
+
+    Cosine similarity measures the cosine of the angle between two non-zero
+    vectors of an inner product space.
+
+    Parameters:
+        a (Union[np.ndarray, List[float]]): The first vector.
+        b (Union[np.ndarray, List[float]]): The second vector.
+
+    Returns:
+        float: The cosine similarity between `a` and `b`, ranging from -1
+            (exactly opposite) through 0 (orthogonal) to 1 (same direction).
+    """
     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
 
 
-class OpenAIEmbeddings:
+class BaseEmbeddings(ABC):
+    """
+    Interface for the embedding clients accepted by CombineNodesSemantically.
+    """
+
+    @abstractmethod
+    def embed_many(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate one embedding per entry of `texts`.
+        """
+
+
+class OpenAIEmbeddings(BaseEmbeddings):
     def __init__(
         self,
         model: EmbeddingModel,
@@ -68,7 +95,69 @@ def _create_client(self):
             ) from err
         return OpenAI(api_key=self.api_key)
 
 
+class AzureOpenAIEmbeddings(BaseEmbeddings):
+    def __init__(
+        self,
+        api_key: str,
+        api_endpoint: str,
+        deployment: str,
+        api_version: str = "2024-02-15-preview",
+        batch_size: int = 256,
+    ):
+        """
+        Used to generate embeddings for Nodes with Azure OpenAI.
+
+        Args:
+            api_key (str): Your Azure OpenAI API key.
+            api_endpoint (str): The Azure endpoint to use.
+            deployment (str): The deployment to use.
+            api_version (str): The version of the API to use.
+            batch_size (int): The number of texts to process in each api call.
+        """
+        self.api_key = api_key
+        self.api_endpoint = api_endpoint
+        self.api_version = api_version
+        self.deployment = deployment
+        self.batch_size = batch_size
+        self.client = self._create_client()
+
+    def embed_many(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate embeddings for a list of texts in batches.
+
+        Args:
+            texts (list[str]): The list of texts to embed.
+
+        Returns:
+            List[List[float]]: A list of embeddings.
+        """
+        res = []
+        for i in range(0, len(texts), self.batch_size):
+            batch_texts = texts[i : i + self.batch_size]
+            api_resp = self.client.embeddings.create(
+                input=batch_texts, model=self.deployment
+            )
+            batch_res = [val.embedding for val in api_resp.data]
+            res.extend(batch_res)
+
+        return res
+
+    def _create_client(self):
+        try:
+            from openai import AzureOpenAI
+        except ImportError as err:
+            raise ImportError(
+                "You need to install the openai package to use this feature."
+            ) from err
+        return AzureOpenAI(
+            api_key=self.api_key,
+            azure_endpoint=self.api_endpoint,
+            azure_deployment=self.deployment,
+            api_version=self.api_version,
+        )
+
+
 class CombineNodesSemantically(ProcessingStep):
     """
     Combines nodes that are semantically related.
@@ -76,7 +165,7 @@ class CombineNodesSemantically(ProcessingStep):
 
     def __init__(
         self,
-        embedding_client: OpenAIEmbeddings,
+        embedding_client: BaseEmbeddings,
         min_similarity: float,
         max_tokens: int,
     ):