From 741f8cdf2ac4c490613a66fdc3c25bc6803d4ca9 Mon Sep 17 00:00:00 2001 From: Michael Anckaert Date: Mon, 27 Nov 2023 10:45:16 +0100 Subject: [PATCH 1/5] Add AzureOpenAIEncoder --- pinecone_text/dense/openai_encoder.py | 73 +++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/pinecone_text/dense/openai_encoder.py b/pinecone_text/dense/openai_encoder.py index 841a892..1987f97 100644 --- a/pinecone_text/dense/openai_encoder.py +++ b/pinecone_text/dense/openai_encoder.py @@ -1,3 +1,4 @@ +import os from typing import Union, List, Any, Optional from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder @@ -76,3 +77,75 @@ def _encode( if isinstance(texts, str): return response.data[0].embedding return [result.embedding for result in response.data] + + +class AzureOpenAIEncoder(BaseDenseEncoder): + """ + OpenAI's text embedding wrapper. See https://platform.openai.com/docs/guides/embeddings + + Note: You should provide an API key and organization in the environment variables OPENAI_API_KEY and OPENAI_ORG. + Or you can pass them as arguments to the constructor as `api_key` and `organization`. + """ + + def __init__( + self, + model_name: str = "text-embedding-ada-002", + api_key: Optional[str] = None, + organization: Optional[str] = None, + base_url: Optional[str] = None, + **kwargs: Any, + ): + """ + Initialize the OpenAI encoder. + + :param model_name: The name of the embedding model to use. See https://beta.openai.com/docs/api-reference/embeddings + :param kwargs: Additional arguments to pass to the underlying openai client. See https://github.com/openai/openai-python + """ + if not _openai_installed: + raise ImportError( + "Failed to import openai. 
Make sure you install openai extra " + "dependencies by running: " + "`pip install pinecone-text[openai]" + ) + self._model_name = model_name + self._client = openai.AzureOpenAI( + api_key=api_key, + api_version="2023-05-15", + azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), + base_url=base_url, + **kwargs, + ) + + def encode_documents( + self, texts: Union[str, List[str]] + ) -> Union[List[float], List[List[float]]]: + return self._encode(texts) + + def encode_queries( + self, texts: Union[str, List[str]] + ) -> Union[List[float], List[List[float]]]: + return self._encode(texts) + + def _encode( + self, texts: Union[str, List[str]] + ) -> Union[List[float], List[List[float]]]: + if isinstance(texts, str): + texts_input = [texts] + elif isinstance(texts, list): + texts_input = texts + else: + raise ValueError( + f"texts must be a string or list of strings, got: {type(texts)}" + ) + + try: + response = self._client.embeddings.create( + input=texts_input, model=self._model_name + ) + except OpenAIError as e: + # TODO: consider wrapping external provider errors + raise e + + if isinstance(texts, str): + return response.data[0].embedding + return [result.embedding for result in response.data] From 3b3c4db04ac74ae74cf52a22fedd50f88d5dd53b Mon Sep 17 00:00:00 2001 From: Michael Anckaert Date: Mon, 27 Nov 2023 11:19:07 +0100 Subject: [PATCH 2/5] Inherit from OpenAIEncoder for simplicity --- pinecone_text/dense/openai_encoder.py | 44 +++------------------------ 1 file changed, 5 insertions(+), 39 deletions(-) diff --git a/pinecone_text/dense/openai_encoder.py b/pinecone_text/dense/openai_encoder.py index 1987f97..ee9ac3e 100644 --- a/pinecone_text/dense/openai_encoder.py +++ b/pinecone_text/dense/openai_encoder.py @@ -79,19 +79,19 @@ def _encode( return [result.embedding for result in response.data] -class AzureOpenAIEncoder(BaseDenseEncoder): +class AzureOpenAIEncoder(OpenAIEncoder): """ - OpenAI's text embedding wrapper. 
See https://platform.openai.com/docs/guides/embeddings + Azure OpenAI's text embedding wrapper. + See https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/understand-embeddings - Note: You should provide an API key and organization in the environment variables OPENAI_API_KEY and OPENAI_ORG. - Or you can pass them as arguments to the constructor as `api_key` and `organization`. + Note: You should provide an API key in the environment variables AZURE_OPENAI_API_KEY. + Or you can pass it as an arguments to the constructor as `api_key`. """ def __init__( self, model_name: str = "text-embedding-ada-002", api_key: Optional[str] = None, - organization: Optional[str] = None, base_url: Optional[str] = None, **kwargs: Any, ): @@ -115,37 +115,3 @@ def __init__( base_url=base_url, **kwargs, ) - - def encode_documents( - self, texts: Union[str, List[str]] - ) -> Union[List[float], List[List[float]]]: - return self._encode(texts) - - def encode_queries( - self, texts: Union[str, List[str]] - ) -> Union[List[float], List[List[float]]]: - return self._encode(texts) - - def _encode( - self, texts: Union[str, List[str]] - ) -> Union[List[float], List[List[float]]]: - if isinstance(texts, str): - texts_input = [texts] - elif isinstance(texts, list): - texts_input = texts - else: - raise ValueError( - f"texts must be a string or list of strings, got: {type(texts)}" - ) - - try: - response = self._client.embeddings.create( - input=texts_input, model=self._model_name - ) - except OpenAIError as e: - # TODO: consider wrapping external provider errors - raise e - - if isinstance(texts, str): - return response.data[0].embedding - return [result.embedding for result in response.data] From 5507b800181cfb66d4bc7a854679976a72c82b1d Mon Sep 17 00:00:00 2001 From: Michael Anckaert Date: Fri, 15 Dec 2023 11:42:04 +0100 Subject: [PATCH 3/5] Batch request size to Azure OpenAI --- pinecone_text/dense/openai_encoder.py | 28 ++++++++++++++++----------- 1 file changed, 17 insertions(+), 
11 deletions(-) diff --git a/pinecone_text/dense/openai_encoder.py b/pinecone_text/dense/openai_encoder.py index ee9ac3e..ce4a655 100644 --- a/pinecone_text/dense/openai_encoder.py +++ b/pinecone_text/dense/openai_encoder.py @@ -66,17 +66,23 @@ def _encode( f"texts must be a string or list of strings, got: {type(texts)}" ) - try: - response = self._client.embeddings.create( - input=texts_input, model=self._model_name - ) - except OpenAIError as e: - # TODO: consider wrapping external provider errors - raise e - - if isinstance(texts, str): - return response.data[0].embedding - return [result.embedding for result in response.data] + batch_size = 16 # Azure OpenAI limit as of 2023-11-27 + result = [] + for i in range(0, len(texts), batch_size): + batch = texts[i : i + batch_size] + try: + response = self._client.embeddings.create( + input=batch, model=self._model_name + ) + except OpenAIError as e: + # TODO: consider wrapping external provider errors + raise e + + if isinstance(batch, str): + result.extend(response.data[0].embedding) + result.extend([result.embedding for result in response.data]) + + return result class AzureOpenAIEncoder(OpenAIEncoder): From f57e8c8dfa35e0f50d972cec4fd8a79482803096 Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Wed, 10 Jan 2024 17:11:18 +0200 Subject: [PATCH 4/5] add tests to azure openai encoder --- .github/workflows/CI.yaml | 5 + pinecone_text/dense/__init__.py | 2 +- pinecone_text/dense/openai_encoder.py | 133 +++++++++++++------------- pyproject.toml | 2 +- tests/system/test_openai_encoder.py | 23 +++-- tests/unit/test_openai_encoder.py | 67 +++++++------ 6 files changed, 127 insertions(+), 105 deletions(-) diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml index 68ae559..423c584 100644 --- a/.github/workflows/CI.yaml +++ b/.github/workflows/CI.yaml @@ -46,4 +46,9 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} CO_API_KEY: ${{ secrets.CO_API_KEY }} + AZURE_OPENAI_API_KEY: ${{ 
secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + OPENAI_API_VERSION: ${{ secrets.OPENAI_API_VERSION }} + EMBEDDINGS_AZURE_OPENAI_DEPLOYMENT_NAME: ${{ secrets.EMBEDDINGS_AZURE_OPENAI_DEPLOYMENT_NAME }} + run: poetry run pytest tests/system diff --git a/pinecone_text/dense/__init__.py b/pinecone_text/dense/__init__.py index ebafef6..0afe497 100644 --- a/pinecone_text/dense/__init__.py +++ b/pinecone_text/dense/__init__.py @@ -1,3 +1,3 @@ from .cohere_encoder import CohereEncoder -from .openai_encoder import OpenAIEncoder +from .openai_encoder import OpenAIEncoder, AzureOpenAIEncoder from .sentence_transformer_encoder import SentenceTransformerEncoder diff --git a/pinecone_text/dense/openai_encoder.py b/pinecone_text/dense/openai_encoder.py index ce4a655..0c0a377 100644 --- a/pinecone_text/dense/openai_encoder.py +++ b/pinecone_text/dense/openai_encoder.py @@ -15,24 +15,31 @@ class OpenAIEncoder(BaseDenseEncoder): """ OpenAI's text embedding wrapper. See https://platform.openai.com/docs/guides/embeddings - Note: You should provide an API key and organization in the environment variables OPENAI_API_KEY and OPENAI_ORG. - Or you can pass them as arguments to the constructor as `api_key` and `organization`. - """ + Note: this method reflects the OpenAI client initialization behaviour (See https://github.com/openai/openai-python/blob/main/src/openai/_client.py) + On initialization, You may explicitly pass any argument that the OpenAI client accepts, or use the following environment variables: + + - `OPENAI_API_KEY` as `api_key` + - `OPENAI_ORG_ID` as `organization` + - `OPENAI_BASE_URL` as `base_url` + + Example: + Using environment variables: + >>> import os + >>> from pinecone_text.dense import OpenAIEncoder + >>> os.environ['OPENAI_API_KEY'] = "sk-..." 
+ >>> encoder = OpenAIEncoder() + >>> encoder.encode_documents(["some text", "some other text"]) + + Passing arguments explicitly: + >>> from pinecone_text.dense import OpenAIEncoder + >>> encoder = OpenAIEncoder(api_key="sk-...") + """ # noqa: E501 def __init__( self, model_name: str = "text-embedding-ada-002", - api_key: Optional[str] = None, - organization: Optional[str] = None, - base_url: Optional[str] = None, **kwargs: Any, ): - """ - Initialize the OpenAI encoder. - - :param model_name: The name of the embedding model to use. See https://beta.openai.com/docs/api-reference/embeddings - :param kwargs: Additional arguments to pass to the underlying openai client. See https://github.com/openai/openai-python - """ if not _openai_installed: raise ImportError( "Failed to import openai. Make sure you install openai extra " @@ -40,9 +47,11 @@ def __init__( "`pip install pinecone-text[openai]" ) self._model_name = model_name - self._client = openai.OpenAI( - api_key=api_key, organization=organization, base_url=base_url, **kwargs - ) + self._client = self._create_client(**kwargs) + + @staticmethod + def _create_client(**kwargs: Any) -> Union[openai.OpenAI, openai.AzureOpenAI]: + return openai.OpenAI(**kwargs) def encode_documents( self, texts: Union[str, List[str]] @@ -66,58 +75,52 @@ def _encode( f"texts must be a string or list of strings, got: {type(texts)}" ) - batch_size = 16 # Azure OpenAI limit as of 2023-11-27 - result = [] - for i in range(0, len(texts), batch_size): - batch = texts[i : i + batch_size] - try: - response = self._client.embeddings.create( - input=batch, model=self._model_name - ) - except OpenAIError as e: - # TODO: consider wrapping external provider errors - raise e - - if isinstance(batch, str): - result.extend(response.data[0].embedding) - result.extend([result.embedding for result in response.data]) + try: + response = self._client.embeddings.create( + input=texts_input, model=self._model_name + ) + except OpenAIError as e: + # TODO: 
consider wrapping external provider errors + raise e - return result + if isinstance(texts, str): + return response.data[0].embedding + return [result.embedding for result in response.data] class AzureOpenAIEncoder(OpenAIEncoder): """ - Azure OpenAI's text embedding wrapper. - See https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/understand-embeddings - - Note: You should provide an API key in the environment variables AZURE_OPENAI_API_KEY. - Or you can pass it as an arguments to the constructor as `api_key`. - """ - - def __init__( - self, - model_name: str = "text-embedding-ada-002", - api_key: Optional[str] = None, - base_url: Optional[str] = None, - **kwargs: Any, - ): - """ - Initialize the OpenAI encoder. - - :param model_name: The name of the embedding model to use. See https://beta.openai.com/docs/api-reference/embeddings - :param kwargs: Additional arguments to pass to the underlying openai client. See https://github.com/openai/openai-python - """ - if not _openai_installed: - raise ImportError( - "Failed to import openai. Make sure you install openai extra " - "dependencies by running: " - "`pip install pinecone-text[openai]" - ) - self._model_name = model_name - self._client = openai.AzureOpenAI( - api_key=api_key, - api_version="2023-05-15", - azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), - base_url=base_url, - **kwargs, - ) + Initialize the Azure OpenAI encoder. + + Note: this method reflects the AzureOpenAI client initialization behaviour (See https://github.com/openai/openai-python/blob/main/src/openai/lib/azure.py). 
+ You may explicitly pass any argument that the AzureOpenAI client accepts, or use the following environment variables: + + - `AZURE_OPENAI_API_KEY` as `api_key` + - `AZURE_OPENAI_ENDPOINT` as `azure_endpoint` + - `OPENAI_API_VERSION` as `api_version` + - `OPENAI_ORG_ID` as `organization` + - `AZURE_OPENAI_AD_TOKEN` as `azure_ad_token` + + In addition, you must pass the `model_name` argument with the name of the deployment you wish to use in your own Azure account. + + Example: + Using environment variables: + >>> import os + >>> from pinecone_text.dense import AzureOpenAIEncoder + >>> os.environ['AZURE_OPENAI_API_KEY'] = "sk-..." + >>> os.environ['AZURE_OPENAI_ENDPOINT'] = "https://.....openai.azure.com/" + >>> os.environ['OPENAI_API_VERSION'] = "2023-12-01-preview" + >>> encoder = AzureOpenAIEncoder(model_name="my-ada-002-deployment") + >>> encoder.encode_documents(["some text", "some other text"]) + + Passing arguments explicitly: + >>> from pinecone_text.dense import AzureOpenAIEncoder + >>> encoder = AzureOpenAIEncoder(api_key="sk-...", azure_endpoint="https://.....openai.azure.com/", api_version="2023-12-01-preview") + """ # noqa: E501 + + def __init__(self, model_name: str, **kwargs: Any): + super().__init__(model_name=model_name, **kwargs) + + @staticmethod + def _create_client(**kwargs: Any) -> openai.AzureOpenAI: + return openai.AzureOpenAI(**kwargs) diff --git a/pyproject.toml b/pyproject.toml index d53f849..2fe42af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pinecone-text" -version = "0.7.1" +version = "0.7.2" description = "Text utilities library by Pinecone.io" authors = ["Pinecone.io"] readme = "README.md" diff --git a/tests/system/test_openai_encoder.py b/tests/system/test_openai_encoder.py index 24c8f81..d1a8ad1 100644 --- a/tests/system/test_openai_encoder.py +++ b/tests/system/test_openai_encoder.py @@ -1,19 +1,28 @@ import pytest -from pinecone_text.dense import OpenAIEncoder +import os +from 
pinecone_text.dense import OpenAIEncoder, AzureOpenAIEncoder from openai import BadRequestError, AuthenticationError DEFAULT_DIMENSION = 1536 -@pytest.fixture -def openai_encoder(): - return OpenAIEncoder() +@pytest.fixture(params=[OpenAIEncoder, AzureOpenAIEncoder]) +def openai_encoder(request): + if request.param == OpenAIEncoder: + return request.param() + else: + model_name = os.environ.get("EMBEDDINGS_AZURE_OPENAI_DEPLOYMENT_NAME") + return request.param(model_name=model_name) -def test_init_with_kwargs(): - encoder = OpenAIEncoder( - api_key="test_api_key", organization="test_organization", timeout=30 +@pytest.mark.parametrize("encoder_class", [OpenAIEncoder, AzureOpenAIEncoder]) +def test_init_with_kwargs(encoder_class): + encoder = encoder_class( + api_key="test_api_key", + organization="test_organization", + timeout=30, + model_name="test_model_name", ) assert encoder._client.api_key == "test_api_key" assert encoder._client.organization == "test_organization" diff --git a/tests/unit/test_openai_encoder.py b/tests/unit/test_openai_encoder.py index 0f24352..3aff629 100644 --- a/tests/unit/test_openai_encoder.py +++ b/tests/unit/test_openai_encoder.py @@ -1,6 +1,6 @@ import pytest from unittest.mock import patch, Mock -from pinecone_text.dense import OpenAIEncoder +from pinecone_text.dense import OpenAIEncoder, AzureOpenAIEncoder def create_mock_response(embeddings): @@ -13,10 +13,11 @@ def create_mock_response(embeddings): mock_multiple_embeddings = create_mock_response([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) -@pytest.fixture -def openai_encoder(): +@pytest.fixture(params=[OpenAIEncoder, AzureOpenAIEncoder]) +def encoder(request): + encoder_class = request.param with patch("pinecone_text.dense.openai_encoder.openai"): - yield OpenAIEncoder() + return encoder_class(model_name="test_model_name") def test_init_without_openai_installed(): @@ -25,21 +26,31 @@ def test_init_without_openai_installed(): OpenAIEncoder() -def test_init_with_kwargs(): 
+@pytest.mark.parametrize("encoder_class", [OpenAIEncoder, AzureOpenAIEncoder]) +def test_init_with_kwargs(encoder_class): with patch("pinecone_text.dense.openai_encoder.openai") as mock_openai: - OpenAIEncoder( - api_key="test_api_key", organization="test_organization", timeout=30 - ) - mock_openai.OpenAI.assert_called_with( + encoder_class( api_key="test_api_key", organization="test_organization", - base_url=None, timeout=30, + model_name="test_model_name", ) - - -def encode_by_type(openai_encoder, encoding_function, test_input): - func = getattr(openai_encoder, encoding_function) + if encoder_class == OpenAIEncoder: + mock_openai.OpenAI.assert_called_with( + api_key="test_api_key", + organization="test_organization", + timeout=30, + ) + else: + mock_openai.AzureOpenAI.assert_called_with( + api_key="test_api_key", + organization="test_organization", + timeout=30, + ) + + +def encode_by_type(encoder, encoding_function, test_input): + func = getattr(encoder, encoding_function) return func(test_input) @@ -50,12 +61,10 @@ def encode_by_type(openai_encoder, encoding_function, test_input): ("encode_queries"), ], ) -def test_encode_single_text(openai_encoder, encoding_function): - with patch.object( - openai_encoder._client, "embeddings", create=True - ) as mock_embeddings: +def test_encode_single_text(encoder, encoding_function): + with patch.object(encoder._client, "embeddings", create=True) as mock_embeddings: mock_embeddings.create.return_value = mock_single_embedding - result = encode_by_type(openai_encoder, encoding_function, "test text") + result = encode_by_type(encoder, encoding_function, "test text") assert result == [0.1, 0.2, 0.3] @@ -66,12 +75,10 @@ def test_encode_single_text(openai_encoder, encoding_function): ("encode_queries"), ], ) -def test_encode_multiple_texts(openai_encoder, encoding_function): - with patch.object( - openai_encoder._client, "embeddings", create=True - ) as mock_embeddings: +def test_encode_multiple_texts(encoder, 
encoding_function): + with patch.object(encoder._client, "embeddings", create=True) as mock_embeddings: mock_embeddings.create.return_value = mock_multiple_embeddings - result = encode_by_type(openai_encoder, encoding_function, ["text1", "text2"]) + result = encode_by_type(encoder, encoding_function, ["text1", "text2"]) assert result == [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] @@ -82,9 +89,9 @@ def test_encode_multiple_texts(openai_encoder, encoding_function): ("encode_queries"), ], ) -def test_encode_invalid_input(openai_encoder, encoding_function): +def test_encode_invalid_input(encoder, encoding_function): with pytest.raises(ValueError): - encode_by_type(openai_encoder, encoding_function, 123) + encode_by_type(encoder, encoding_function, 123) @pytest.mark.parametrize( @@ -94,10 +101,8 @@ def test_encode_invalid_input(openai_encoder, encoding_function): ("encode_queries"), ], ) -def test_encode_error_handling(openai_encoder, encoding_function): - with patch.object( - openai_encoder._client, "embeddings", create=True - ) as mock_embeddings: +def test_encode_error_handling(encoder, encoding_function): + with patch.object(encoder._client, "embeddings", create=True) as mock_embeddings: mock_embeddings.create.side_effect = ValueError("OpenAI API error") with pytest.raises(ValueError, match="OpenAI API error"): - encode_by_type(openai_encoder, encoding_function, "test text") + encode_by_type(encoder, encoding_function, "test text") From ff977e862300aa9a626209acc49b66c36595b022 Mon Sep 17 00:00:00 2001 From: Amnon Catav Date: Wed, 10 Jan 2024 17:28:50 +0200 Subject: [PATCH 5/5] add azure openai to readme --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 1b02150..084e821 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,9 @@ encoder.encode_queries(["Who jumped over the lazy dog?"]) # [[0.11, 0.43, 0.67, ...]] ``` +Pinecone text also supports Azure OpenAI API. 
To use it, you need to import the `AzureOpenAIEncoder` class instead of `OpenAIEncoder`. You also need to pass Azure-specific environment variables to the constructor, along with your specific embeddings deployment as the model name. For more information, please refer to the `AzureOpenAIEncoder` documentation. + + ## Combining Sparse and Dense Encodings for Hybrid Search To combine sparse and dense encodings for hybrid search, you can use the `hybrid_convex_scale` method on your query.