Add LlamaIndex as a Datastore (openai#99)
* add dep

* wip

* wip

* wuip

* wip

* update dependency

* wip

* wip

* wip

* wip

* lock

* wip

* wip

* wip

* wup

* wip

* update llama-index

* wip

* refactor docs

* update docs and misc

* update remove unused dep

* wip

* wip
Disiok authored Apr 5, 2023
1 parent 964072d commit 96add7c
Showing 8 changed files with 696 additions and 42 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -257,6 +257,17 @@ For more detailed instructions on setting up and using each vector database prov

[Redis](https://redis.com/solutions/use-cases/vector-database/) is a real-time data platform suitable for a variety of use cases, including everyday applications and AI/ML workloads. It can be used as a low-latency vector engine by creating a Redis database with the [Redis Stack docker container](/examples/docker/redis/docker-compose.yml). For a hosted/managed solution, [Redis Cloud](https://app.redislabs.com/#/) is available. For detailed setup instructions, refer to [`/docs/providers/redis/setup.md`](/docs/providers/redis/setup.md).


#### LlamaIndex

[LlamaIndex](https://github.com/jerryjliu/llama_index) is a central interface to connect your LLMs with external data.
It provides a suite of in-memory indices over your unstructured and structured data for use with ChatGPT.
Unlike standard vector databases, LlamaIndex supports a wide range of indexing strategies (e.g. tree, keyword table, knowledge graph) optimized for different use cases.
It is lightweight, easy to use, and requires no additional deployment.
All you need to do is specify a few environment variables (and, optionally, point to an existing saved index JSON file).
Note that metadata filters in queries are not yet supported.
For detailed setup instructions, refer to [`/docs/providers/llama/setup.md`](/docs/providers/llama/setup.md).

### Running the API locally

To run the API locally, you first need to set the requisite environment variables with the `export` command:
4 changes: 4 additions & 0 deletions datastore/factory.py
@@ -7,6 +7,10 @@ async def get_datastore() -> DataStore:
assert datastore is not None

match datastore:
case "llama":
from datastore.providers.llama_datastore import LlamaDataStore
return LlamaDataStore()

case "pinecone":
from datastore.providers.pinecone_datastore import PineconeDataStore

181 changes: 181 additions & 0 deletions datastore/providers/llama_datastore.py
@@ -0,0 +1,181 @@
import json
import os
from typing import Dict, List, Optional, Type
from loguru import logger
from datastore.datastore import DataStore
from models.models import DocumentChunk, DocumentChunkMetadata, DocumentChunkWithScore, DocumentMetadataFilter, Query, QueryResult, QueryWithEmbedding

from llama_index.indices.base import BaseGPTIndex
from llama_index.indices.vector_store.base import GPTVectorStoreIndex
from llama_index.indices.query.schema import QueryBundle
from llama_index.response.schema import Response
from llama_index.data_structs.node_v2 import Node, DocumentRelationship, NodeWithScore
from llama_index.indices.registry import INDEX_STRUCT_TYPE_TO_INDEX_CLASS
from llama_index.data_structs.struct_type import IndexStructType
from llama_index.indices.response.builder import ResponseMode

INDEX_STRUCT_TYPE_STR = os.environ.get('LLAMA_INDEX_TYPE', IndexStructType.SIMPLE_DICT.value)
INDEX_JSON_PATH = os.environ.get('LLAMA_INDEX_JSON_PATH', None)
QUERY_KWARGS_JSON_PATH = os.environ.get('LLAMA_QUERY_KWARGS_JSON_PATH', None)
RESPONSE_MODE = os.environ.get('LLAMA_RESPONSE_MODE', ResponseMode.NO_TEXT.value)

EXTERNAL_VECTOR_STORE_INDEX_STRUCT_TYPES = [
IndexStructType.DICT,
IndexStructType.WEAVIATE,
IndexStructType.PINECONE,
IndexStructType.QDRANT,
IndexStructType.CHROMA,
IndexStructType.VECTOR_STORE,
]

def _create_or_load_index(
index_type_str: Optional[str] = None,
index_json_path: Optional[str] = None,
index_type_to_index_cls: Optional[dict[str, Type[BaseGPTIndex]]] = None,
) -> BaseGPTIndex:
"""Create or load index from json path."""
index_json_path = index_json_path or INDEX_JSON_PATH
index_type_to_index_cls = index_type_to_index_cls or INDEX_STRUCT_TYPE_TO_INDEX_CLASS
index_type_str = index_type_str or INDEX_STRUCT_TYPE_STR
index_type = IndexStructType(index_type_str)

if index_type not in index_type_to_index_cls:
raise ValueError(f'Unknown index type: {index_type}')

if index_type in EXTERNAL_VECTOR_STORE_INDEX_STRUCT_TYPES:
raise ValueError('Please use vector store directly.')

index_cls = index_type_to_index_cls[index_type]
if index_json_path is None:
return index_cls(nodes=[]) # Create empty index
else:
return index_cls.load_from_disk(index_json_path) # Load index from disk

def _create_or_load_query_kwargs(query_kwargs_json_path: Optional[str] = None) -> Optional[dict]:
"""Create or load query kwargs from json path."""
query_kwargs_json_path = query_kwargs_json_path or QUERY_KWARGS_JSON_PATH
query_kwargs: Optional[dict] = None
if query_kwargs_json_path is not None:
with open(query_kwargs_json_path, 'r') as f:
query_kwargs = json.load(f)
return query_kwargs


def _doc_chunk_to_node(doc_chunk: DocumentChunk, source_doc_id: str) -> Node:
"""Convert document chunk to Node"""
return Node(
doc_id=doc_chunk.id,
text=doc_chunk.text,
embedding=doc_chunk.embedding,
extra_info=doc_chunk.metadata.dict(),
relationships={
DocumentRelationship.SOURCE: source_doc_id
}
)

def _query_with_embedding_to_query_bundle(query: QueryWithEmbedding) -> QueryBundle:
return QueryBundle(
query_str=query.query,
embedding=query.embedding,
)

def _source_node_to_doc_chunk_with_score(node_with_score: NodeWithScore) -> DocumentChunkWithScore:
node = node_with_score.node
if node.extra_info is not None:
metadata = DocumentChunkMetadata(**node.extra_info)
else:
metadata = DocumentChunkMetadata()

return DocumentChunkWithScore(
id=node.doc_id,
text=node.text,
score=node_with_score.score if node_with_score.score is not None else 1.,
metadata=metadata,
)

def _response_to_query_result(response: Response, query: QueryWithEmbedding) -> QueryResult:
results = [_source_node_to_doc_chunk_with_score(node) for node in response.source_nodes]
return QueryResult(query=query.query, results=results,)

class LlamaDataStore(DataStore):
def __init__(self, index: Optional[BaseGPTIndex] = None, query_kwargs: Optional[dict] = None):
self._index = index or _create_or_load_index()
self._query_kwargs = query_kwargs or _create_or_load_query_kwargs()

async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
"""
Takes in a dict mapping document ids to lists of document chunks and inserts them into the index.
Returns a list of document ids.
"""
doc_ids = []
for doc_id, doc_chunks in chunks.items():
logger.debug(f"Upserting {doc_id} with {len(doc_chunks)} chunks")

nodes = [
_doc_chunk_to_node(doc_chunk=doc_chunk, source_doc_id=doc_id)
for doc_chunk in doc_chunks
]

self._index.insert_nodes(nodes)
doc_ids.append(doc_id)
return doc_ids

async def _query(
self,
queries: List[QueryWithEmbedding],
) -> List[QueryResult]:
"""
Takes in a list of queries with embeddings and filters and
returns a list of query results with matching document chunks and scores.
"""
query_result_all = []
for query in queries:
if query.filter is not None:
logger.warning('Filters are not supported yet, ignoring for now.')

query_bundle = _query_with_embedding_to_query_bundle(query)

# Setup query kwargs
if self._query_kwargs is not None:
query_kwargs = self._query_kwargs
else:
query_kwargs = {}
# TODO: support top_k for other indices
if isinstance(self._index, GPTVectorStoreIndex):
query_kwargs['similarity_top_k'] = query.top_k

response = await self._index.aquery(query_bundle, response_mode=RESPONSE_MODE, **query_kwargs)

query_result = _response_to_query_result(response, query)
query_result_all.append(query_result)

return query_result_all

async def delete(
self,
ids: Optional[List[str]] = None,
filter: Optional[DocumentMetadataFilter] = None,
delete_all: Optional[bool] = None,
) -> bool:
"""
Removes vectors by ids, filter, or everything in the datastore.
Returns whether the operation was successful.
"""
if delete_all:
logger.warning('Delete all not supported yet.')
return False

if filter is not None:
logger.warning('Filters are not supported yet.')
return False

if ids is not None:
for id_ in ids:
try:
self._index.delete(id_)
except NotImplementedError:
# NOTE: some indices do not support delete yet.
logger.warning(f'{type(self._index)} does not support delete yet.')
return False

return True
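
For reference, here is a minimal usage sketch (not part of this commit) that exercises `LlamaDataStore` outside the API server. It assumes `OPENAI_API_KEY` is set (llama-index wires up OpenAI-backed defaults when building the index), calls the plugin's internal `_upsert`/`_query` methods for illustration, and feeds toy embeddings so that, in the default configuration, no real embedding calls should be needed; field names follow `models/models.py`.

```python
import asyncio

from datastore.providers.llama_datastore import LlamaDataStore
from models.models import DocumentChunk, DocumentChunkMetadata, QueryWithEmbedding


async def main():
    # Builds an empty in-memory GPTSimpleVectorIndex by default (LLAMA_INDEX_TYPE unset).
    datastore = LlamaDataStore()

    # One pre-embedded chunk; real chunks carry OpenAI embeddings computed by the plugin.
    chunk = DocumentChunk(
        id="doc1_chunk0",
        text="LlamaIndex provides in-memory indices over external data.",
        metadata=DocumentChunkMetadata(),
        embedding=[0.1] * 1536,  # toy vector for illustration
    )
    doc_ids = await datastore._upsert({"doc1": [chunk]})
    print("upserted:", doc_ids)

    # Query with a pre-computed embedding; metadata filters are ignored for now.
    query = QueryWithEmbedding(
        query="What does LlamaIndex provide?",
        top_k=3,
        embedding=[0.1] * 1536,
    )
    results = await datastore._query([query])
    print(results[0].results[0].text)


asyncio.run(main())
```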
13 changes: 7 additions & 6 deletions docs/deployment/removing-unused-dependencies.md
@@ -4,11 +4,12 @@ Before deploying your app, you might want to remove unused dependencies from you

Here are the packages you can remove for each vector database provider:

- **Pinecone:** Remove `weaviate-client`, `pymilvus`, `qdrant-client`, and `redis`.
- **Weaviate:** Remove `pinecone-client`, `pymilvus`, `qdrant-client`, and `redis`.
- **Zilliz:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, and `redis`.
- **Milvus:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, and `redis`.
- **Qdrant:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, and `redis`.
- **Redis:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, and `qdrant-client`.
- **Pinecone:** Remove `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, and `llama-index`.
- **Weaviate:** Remove `pinecone-client`, `pymilvus`, `qdrant-client`, `redis`, and `llama-index`.
- **Zilliz:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, `redis`, and `llama-index`.
- **Milvus:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, `redis`, and `llama-index`.
- **Qdrant:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `redis`, and `llama-index`.
- **Redis:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, and `llama-index`.
- **LlamaIndex:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, and `redis`.

After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command.
51 changes: 51 additions & 0 deletions docs/providers/llama/setup.md
@@ -0,0 +1,51 @@

# LlamaIndex

[LlamaIndex](https://github.com/jerryjliu/llama_index) is a central interface to connect your LLMs with external data.
It provides a suite of in-memory indices over your unstructured and structured data for use with ChatGPT.
Unlike standard vector databases, LlamaIndex supports a wide range of indexing strategies (e.g. tree, keyword table, knowledge graph) optimized for different use cases.
It is lightweight, easy to use, and requires no additional deployment.
All you need to do is specify a few environment variables (and, optionally, point to an existing saved index JSON file).
Note that metadata filters in queries are not yet supported.

## Setup
Currently, LlamaIndex requires no additional deployment
and runs as part of the Retrieval Plugin.
It is easy to set up and great for quick prototyping
with ChatGPT and your external data.

**Retrieval App Environment Variables**

| Name | Required | Description |
| ---------------- | -------- | -------------------------------------- |
| `DATASTORE` | Yes | Datastore name. Set this to `llama` |
| `BEARER_TOKEN` | Yes | Your secret token |
| `OPENAI_API_KEY` | Yes | Your OpenAI API key |

**Llama Datastore Environment Variables**

| Name | Required | Description | Default |
| ------------------------------- | -------- | ------------------------------------------------------------------ | ------------------ |
| `LLAMA_INDEX_TYPE` | Optional | Index type (see below for details) | `simple_dict` |
| `LLAMA_INDEX_JSON_PATH` | Optional | Path to saved Index json file | None |
| `LLAMA_QUERY_KWARGS_JSON_PATH` | Optional | Path to saved query kwargs json file | None |
| `LLAMA_RESPONSE_MODE` | Optional | Response mode for query | `no_text` |


**Different Index Types**

By default, we use a `GPTSimpleVectorIndex` to store document chunks in memory,
and retrieve top-k nodes by embedding similarity.
Different index types are optimized for different data and query use cases.
See this guide on [How Each Index Works](https://gpt-index.readthedocs.io/en/latest/guides/primer/index_guide.html) to learn more.
You can configure the index type via the `LLAMA_INDEX_TYPE` environment variable; see [here](https://gpt-index.readthedocs.io/en/latest/reference/indices/composability_query.html#gpt_index.data_structs.struct_type.IndexStructType) for the full list of accepted index type identifiers.
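
To check which identifiers `LLAMA_INDEX_TYPE` accepts with your installed `llama-index` version, you can print the registry the datastore consults (a small sketch, assuming the same import path used in `llama_datastore.py` above):

```python
from llama_index.indices.registry import INDEX_STRUCT_TYPE_TO_INDEX_CLASS

# Maps index type identifiers (valid LLAMA_INDEX_TYPE values) to their index classes.
for struct_type, index_cls in INDEX_STRUCT_TYPE_TO_INDEX_CLASS.items():
    print(struct_type.value, "->", index_cls.__name__)
```

Keep in mind that the external vector store types listed in `EXTERNAL_VECTOR_STORE_INDEX_STRUCT_TYPES` are rejected by this datastore, which asks you to use the corresponding vector database provider directly.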


Read more details on [readthedocs](https://gpt-index.readthedocs.io/en/latest/),
and engage with the community on [discord](https://discord.com/invite/dGcwcsnxhU).

## Running Tests
You can launch the test suite with this command:

```bash
pytest ./tests/datastore/providers/llama/test_llama_datastore.py
```