Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

langchain musings #6

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions generative_ai/langchain/doc_qa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from langchain.llms import OpenAI
from langchain.chains import (
StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
)
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.prompts import PromptTemplate
import pprint

def load_vectorstore(persist_directory, collection_name, embedding_function):
    """
    Open an existing Chroma vector store persisted on disk.

    Args:
        persist_directory (str): Directory where the vector store is persisted.
        collection_name (str): Name of the Chroma collection to open.
        embedding_function (callable): Embedding function used to vectorize queries.

    Returns:
        Chroma: Vector store handle backed by the persisted collection.
    """
    return Chroma(
        persist_directory=persist_directory,
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

def chain_executor(question, chat_history, chat_model, question_generator_chain, document_chain):
    """
    Execute the conversation chain to generate a rephrased question based on conversation history.

    Args:
        question (str): Input question.
        chat_history (list): List of (question, answer) tuples representing chat history.
        chat_model (ChatOpenAI): Chat model for question generation.
        question_generator_chain: LLMChain for generating questions.
        document_chain: QAWithSourcesChain for document-based question answering.

    Returns:
        dict: Result containing the answer and rephrased question.
    """
    # NOTE(review): `chat_model` is a ChatOpenAI instance, yet it is invoked here
    # like a chain with a {"question", "chat_history"} dict and the result is
    # indexed with ['answer'] below. ChatOpenAI is not documented to accept this
    # input or return a dict, so this call presumably fails at runtime — confirm
    # against the langchain version this project pins. The ConversationalRetrievalChain
    # built (but never used) in main() looks like the intended entry point.
    retriever = chat_model({"question": question, "chat_history": chat_history})
    # Condense the prior answer plus chat history into a standalone question
    # using the prompt-driven LLMChain.
    condensed_question = question_generator_chain({"question": retriever['answer'], "chat_history": chat_history})
    # NOTE(review): load_qa_with_sources_chain chains expect an "input_documents"
    # key (a list of Documents); the "retriever" key passed here is presumably
    # ignored or rejected — verify.
    answer_with_sources = document_chain({"question": condensed_question, "retriever": retriever})
    return {"answer": retriever['answer'], "rephrased_question": condensed_question, "sources": answer_with_sources}

def main():
    """
    Run a two-turn conversational QA session over the persisted vector store.

    Assembles a ConversationalRetrievalChain from a question-condensing LLMChain
    and a "stuff" QA-with-sources chain, then asks two questions, threading the
    chat history between turns.
    """
    persist_directory = "./data/chroma_db"
    collection_name = "test_collection"

    embeddings = OpenAIEmbeddings()
    vectorstore = load_vectorstore(persist_directory, collection_name, embeddings)

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    # Prompt that rewrites (chat_history, follow-up question) into a standalone question.
    template = """You are a friendly assistant. Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
    prompt = PromptTemplate.from_template(template)
    question_generator_chain = LLMChain(llm=llm, prompt=prompt)
    doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff")

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})
    chain = ConversationalRetrievalChain(
        retriever=retriever,
        question_generator=question_generator_chain,
        combine_docs_chain=doc_chain,
    )

    # BUG FIX: the original built `chain` but never used it — it called
    # chain_executor(...) with the raw llm, which invokes ChatOpenAI with a dict
    # and fails. Run the assembled ConversationalRetrievalChain instead.
    chat_history = []
    question = "what question should i ask you?"
    result = chain({"question": question, "chat_history": chat_history})
    pprint.pprint(result)

    # Second turn: feed the first Q/A pair back as history so the follow-up
    # gets condensed into a standalone query before retrieval.
    chat_history.append((question, result['answer']))
    question = "What other questions should i ask you?"
    result = chain({"question": question, "chat_history": chat_history})
    pprint.pprint(result)

if __name__ == "__main__":
    main()
134 changes: 134 additions & 0 deletions generative_ai/langchain/load_vectordb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

class DocumentProcessor:
    """Loads a PDF and splits it into fixed-size text chunks."""

    def __init__(self, pdf_path, chunk_size=500, chunk_overlap=0):
        """
        Configure the processor.

        Args:
            pdf_path (str): Path to the PDF document.
            chunk_size (int, optional): Maximum size of each text chunk. Defaults to 500.
            chunk_overlap (int, optional): Characters shared by adjacent chunks. Defaults to 0.
        """
        self.pdf_path = pdf_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_document(self):
        """
        Read the PDF at ``self.pdf_path`` using UnstructuredPDFLoader.

        Returns:
            The loader's output (langchain document objects).
        """
        return UnstructuredPDFLoader(self.pdf_path).load()

    def split_document(self, data):
        """
        Chunk loaded documents with RecursiveCharacterTextSplitter.

        Args:
            data: Documents produced by ``load_document``.

        Returns:
            list: Document splits sized by the configured chunk settings.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return splitter.split_documents(data)

class VectorStoreManager:
    """Handles vector store creation and similarity searches."""

    def __init__(self, persist_directory, collection_name, embedding_function):
        """
        Initialize the VectorStoreManager.

        Args:
            persist_directory (str): Directory for vector store persistence.
            collection_name (str): Name of the collection.
            embedding_function (callable): Embedding function for creating vectors.
        """
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.embedding_function = embedding_function

    def create_vector_store(self, documents):
        """
        Create a vector store using Chroma.from_documents.

        Args:
            documents (list): List of document splits.

        Returns:
            Chroma: Created vector store.
        """
        vectorstore = Chroma.from_documents(
            documents=documents, embedding=self.embedding_function,
            persist_directory=self.persist_directory,
            collection_name=self.collection_name
        )
        return vectorstore

    def similarity_search(self, question, k=4):
        """
        Perform a similarity search in the persisted vector store.

        Args:
            question (str): Question for similarity search.
            k (int, optional): Number of documents to return. Defaults to 4
                (Chroma's own default, so existing callers are unaffected).

        Returns:
            list: List of similar documents.
        """
        # The collection is selected when the store is opened; re-opening per
        # call keeps this method stateless with respect to create_vector_store.
        vectorstore = Chroma(
            persist_directory=self.persist_directory,
            collection_name=self.collection_name,
            embedding_function=self.embedding_function
        )
        # BUG FIX: Chroma.similarity_search takes (query, k, filter) — the
        # original passed an unsupported collection_name= kwarg, which raises
        # TypeError on every call.
        return vectorstore.similarity_search(question, k=k)

def main():
    """Load a PDF, index its chunks in Chroma, and run a sample similarity search."""
    pdf_path = "./data/test.pdf"
    persist_directory = "./data/chroma_db"
    collection_name = "test_collection"

    # Load the PDF and break it into embedding-sized chunks.
    processor = DocumentProcessor(pdf_path)
    doc_splits = processor.split_document(processor.load_document())

    # Embed the chunks and persist them in the Chroma collection.
    embeddings = OpenAIEmbeddings()
    manager = VectorStoreManager(persist_directory, collection_name, embeddings)
    manager.create_vector_store(doc_splits)

    # Query the freshly built store and report what came back.
    question = "<test questions>"
    similar_docs = manager.similarity_search(question)

    print(f"Number of similar documents: {len(similar_docs)}")
    print(f"Retrieved similar documents: {similar_docs}")

if __name__ == "__main__":
    main()


#---- optional load from disk:
# vectorstore = Chroma(
# persist_directory="./data/chroma_db",
# collection_name="test_collection",
# embedding_function=OpenAIEmbeddings()
# )
# ---------------------------

# ---- DB interaction: https://docs.trychroma.com/api-reference
# import chromadb
# client = chromadb.PersistentClient(path="./data/chroma_db")
# client.list_collections()
# client.delete_collection("langchain")
# ---------------------------
14 changes: 14 additions & 0 deletions generative_ai/langchain/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
chromadb
langchain
openai
pandas
pypdf
pdf2image
pdfminer.six
pyodbc
pytest-cov
pytesseract
tabulate
tiktoken
unstructured
xlrd