Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

langchain musings #6

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions generative_ai/langchain/doc_qa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from langchain.llms import OpenAI
from langchain.chains import (
StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
)
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.prompts import PromptTemplate
import pprint

def load_vectorstore(persist_directory, collection_name, embedding_function):
    """
    Open an existing Chroma vector store persisted on disk.

    Args:
        persist_directory (str): Directory where the vector store is persisted.
        collection_name (str): Name of the Chroma collection to open.
        embedding_function (callable): Embedding function used to vectorize queries.

    Returns:
        Chroma: Vector store handle backed by the persisted collection.
    """
    return Chroma(
        persist_directory=persist_directory,
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

def chain_executor(question, chat_history, chat_model, question_generator_chain, document_chain):
    """
    Execute the conversation chain to generate a rephrased question based on conversation history.

    Args:
        question (str): Input question.
        chat_history (list): List of (question, answer) tuples representing chat history.
        chat_model (ChatOpenAI): Chat model for question generation.
        question_generator_chain: LLMChain for generating questions.
        document_chain: QAWithSourcesChain for document-based question answering.

    Returns:
        dict: Result containing the answer and rephrased question.
    """
    # NOTE(review): `chat_model` is a ChatOpenAI instance, yet it is invoked here
    # like a chain with a {"question", "chat_history"} dict and the result is
    # indexed with ['answer'] below. ChatOpenAI is not documented to accept this
    # input or return a dict, so this call presumably fails at runtime — confirm
    # against the langchain version this project pins. The ConversationalRetrievalChain
    # built (but never used) in main() looks like the intended entry point.
    retriever = chat_model({"question": question, "chat_history": chat_history})
    # Condense the prior answer plus chat history into a standalone question
    # using the prompt-driven LLMChain.
    condensed_question = question_generator_chain({"question": retriever['answer'], "chat_history": chat_history})
    # NOTE(review): load_qa_with_sources_chain chains expect an "input_documents"
    # key (a list of Documents); the "retriever" key passed here is presumably
    # ignored or rejected — verify.
    answer_with_sources = document_chain({"question": condensed_question, "retriever": retriever})
    return {"answer": retriever['answer'], "rephrased_question": condensed_question, "sources": answer_with_sources}

def main():
    """
    Run a two-turn conversational QA session over the persisted vector store.

    Assembles a ConversationalRetrievalChain from a question-condensing LLMChain
    and a "stuff" QA-with-sources chain, then asks two questions, threading the
    chat history between turns.
    """
    persist_directory = "./data/chroma_db"
    collection_name = "test_collection"

    embeddings = OpenAIEmbeddings()
    vectorstore = load_vectorstore(persist_directory, collection_name, embeddings)

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    # Prompt that rewrites (chat_history, follow-up question) into a standalone question.
    template = """You are a friendly assistant. Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
    prompt = PromptTemplate.from_template(template)
    question_generator_chain = LLMChain(llm=llm, prompt=prompt)
    doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff")

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})
    chain = ConversationalRetrievalChain(
        retriever=retriever,
        question_generator=question_generator_chain,
        combine_docs_chain=doc_chain,
    )

    # BUG FIX: the original built `chain` but never used it — it called
    # chain_executor(...) with the raw llm, which invokes ChatOpenAI with a dict
    # and fails. Run the assembled ConversationalRetrievalChain instead.
    chat_history = []
    question = "what question should i ask you?"
    result = chain({"question": question, "chat_history": chat_history})
    pprint.pprint(result)

    # Second turn: feed the first Q/A pair back as history so the follow-up
    # gets condensed into a standalone query before retrieval.
    chat_history.append((question, result['answer']))
    question = "What other questions should i ask you?"
    result = chain({"question": question, "chat_history": chat_history})
    pprint.pprint(result)

if __name__ == "__main__":
    main()
134 changes: 134 additions & 0 deletions generative_ai/langchain/load_vectordb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

class DocumentProcessor:
    """Loads a PDF and splits it into fixed-size text chunks."""

    def __init__(self, pdf_path, chunk_size=500, chunk_overlap=0):
        """
        Configure the processor.

        Args:
            pdf_path (str): Path to the PDF document.
            chunk_size (int, optional): Maximum size of each text chunk. Defaults to 500.
            chunk_overlap (int, optional): Characters shared by adjacent chunks. Defaults to 0.
        """
        self.pdf_path = pdf_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_document(self):
        """
        Read the PDF at ``self.pdf_path`` using UnstructuredPDFLoader.

        Returns:
            The loader's output (langchain document objects).
        """
        return UnstructuredPDFLoader(self.pdf_path).load()

    def split_document(self, data):
        """
        Chunk loaded documents with RecursiveCharacterTextSplitter.

        Args:
            data: Documents produced by ``load_document``.

        Returns:
            list: Document splits sized by the configured chunk settings.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return splitter.split_documents(data)

class VectorStoreManager:
    """Handles vector store creation and similarity searches."""

    def __init__(self, persist_directory, collection_name, embedding_function):
        """
        Initialize the VectorStoreManager.

        Args:
            persist_directory (str): Directory for vector store persistence.
            collection_name (str): Name of the collection.
            embedding_function (callable): Embedding function for creating vectors.
        """
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.embedding_function = embedding_function

    def create_vector_store(self, documents):
        """
        Create a vector store using Chroma.from_documents.

        Args:
            documents (list): List of document splits.

        Returns:
            Chroma: Created vector store.
        """
        vectorstore = Chroma.from_documents(
            documents=documents, embedding=self.embedding_function,
            persist_directory=self.persist_directory,
            collection_name=self.collection_name
        )
        return vectorstore

    def similarity_search(self, question, k=4):
        """
        Perform a similarity search in the persisted vector store.

        Args:
            question (str): Question for similarity search.
            k (int, optional): Number of documents to return. Defaults to 4
                (Chroma's own default, so existing callers are unaffected).

        Returns:
            list: List of similar documents.
        """
        # The collection is selected when the store is opened; re-opening per
        # call keeps this method stateless with respect to create_vector_store.
        vectorstore = Chroma(
            persist_directory=self.persist_directory,
            collection_name=self.collection_name,
            embedding_function=self.embedding_function
        )
        # BUG FIX: Chroma.similarity_search takes (query, k, filter) — the
        # original passed an unsupported collection_name= kwarg, which raises
        # TypeError on every call.
        return vectorstore.similarity_search(question, k=k)

def main():
    """Load a PDF, index its chunks in Chroma, and run a sample similarity search."""
    pdf_path = "./data/test.pdf"
    persist_directory = "./data/chroma_db"
    collection_name = "test_collection"

    # Load the PDF and break it into embedding-sized chunks.
    processor = DocumentProcessor(pdf_path)
    doc_splits = processor.split_document(processor.load_document())

    # Embed the chunks and persist them in the Chroma collection.
    embeddings = OpenAIEmbeddings()
    manager = VectorStoreManager(persist_directory, collection_name, embeddings)
    manager.create_vector_store(doc_splits)

    # Query the freshly built store and report what came back.
    question = "<test questions>"
    similar_docs = manager.similarity_search(question)

    print(f"Number of similar documents: {len(similar_docs)}")
    print(f"Retrieved similar documents: {similar_docs}")

if __name__ == "__main__":
    main()


#---- optional load from disk:
# vectorstore = Chroma(
# persist_directory="./data/chroma_db",
# collection_name="test_collection",
# embedding_function=OpenAIEmbeddings()
# )
# ---------------------------

# ---- DB interaction: https://docs.trychroma.com/api-reference
# import chromadb
# client = chromadb.PersistentClient(path="./data/chroma_db")
# client.list_collections()
# client.delete_collection("langchain")
# ---------------------------
14 changes: 14 additions & 0 deletions generative_ai/langchain/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
chromadb
langchain
openai
pandas
pypdf
pdf2image
pdfminer.six
pyodbc
pytest-cov
pytesseract
tabulate
tiktoken
unstructured
xlrd