Wip #1

Open · wants to merge 14 commits into base: main
60 changes: 60 additions & 0 deletions misc/gen_rag.py
@@ -0,0 +1,60 @@
import sys
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama

# set some config variables for ChromaDB
CHROMA_DATA_PATH = "vdb_data/"
DOC_PATH = sys.argv[1]
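# usage: python misc/gen_rag.py <path-to-pdf>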

llm = Ollama(model="mistral")

# load your pdf doc
loader = PyPDFLoader(DOC_PATH)
pages = loader.load()

# split the doc into smaller chunks, e.g. chunk_size=500 with chunk_overlap=50
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(pages)


embeddings = FastEmbedEmbeddings()

# embed the chunks as vectors and load them into the database
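# (the index is persisted under CHROMA_DATA_PATH so it can be reused across runs)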
db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_DATA_PATH)

PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
Answer the question based on the above context: {question}
Provide a detailed answer.
Don't justify your answers.
Don't give information that is not mentioned in the context above.
Do not say "according to the context" or "mentioned in the context" or similar.
"""


while True:
    query = input("\nQuery: ")
    if query == "exit":
        break
    if query.strip() == "":
        continue
    # retrieve context - top 5 most relevant (closest) chunks to the query vector
    # (by default LangChain uses the cosine distance metric)
    docs_chroma = db_chroma.similarity_search_with_score(query, k=5)

    # generate an answer based on the user query and the retrieved context
    context_text = "\n\n".join([doc.page_content for doc, _score in docs_chroma])

    # load the retrieved context and the user query into the prompt template
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query)

    # call the LLM to generate an answer from the given context and query
    response_text = llm.invoke(prompt)
    print(response_text)
30 changes: 30 additions & 0 deletions misc/genai_agent.py
@@ -0,0 +1,30 @@
import crewai
from langchain_community.llms import Ollama
from langchain_community.tools import DuckDuckGoSearchRun
from crewai_tools import tool

llm = Ollama(model="mistral")

def callback_function(output):
    print(f"Task completed: {output.raw_output}")

# the @tool decorator registers this function as a CrewAI tool the agent can call
@tool("DuckDuckGoSearch")
def search(search_query: str) -> str:
    """Search the web for information on a given topic."""
    return DuckDuckGoSearchRun().run(search_query)

agent = crewai.Agent(
    role="Calendar",
    goal="What day of the month is Thanksgiving on in 2024?",
    backstory="You are a calendar assistant. You provide information about dates.",
    tools=[search],
    llm=llm,
    allow_delegation=False,
    verbose=True,
)

task = crewai.Task(
    description="What day of the month is Thanksgiving on in 2024?",
    agent=agent,
    expected_output="Date of Thanksgiving in 2024",
    callback=callback_function,  # print the raw output when the task finishes
)

crew = crewai.Crew(agents=[agent], tasks=[task], verbose=True)
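# kickoff() executes the task with the agent and returns the final answer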
res = crew.kickoff()
print(res)
49 changes: 49 additions & 0 deletions misc/genai_agent2.py
@@ -0,0 +1,49 @@
from crewai import Crew, Agent, Task
from langchain_community.llms import Ollama
from crewai_tools import WebsiteSearchTool

searchtool = WebsiteSearchTool(
    website="https://www.almanac.com/thanksgiving-day",
    config=dict(
        llm=dict(
            provider="ollama",  # or google, openai, anthropic, llama2, ...
            config=dict(
                model="phi3",
            ),
        ),
        embedder=dict(
            provider="ollama",
            config=dict(
                model="mxbai-embed-large:latest",
            ),
        ),
    ),
)



llm = Ollama(model="phi3")
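# note: "phi3" and "mxbai-embed-large" must be pulled into Ollama first (ollama pull <model>)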

agent = Agent(
    role="Calendar",
    goal="What day of the month is Thanksgiving on in 2024?",
    backstory="You are a calendar assistant. You provide information about dates.",
    tools=[searchtool],
    llm=llm,
    allow_delegation=False,
    verbose=True,
)

task = Task(
    description="What day of the month is Thanksgiving on in 2024?",
    agent=agent,
    expected_output="Date of Thanksgiving in 2024",
)

crew = Crew(agents=[agent], tasks=[task], verbose=True)
res = crew.kickoff()
print(res)
1 change: 1 addition & 0 deletions misc/genai_crew.py
@@ -0,0 +1 @@

41 changes: 41 additions & 0 deletions misc/genai_rag2.py
@@ -0,0 +1,41 @@
import wget
from langchain_community.vectorstores import Qdrant
from langchain_community.document_loaders import BSHTMLLoader
from langchain.chains import RetrievalQA
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.llms import Ollama

# download "Alice's Adventures in Wonderland", chapter I
# (swap in the commented URL below to use "War and Peace" by Tolstoy instead)
# wget.download("http://az.lib.ru/t/tolstoj_lew_nikolaewich/text_0073.shtml")
wget.download("https://www.cs.cmu.edu/~rgs/alice-I.html")

# load text from the HTML file
loader = BSHTMLLoader("alice-I.html", open_encoding='ISO-8859-1')
docs = loader.load()

#init Vector DB

embeddings = FastEmbedEmbeddings()

doc_store = Qdrant.from_documents(
    docs,
    embeddings,
    location=":memory:",
    collection_name="docs",
)

llm = Ollama(model="mistral")
# build the QA chain once, then answer questions in a loop
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_store.as_retriever(),
    return_source_documents=False,
)

while True:
    question = input('Your question: ')
    result = qa.invoke(question)
    print(f"Answer: {result}")
45 changes: 45 additions & 0 deletions misc/genai_rag3.py
@@ -0,0 +1,45 @@
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain

llm = Ollama(model="mistral", base_url="http://127.0.0.1:11434")


embed_model = OllamaEmbeddings(
    model="mistral",
    base_url="http://127.0.0.1:11434",
)


text = """
In the lush canopy of a tropical rainforest, two mischievous monkeys, Coco and Mango, swung from branch to branch, their playful antics echoing through the trees. They were inseparable companions, sharing everything from juicy fruits to secret hideouts high above the forest floor. One day, while exploring a new part of the forest, Coco stumbled upon a beautiful orchid hidden among the foliage. Entranced by its delicate petals, Coco plucked it and presented it to Mango with a wide grin. Overwhelmed by Coco's gesture of friendship, Mango hugged Coco tightly, cherishing the bond they shared. From that day on, Coco and Mango ventured through the forest together, their friendship growing stronger with each passing adventure. As they watched the sun dip below the horizon, casting a golden glow over the treetops, they knew that no matter what challenges lay ahead, they would always have each other, and their hearts brimmed with joy.
"""

text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
chunks = text_splitter.split_text(text)

vector_store = Chroma.from_texts(chunks, embed_model)
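# note: without a persist_directory, this Chroma collection is in-memory only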


retriever = vector_store.as_retriever()


retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

response = retrieval_chain.invoke({"input": "Tell me the names of the monkeys and where they live"})
print(response['answer'])

46 changes: 46 additions & 0 deletions misc/genai_rag4.py
@@ -0,0 +1,46 @@
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain

llm = Ollama(model="mistral", base_url="http://127.0.0.1:11434")


embed_model = OllamaEmbeddings(
    model="mistral",
    base_url="http://127.0.0.1:11434",
)


text = """
In the lush canopy of a tropical rainforest, two mischievous monkeys, Coco and Mango, swung from branch to branch, their playful antics echoing through the trees. They were inseparable companions, sharing everything from juicy fruits to secret hideouts high above the forest floor. One day, while exploring a new part of the forest, Coco stumbled upon a beautiful orchid hidden among the foliage. Entranced by its delicate petals, Coco plucked it and presented it to Mango with a wide grin. Overwhelmed by Coco's gesture of friendship, Mango hugged Coco tightly, cherishing the bond they shared. From that day on, Coco and Mango ventured through the forest together, their friendship growing stronger with each passing adventure. As they watched the sun dip below the horizon, casting a golden glow over the treetops, they knew that no matter what challenges lay ahead, they would always have each other, and their hearts brimmed with joy.
"""

text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
chunks = text_splitter.split_text(text)

vector_store = Chroma.from_texts(chunks, embed_model)


retriever = vector_store.as_retriever()


retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

response = retrieval_chain.invoke({"input": "Tell me the names of the monkeys and where they live"})
print(response['answer'])


62 changes: 62 additions & 0 deletions misc/genai_rag5.py
@@ -0,0 +1,62 @@
import sys
import PyPDF2
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain

llm = Ollama(model="mistral", base_url="http://127.0.0.1:11434")


pdf_path = sys.argv[1]
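# usage: python misc/genai_rag5.py <path-to-pdf>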

# Function to extract text from a PDF file on disk
def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PyPDF2.PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text()
    return text



embed_model = OllamaEmbeddings(
    model="mistral",
    base_url="http://127.0.0.1:11434",
)


text = extract_text_from_pdf(pdf_path)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
chunks = text_splitter.split_text(text)

vector_store = Chroma.from_texts(chunks, embed_model)


retriever = vector_store.as_retriever()


retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# ask a question about the loaded PDF (create_retrieval_chain expects an "input" key)
response = retrieval_chain.invoke({"input": "What is this document about?"})
print(response['answer'])
44 changes: 44 additions & 0 deletions misc/genai_rag6.py
@@ -0,0 +1,44 @@
import fitz  # PyMuPDF
import torch
from transformers import RagTokenizer, RagTokenForGeneration

def extract_text_from_pdf(pdf_path):
    """Extracts text from a given PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

def setup_rag_model():
    """Sets up the RAG tokenizer and model."""
    # note: the first call downloads the pretrained facebook/rag-token-nq weights
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
    model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")
    return tokenizer, model

def answer_question(question, context, tokenizer, model):
    """Generates an answer to the question based on the context using RAG."""
    # With no RagRetriever attached, generate() expects pre-tokenized context
    # documents plus doc_scores; here the PDF text serves as the single "document".
    context_inputs = tokenizer.generator(context + " // " + question, return_tensors="pt",
                                         truncation=True, max_length=512)
    output_ids = model.generate(context_input_ids=context_inputs["input_ids"],
                                context_attention_mask=context_inputs["attention_mask"],
                                doc_scores=torch.tensor([[1.0]]), n_docs=1)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

# Example usage
pdf_path = 'sample.pdf' # Path to your PDF file
context = extract_text_from_pdf(pdf_path)
tokenizer, model = setup_rag_model()
question = "What is the main topic of the document?"
answer = answer_question(question, context, tokenizer, model)
print("Answer:", answer)


### Running the Example
# 1. Replace 'sample.pdf' with the path to your PDF file.
# 2. Make sure you have a question that relates to the content of the PDF.
# 3. Execute the script.

### How It Works
#- **PDF Text Extraction**: The `extract_text_from_pdf` function reads the PDF and extracts all text from it. This text serves as the context for generating answers.
#- **Model Setup**: The `setup_rag_model` function loads the pre-trained RAG tokenizer and model.
#- **Answer Generation**: The `answer_question` function uses the model and tokenizer to generate an answer to the input question based on the extracted PDF text.
