Wip #1

Open · wants to merge 14 commits into base: main
60 changes: 60 additions & 0 deletions misc/gen_rag.py
@@ -0,0 +1,60 @@
import sys
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama

# set some config variables for ChromaDB
CHROMA_DATA_PATH = "vdb_data/"
DOC_PATH = sys.argv[1]
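# usage: python misc/gen_rag.py <path-to-pdf>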

llm = Ollama(model="mistral")

# load your pdf doc
loader = PyPDFLoader(DOC_PATH)
pages = loader.load()

# split the doc into smaller chunks, e.g. chunk_size=500 with chunk_overlap=50
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(pages)


embeddings = FastEmbedEmbeddings()

# embed the chunks as vectors and load them into the database
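# (the index is persisted under CHROMA_DATA_PATH so it can be reused across runs)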
db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_DATA_PATH)

PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
Answer the question based on the above context: {question}
Provide a detailed answer.
Don't justify your answers.
Don't give information that is not mentioned in the context above.
Do not say "according to the context" or "mentioned in the context" or similar.
"""


while True:
    query = input("\nQuery: ")
    if query == "exit":
        break
    if query.strip() == "":
        continue
    # retrieve context - top 5 most relevant (closest) chunks to the query vector
    # (by default LangChain uses the cosine distance metric)
    docs_chroma = db_chroma.similarity_search_with_score(query, k=5)

    # generate an answer based on the user query and the retrieved context
    context_text = "\n\n".join([doc.page_content for doc, _score in docs_chroma])

    # load the retrieved context and the user query into the prompt template
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query)

    # call the LLM to generate an answer from the given context and query
    response_text = llm.invoke(prompt)
    print(response_text)
30 changes: 30 additions & 0 deletions misc/genai_agent.py
@@ -0,0 +1,30 @@
import crewai
from langchain_community.llms import Ollama
from langchain_community.tools import DuckDuckGoSearchRun
from crewai_tools import tool

llm = Ollama(model="mistral")

def callback_function(output):
    print(f"Task completed: {output.raw_output}")

# the @tool decorator registers this function as a CrewAI tool the agent can call
@tool("DuckDuckGoSearch")
def search(search_query: str) -> str:
    """Search the web for information on a given topic."""
    return DuckDuckGoSearchRun().run(search_query)

agent = crewai.Agent(
    role="Calendar",
    goal="What day of the month is Thanksgiving on in 2024?",
    backstory="You are a calendar assistant. You provide information about dates.",
    tools=[search],
    llm=llm,
    allow_delegation=False,
    verbose=True,
)

task = crewai.Task(
    description="What day of the month is Thanksgiving on in 2024?",
    agent=agent,
    expected_output="Date of Thanksgiving in 2024",
    callback=callback_function,  # print the raw output when the task finishes
)

crew = crewai.Crew(agents=[agent], tasks=[task], verbose=True)
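# kickoff() executes the task with the agent and returns the final answer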
res = crew.kickoff()
print(res)
49 changes: 49 additions & 0 deletions misc/genai_agent2.py
@@ -0,0 +1,49 @@
from crewai import Crew, Agent, Task
from langchain_community.llms import Ollama
from crewai_tools import WebsiteSearchTool

searchtool = WebsiteSearchTool(
    website="https://www.almanac.com/thanksgiving-day",
    config=dict(
        llm=dict(
            provider="ollama",  # or google, openai, anthropic, llama2, ...
            config=dict(
                model="phi3",
            ),
        ),
        embedder=dict(
            provider="ollama",
            config=dict(
                model="mxbai-embed-large:latest",
            ),
        ),
    ),
)



llm = Ollama(model="phi3")
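# note: "phi3" and "mxbai-embed-large" must be pulled into Ollama first (ollama pull <model>)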

agent = Agent(
    role="Calendar",
    goal="What day of the month is Thanksgiving on in 2024?",
    backstory="You are a calendar assistant. You provide information about dates.",
    tools=[searchtool],
    llm=llm,
    allow_delegation=False,
    verbose=True,
)

task = Task(
    description="What day of the month is Thanksgiving on in 2024?",
    agent=agent,
    expected_output="Date of Thanksgiving in 2024",
)

crew = Crew(agents=[agent], tasks=[task], verbose=True)
res = crew.kickoff()
print(res)
1 change: 1 addition & 0 deletions misc/genai_crew.py
@@ -0,0 +1 @@

41 changes: 41 additions & 0 deletions misc/genai_rag2.py
@@ -0,0 +1,41 @@
import wget
from langchain_community.vectorstores import Qdrant
from langchain_community.document_loaders import BSHTMLLoader
from langchain.chains import RetrievalQA
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.llms import Ollama

# download "Alice's Adventures in Wonderland", chapter I
# (swap in the commented URL below to use "War and Peace" by Tolstoy instead)
# wget.download("http://az.lib.ru/t/tolstoj_lew_nikolaewich/text_0073.shtml")
wget.download("https://www.cs.cmu.edu/~rgs/alice-I.html")

# load text from the HTML file
loader = BSHTMLLoader("alice-I.html", open_encoding='ISO-8859-1')
docs = loader.load()

#init Vector DB

embeddings = FastEmbedEmbeddings()

doc_store = Qdrant.from_documents(
    docs,
    embeddings,
    location=":memory:",
    collection_name="docs",
)

llm = Ollama(model="mistral")
# build the QA chain once, then answer questions in a loop
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_store.as_retriever(),
    return_source_documents=False,
)

while True:
    question = input('Your question: ')
    result = qa.invoke(question)
    print(f"Answer: {result}")
45 changes: 45 additions & 0 deletions misc/genai_rag3.py
@@ -0,0 +1,45 @@
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain

llm = Ollama(model="mistral", base_url="http://127.0.0.1:11434")


embed_model = OllamaEmbeddings(
    model="mistral",
    base_url="http://127.0.0.1:11434",
)


text = """
In the lush canopy of a tropical rainforest, two mischievous monkeys, Coco and Mango, swung from branch to branch, their playful antics echoing through the trees. They were inseparable companions, sharing everything from juicy fruits to secret hideouts high above the forest floor. One day, while exploring a new part of the forest, Coco stumbled upon a beautiful orchid hidden among the foliage. Entranced by its delicate petals, Coco plucked it and presented it to Mango with a wide grin. Overwhelmed by Coco's gesture of friendship, Mango hugged Coco tightly, cherishing the bond they shared. From that day on, Coco and Mango ventured through the forest together, their friendship growing stronger with each passing adventure. As they watched the sun dip below the horizon, casting a golden glow over the treetops, they knew that no matter what challenges lay ahead, they would always have each other, and their hearts brimmed with joy.
"""

text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
chunks = text_splitter.split_text(text)

vector_store = Chroma.from_texts(chunks, embed_model)
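# note: without a persist_directory, this Chroma collection is in-memory only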


retriever = vector_store.as_retriever()


retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

response = retrieval_chain.invoke({"input": "Tell me the names of the monkeys and where they live"})
print(response['answer'])

46 changes: 46 additions & 0 deletions misc/genai_rag4.py
@@ -0,0 +1,46 @@
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain

llm = Ollama(model="mistral", base_url="http://127.0.0.1:11434")


embed_model = OllamaEmbeddings(
    model="mistral",
    base_url="http://127.0.0.1:11434",
)


text = """
In the lush canopy of a tropical rainforest, two mischievous monkeys, Coco and Mango, swung from branch to branch, their playful antics echoing through the trees. They were inseparable companions, sharing everything from juicy fruits to secret hideouts high above the forest floor. One day, while exploring a new part of the forest, Coco stumbled upon a beautiful orchid hidden among the foliage. Entranced by its delicate petals, Coco plucked it and presented it to Mango with a wide grin. Overwhelmed by Coco's gesture of friendship, Mango hugged Coco tightly, cherishing the bond they shared. From that day on, Coco and Mango ventured through the forest together, their friendship growing stronger with each passing adventure. As they watched the sun dip below the horizon, casting a golden glow over the treetops, they knew that no matter what challenges lay ahead, they would always have each other, and their hearts brimmed with joy.
"""

text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
chunks = text_splitter.split_text(text)

vector_store = Chroma.from_texts(chunks, embed_model)


retriever = vector_store.as_retriever()


retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

response = retrieval_chain.invoke({"input": "Tell me the names of the monkeys and where they live"})
print(response['answer'])


62 changes: 62 additions & 0 deletions misc/genai_rag5.py
@@ -0,0 +1,62 @@
import sys
import PyPDF2
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain

llm = Ollama(model="mistral", base_url="http://127.0.0.1:11434")


pdf_path = sys.argv[1]
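# usage: python misc/genai_rag5.py <path-to-pdf>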

# Function to extract text from a PDF file on disk
def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PyPDF2.PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text()
    return text



embed_model = OllamaEmbeddings(
    model="mistral",
    base_url="http://127.0.0.1:11434",
)


text = extract_text_from_pdf(pdf_path)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
chunks = text_splitter.split_text(text)

vector_store = Chroma.from_texts(chunks, embed_model)


retriever = vector_store.as_retriever()


retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# ask a question about the loaded PDF (create_retrieval_chain expects an "input" key)
response = retrieval_chain.invoke({"input": "What is this document about?"})
print(response['answer'])
44 changes: 44 additions & 0 deletions misc/genai_rag6.py
@@ -0,0 +1,44 @@
import fitz  # PyMuPDF
import torch
from transformers import RagTokenizer, RagTokenForGeneration

def extract_text_from_pdf(pdf_path):
    """Extracts text from a given PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

def setup_rag_model():
    """Sets up the RAG tokenizer and model."""
    # note: the first call downloads the pretrained facebook/rag-token-nq weights
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
    model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")
    return tokenizer, model

def answer_question(question, context, tokenizer, model):
    """Generates an answer to the question based on the context using RAG."""
    # With no RagRetriever attached, generate() expects pre-tokenized context
    # documents plus doc_scores; here the PDF text serves as the single "document".
    context_inputs = tokenizer.generator(context + " // " + question, return_tensors="pt",
                                         truncation=True, max_length=512)
    output_ids = model.generate(context_input_ids=context_inputs["input_ids"],
                                context_attention_mask=context_inputs["attention_mask"],
                                doc_scores=torch.tensor([[1.0]]), n_docs=1)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

# Example usage
pdf_path = 'sample.pdf' # Path to your PDF file
context = extract_text_from_pdf(pdf_path)
tokenizer, model = setup_rag_model()
question = "What is the main topic of the document?"
answer = answer_question(question, context, tokenizer, model)
print("Answer:", answer)


### Running the Example
# 1. Replace 'sample.pdf' with the path to your PDF file.
# 2. Make sure you have a question that relates to the content of the PDF.
# 3. Execute the script.

### How It Works
#- **PDF Text Extraction**: The `extract_text_from_pdf` function reads the PDF and extracts all text from it. This text serves as the context for generating answers.
#- **Model Setup**: The `setup_rag_model` function loads the pre-trained RAG tokenizer and model.
#- **Answer Generation**: The `answer_question` function uses the model and tokenizer to generate an answer to the input question based on the extracted PDF text.
