Finish refining rag code.
bhancockio committed Jun 20, 2024
1 parent 31a1a4f commit 34fada1
Showing 6 changed files with 38 additions and 32 deletions.
17 changes: 8 additions & 9 deletions 4_rag/3_rag_text_splitting_deep_dive.py
@@ -1,8 +1,5 @@
 import os
 
-# Set environment variable to suppress tokenizer parallelism warning
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
 from langchain.text_splitter import (
     CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
@@ -16,7 +13,7 @@
 
 # Define the directory containing the text file
 current_dir = os.path.dirname(os.path.abspath(__file__))
-file_path = os.path.join(current_dir, "books/odyssey.txt")
+file_path = os.path.join(current_dir, "books", "romeo_and_juliet.txt")
 db_dir = os.path.join(current_dir, "db")
 
 # Check if the text file exists
@@ -31,7 +28,7 @@
 
 # Define the embedding model
 embeddings = OpenAIEmbeddings(
-    model="text-embedding-ada-002"
+    model="text-embedding-3-small"
 )  # Update to a valid embedding model if needed
 
 
@@ -45,7 +42,8 @@ def create_vector_store(docs, store_name):
         )
         print(f"--- Finished creating vector store {store_name} ---")
     else:
-        print(f"Vector store {store_name} already exists. No need to initialize.")
+        print(
+            f"Vector store {store_name} already exists. No need to initialize.")
 
 
 # 1. Character-based Splitting
@@ -76,7 +74,8 @@ def create_vector_store(docs, store_name):
 # Attempts to split text at natural boundaries (sentences, paragraphs) within character limit.
 # Balances between maintaining coherence and adhering to character limits.
 print("\n--- Using Recursive Character-based Splitting ---")
-rec_char_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+rec_char_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1000, chunk_overlap=100)
 rec_char_docs = rec_char_splitter.split_documents(documents)
 create_vector_store(rec_char_docs, "chroma_db_rec_char")
 
@@ -107,7 +106,7 @@ def query_vector_store(store_name, query):
     )
     retriever = db.as_retriever(
         search_type="similarity_score_threshold",
-        search_kwargs={"k": 1, "score_threshold": 0.75},
+        search_kwargs={"k": 1, "score_threshold": 0.1},
     )
     relevant_docs = retriever.invoke(query)
     # Display the relevant results with metadata
@@ -121,7 +120,7 @@ def query_vector_store(store_name, query):
 
 
 # Define the user's question
-query = "Who is Odysseus' wife?"
+query = "How did Juliet die?"
 
 # Query each vector store
 query_vector_store("chroma_db_char", query)
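The recursive-splitter change above only rewraps the constructor call, but the splitter itself is the interesting part: it tries paragraph breaks, then line breaks, then spaces before falling back to raw characters. A minimal sketch of that behaviour in isolation (the sample text and the small 100/20 sizes are illustrative, not from the script):

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Illustrative sizes; the script above uses chunk_size=1000, chunk_overlap=100.
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)

text = (
    "Two households, both alike in dignity,\n"
    "In fair Verona, where we lay our scene,\n\n"
    "From ancient grudge break to new mutiny,\n"
    "Where civil blood makes civil hands unclean."
)

# The default separators are "\n\n", "\n", " ", then "": the splitter works
# down that list until each chunk fits within chunk_size, overlapping
# consecutive chunks by chunk_overlap characters.
for i, chunk in enumerate(splitter.split_text(text)):
    print(f"Chunk {i} ({len(chunk)} chars): {chunk!r}")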
17 changes: 7 additions & 10 deletions 4_rag/4_rag_embedding_deep_dive.py
@@ -1,8 +1,5 @@
 import os
 
-# Set environment variable to suppress tokenizer parallelism warning
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.document_loaders import TextLoader
@@ -11,7 +8,7 @@
 
 # Define the directory containing the text file and the persistent directory
 current_dir = os.path.dirname(os.path.abspath(__file__))
-file_path = os.path.join(current_dir, "books/odyssey.txt")
+file_path = os.path.join(current_dir, "books", "odyssey.txt")
 db_dir = os.path.join(current_dir, "db")
 
 # Check if the text file exists
@@ -39,10 +36,12 @@ def create_vector_store(docs, embeddings, store_name):
     persistent_directory = os.path.join(db_dir, store_name)
     if not os.path.exists(persistent_directory):
         print(f"\n--- Creating vector store {store_name} ---")
-        Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
+        Chroma.from_documents(
+            docs, embeddings, persist_directory=persistent_directory)
         print(f"--- Finished creating vector store {store_name} ---")
     else:
-        print(f"Vector store {store_name} already exists. No need to initialize.")
+        print(
+            f"Vector store {store_name} already exists. No need to initialize.")
 
 
 # 1. OpenAI Embeddings
@@ -78,10 +77,8 @@ def query_vector_store(store_name, query, embedding_function):
         embedding_function=embedding_function,
     )
     retriever = db.as_retriever(
-        search_type="similarity",
-        search_kwargs={
-            "k": 3,
-        },
+        search_type="similarity_score_threshold",
+        search_kwargs={"k": 3, "score_threshold": 0.1},
    )
     relevant_docs = retriever.invoke(query)
     # Display the relevant results with metadata
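This file builds one Chroma store per embedding model, and the retriever change switches the comparison query from plain top-k similarity to similarity_score_threshold with a low 0.1 cutoff. A minimal sketch of what the two embedding functions actually return, using the OpenAI model named in the script and, as an assumption, a typical sentence-transformers model for the Hugging Face side:

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

sentence = "Sing to me of the man, Muse, the man of twists and turns."

# Both wrappers expose embed_query, which returns a plain list of floats.
openai_vec = OpenAIEmbeddings(model="text-embedding-3-small").embed_query(sentence)
hf_vec = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"  # assumed model
).embed_query(sentence)

# The vectors are not interchangeable -- dimensions and score scales differ --
# which is why the script keeps a separate Chroma store per embedding model.
print(len(openai_vec))  # 1536 for text-embedding-3-small
print(len(hf_vec))      # 384 for all-MiniLM-L6-v2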
13 changes: 9 additions & 4 deletions 4_rag/5_rag_retriever_deep_dive.py
@@ -1,8 +1,11 @@
 import os
 
+from dotenv import load_dotenv
 from langchain_community.vectorstores import Chroma
 from langchain_openai import OpenAIEmbeddings
 
+load_dotenv()
+
 # Define the persistent directory
 current_dir = os.path.dirname(os.path.abspath(__file__))
 db_dir = os.path.join(current_dir, "db")
@@ -12,7 +15,8 @@
 embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
 
 # Load the existing vector store with the embedding function
-db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
+db = Chroma(persist_directory=persistent_directory,
+            embedding_function=embeddings)
 
 
 # Function to query a vector store with different search types and parameters
@@ -41,7 +45,7 @@ def query_vector_store(
 
 
 # Define the user's question
-query = "Who is the main character in Moby Dick?"
+query = "How did Juliet die?"
 
 # Showcase different retrieval methods
 
@@ -50,7 +54,8 @@
 # It finds the most similar documents to the query vector based on cosine similarity.
 # Use this when you want to retrieve the top k most similar documents.
 print("\n--- Using Similarity Search ---")
-query_vector_store("chroma_db_with_metadata", query, embeddings, "similarity", {"k": 3})
+query_vector_store("chroma_db_with_metadata", query,
+                   embeddings, "similarity", {"k": 3})
 
 # 2. Max Marginal Relevance (MMR)
 # This method balances between selecting documents that are relevant to the query and diverse among themselves.
@@ -79,7 +84,7 @@ def query_vector_store(
     query,
     embeddings,
     "similarity_score_threshold",
-    {"k": 3, "score_threshold": 0.7},
+    {"k": 3, "score_threshold": 0.1},
 )
 
 print("Querying demonstrations with different search types completed.")
12 changes: 7 additions & 5 deletions 4_rag/6_rag_one_off_question.py
@@ -10,16 +10,18 @@
 
 # Define the persistent directory
 current_dir = os.path.dirname(os.path.abspath(__file__))
-persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")
+persistent_directory = os.path.join(
+    current_dir, "db", "chroma_db_with_metadata")
 
 # Define the embedding model
 embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
 
 # Load the existing vector store with the embedding function
-db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
+db = Chroma(persist_directory=persistent_directory,
+            embedding_function=embeddings)
 
 # Define the user's question
-query = "Who is tom brady?"
+query = "How can I learn more about LangChain?"
 
 # Retrieve relevant documents based on the query
 retriever = db.as_retriever(
@@ -56,7 +58,7 @@
 
 # Display the full result and content only
 print("\n--- Generated Response ---")
-print("Full result:")
-print(result)
+# print("Full result:")
+# print(result)
 print("Content only:")
 print(result.content)
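The change at the bottom reflects what a LangChain chat model returns: an AIMessage whose content attribute holds the generated text, while printing the message itself dumps response metadata too. A minimal standalone sketch (the gpt-4o model name is an assumption; the actual instantiation sits outside the hunk shown):

from langchain_openai import ChatOpenAI

# Assumed model; the real instantiation is outside the hunk shown above.
model = ChatOpenAI(model="gpt-4o")

result = model.invoke("Answer in one word: what library is this course about?")

# result is an AIMessage; printing it whole dumps content plus response
# metadata, while .content is just the generated text -- which is why the
# commit comments out the full dump.
print(type(result).__name__)  # AIMessage
print(result.content)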
File renamed without changes.
11 changes: 7 additions & 4 deletions 4_rag/8_rag_web_scrape_firecrawl.py
@@ -24,7 +24,8 @@ def create_vector_store():
 
     # Step 1: Crawl the website using FireCrawlLoader
     print("Begin crawling the website...")
-    loader = FireCrawlLoader(api_key=api_key, url="https://apple.com", mode="scrape")
+    loader = FireCrawlLoader(
+        api_key=api_key, url="https://apple.com", mode="scrape")
     docs = loader.load()
     print("Finished crawling the website.")
 
@@ -58,11 +59,13 @@ def create_vector_store():
 if not os.path.exists(persistent_directory):
     create_vector_store()
 else:
-    print(f"Vector store {persistent_directory} already exists. No need to initialize.")
+    print(
+        f"Vector store {persistent_directory} already exists. No need to initialize.")
 
 # Load the vector store with the embeddings
 embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
-db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
+db = Chroma(persist_directory=persistent_directory,
+            embedding_function=embeddings)
 
 
 # Step 5: Query the vector store
@@ -86,7 +89,7 @@ def query_vector_store(query):
 
 
 # Define the user's question
-query = "WWDC24?"
+query = "Apple Intelligence?"
 
 # Query the vector store with the user's question
 query_vector_store(query)
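For reference, in scrape mode FireCrawlLoader fetches only the single URL rather than crawling outward from it. A minimal sketch of the loader in isolation (assuming the API key lives in a FIRECRAWL_API_KEY environment variable, as the api_key variable in the script suggests; the URL follows the script):

import os

from langchain_community.document_loaders import FireCrawlLoader

# mode="scrape" fetches only the given page; mode="crawl" would follow links
# and return one Document per crawled page.
loader = FireCrawlLoader(
    api_key=os.getenv("FIRECRAWL_API_KEY"),
    url="https://apple.com",
    mode="scrape",
)
docs = loader.load()

# Each Document carries the page text plus FireCrawl metadata (title,
# source URL, ...), all of which ends up in the Chroma store above.
for doc in docs:
    print(doc.metadata.get("title"), len(doc.page_content))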
