Finish refining rag code.
bhancockio committed Jun 20, 2024
1 parent 31a1a4f commit 34fada1
Showing 6 changed files with 38 additions and 32 deletions.
17 changes: 8 additions & 9 deletions 4_rag/3_rag_text_splitting_deep_dive.py
@@ -1,8 +1,5 @@
 import os
 
-# Set environment variable to suppress tokenizer parallelism warning
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
 from langchain.text_splitter import (
     CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
@@ -16,7 +13,7 @@
 
 # Define the directory containing the text file
 current_dir = os.path.dirname(os.path.abspath(__file__))
-file_path = os.path.join(current_dir, "books/odyssey.txt")
+file_path = os.path.join(current_dir, "books", "romeo_and_juliet.txt")
 db_dir = os.path.join(current_dir, "db")
 
 # Check if the text file exists
@@ -31,7 +28,7 @@
 
 # Define the embedding model
 embeddings = OpenAIEmbeddings(
-    model="text-embedding-ada-002"
+    model="text-embedding-3-small"
 )  # Update to a valid embedding model if needed
 
 
@@ -45,7 +42,8 @@ def create_vector_store(docs, store_name):
         )
         print(f"--- Finished creating vector store {store_name} ---")
     else:
-        print(f"Vector store {store_name} already exists. No need to initialize.")
+        print(
+            f"Vector store {store_name} already exists. No need to initialize.")
 
 
 # 1. Character-based Splitting
@@ -76,7 +74,8 @@ def create_vector_store(docs, store_name):
 # Attempts to split text at natural boundaries (sentences, paragraphs) within character limit.
 # Balances between maintaining coherence and adhering to character limits.
 print("\n--- Using Recursive Character-based Splitting ---")
-rec_char_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+rec_char_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1000, chunk_overlap=100)
 rec_char_docs = rec_char_splitter.split_documents(documents)
 create_vector_store(rec_char_docs, "chroma_db_rec_char")
 
@@ -107,7 +106,7 @@ def query_vector_store(store_name, query):
     )
     retriever = db.as_retriever(
         search_type="similarity_score_threshold",
-        search_kwargs={"k": 1, "score_threshold": 0.75},
+        search_kwargs={"k": 1, "score_threshold": 0.1},
     )
     relevant_docs = retriever.invoke(query)
     # Display the relevant results with metadata
@@ -121,7 +120,7 @@ def query_vector_store(store_name, query):
 
 
 # Define the user's question
-query = "Who is Odysseus' wife?"
+query = "How did Juliet die?"
 
 # Query each vector store
 query_vector_store("chroma_db_char", query)
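The recursive-splitter change above only rewraps the constructor call, but the splitter itself is the interesting part: it tries paragraph breaks, then line breaks, then spaces before falling back to raw characters. A minimal sketch of that behaviour in isolation (the sample text and the small 100/20 sizes are illustrative, not from the script):

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Illustrative sizes; the script above uses chunk_size=1000, chunk_overlap=100.
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)

text = (
    "Two households, both alike in dignity,\n"
    "In fair Verona, where we lay our scene,\n\n"
    "From ancient grudge break to new mutiny,\n"
    "Where civil blood makes civil hands unclean."
)

# The default separators are "\n\n", "\n", " ", then "": the splitter works
# down that list until each chunk fits within chunk_size, overlapping
# consecutive chunks by chunk_overlap characters.
for i, chunk in enumerate(splitter.split_text(text)):
    print(f"Chunk {i} ({len(chunk)} chars): {chunk!r}")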
17 changes: 7 additions & 10 deletions 4_rag/4_rag_embedding_deep_dive.py
@@ -1,8 +1,5 @@
 import os
 
-# Set environment variable to suppress tokenizer parallelism warning
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.document_loaders import TextLoader
@@ -11,7 +8,7 @@
 
 # Define the directory containing the text file and the persistent directory
 current_dir = os.path.dirname(os.path.abspath(__file__))
-file_path = os.path.join(current_dir, "books/odyssey.txt")
+file_path = os.path.join(current_dir, "books", "odyssey.txt")
 db_dir = os.path.join(current_dir, "db")
 
 # Check if the text file exists
@@ -39,10 +36,12 @@ def create_vector_store(docs, embeddings, store_name):
     persistent_directory = os.path.join(db_dir, store_name)
     if not os.path.exists(persistent_directory):
         print(f"\n--- Creating vector store {store_name} ---")
-        Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
+        Chroma.from_documents(
+            docs, embeddings, persist_directory=persistent_directory)
         print(f"--- Finished creating vector store {store_name} ---")
     else:
-        print(f"Vector store {store_name} already exists. No need to initialize.")
+        print(
+            f"Vector store {store_name} already exists. No need to initialize.")
 
 
 # 1. OpenAI Embeddings
@@ -78,10 +77,8 @@ def query_vector_store(store_name, query, embedding_function):
         embedding_function=embedding_function,
     )
     retriever = db.as_retriever(
-        search_type="similarity",
-        search_kwargs={
-            "k": 3,
-        },
+        search_type="similarity_score_threshold",
+        search_kwargs={"k": 3, "score_threshold": 0.1},
    )
     relevant_docs = retriever.invoke(query)
     # Display the relevant results with metadata
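This file builds one Chroma store per embedding model, and the retriever change switches the comparison query from plain top-k similarity to similarity_score_threshold with a low 0.1 cutoff. A minimal sketch of what the two embedding functions actually return, using the OpenAI model named in the script and, as an assumption, a typical sentence-transformers model for the Hugging Face side:

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

sentence = "Sing to me of the man, Muse, the man of twists and turns."

# Both wrappers expose embed_query, which returns a plain list of floats.
openai_vec = OpenAIEmbeddings(model="text-embedding-3-small").embed_query(sentence)
hf_vec = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"  # assumed model
).embed_query(sentence)

# The vectors are not interchangeable -- dimensions and score scales differ --
# which is why the script keeps a separate Chroma store per embedding model.
print(len(openai_vec))  # 1536 for text-embedding-3-small
print(len(hf_vec))      # 384 for all-MiniLM-L6-v2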
13 changes: 9 additions & 4 deletions 4_rag/5_rag_retriever_deep_dive.py
@@ -1,8 +1,11 @@
 import os
 
+from dotenv import load_dotenv
 from langchain_community.vectorstores import Chroma
 from langchain_openai import OpenAIEmbeddings
 
+load_dotenv()
+
 # Define the persistent directory
 current_dir = os.path.dirname(os.path.abspath(__file__))
 db_dir = os.path.join(current_dir, "db")
@@ -12,7 +15,8 @@
 embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
 
 # Load the existing vector store with the embedding function
-db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
+db = Chroma(persist_directory=persistent_directory,
+            embedding_function=embeddings)
 
 
 # Function to query a vector store with different search types and parameters
@@ -41,7 +45,7 @@ def query_vector_store(
 
 
 # Define the user's question
-query = "Who is the main character in Moby Dick?"
+query = "How did Juliet die?"
 
 # Showcase different retrieval methods
 
@@ -50,7 +54,8 @@
 # It finds the most similar documents to the query vector based on cosine similarity.
 # Use this when you want to retrieve the top k most similar documents.
 print("\n--- Using Similarity Search ---")
-query_vector_store("chroma_db_with_metadata", query, embeddings, "similarity", {"k": 3})
+query_vector_store("chroma_db_with_metadata", query,
+                   embeddings, "similarity", {"k": 3})
 
 # 2. Max Marginal Relevance (MMR)
 # This method balances between selecting documents that are relevant to the query and diverse among themselves.
@@ -79,7 +84,7 @@ def query_vector_store(
     query,
     embeddings,
     "similarity_score_threshold",
-    {"k": 3, "score_threshold": 0.7},
+    {"k": 3, "score_threshold": 0.1},
 )
 
 print("Querying demonstrations with different search types completed.")
12 changes: 7 additions & 5 deletions 4_rag/6_rag_one_off_question.py
@@ -10,16 +10,18 @@
 
 # Define the persistent directory
 current_dir = os.path.dirname(os.path.abspath(__file__))
-persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")
+persistent_directory = os.path.join(
+    current_dir, "db", "chroma_db_with_metadata")
 
 # Define the embedding model
 embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
 
 # Load the existing vector store with the embedding function
-db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
+db = Chroma(persist_directory=persistent_directory,
+            embedding_function=embeddings)
 
 # Define the user's question
-query = "Who is tom brady?"
+query = "How can I learn more about LangChain?"
 
 # Retrieve relevant documents based on the query
 retriever = db.as_retriever(
@@ -56,7 +58,7 @@
 
 # Display the full result and content only
 print("\n--- Generated Response ---")
-print("Full result:")
-print(result)
+# print("Full result:")
+# print(result)
 print("Content only:")
 print(result.content)
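The change at the bottom reflects what a LangChain chat model returns: an AIMessage whose content attribute holds the generated text, while printing the message itself dumps response metadata too. A minimal standalone sketch (the gpt-4o model name is an assumption; the actual instantiation sits outside the hunk shown):

from langchain_openai import ChatOpenAI

# Assumed model; the real instantiation is outside the hunk shown above.
model = ChatOpenAI(model="gpt-4o")

result = model.invoke("Answer in one word: what library is this course about?")

# result is an AIMessage; printing it whole dumps content plus response
# metadata, while .content is just the generated text -- which is why the
# commit comments out the full dump.
print(type(result).__name__)  # AIMessage
print(result.content)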
File renamed without changes.
11 changes: 7 additions & 4 deletions 4_rag/8_rag_web_scrape_firecrawl.py
@@ -24,7 +24,8 @@ def create_vector_store():
 
     # Step 1: Crawl the website using FireCrawlLoader
     print("Begin crawling the website...")
-    loader = FireCrawlLoader(api_key=api_key, url="https://apple.com", mode="scrape")
+    loader = FireCrawlLoader(
+        api_key=api_key, url="https://apple.com", mode="scrape")
     docs = loader.load()
     print("Finished crawling the website.")
 
@@ -58,11 +59,13 @@ def create_vector_store():
 if not os.path.exists(persistent_directory):
     create_vector_store()
 else:
-    print(f"Vector store {persistent_directory} already exists. No need to initialize.")
+    print(
+        f"Vector store {persistent_directory} already exists. No need to initialize.")
 
 # Load the vector store with the embeddings
 embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
-db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
+db = Chroma(persist_directory=persistent_directory,
+            embedding_function=embeddings)
 
 
 # Step 5: Query the vector store
@@ -86,7 +89,7 @@ def query_vector_store(query):
 
 
 # Define the user's question
-query = "WWDC24?"
+query = "Apple Intelligence?"
 
 # Query the vector store with the user's question
 query_vector_store(query)
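For reference, in scrape mode FireCrawlLoader fetches only the single URL rather than crawling outward from it. A minimal sketch of the loader in isolation (assuming the API key lives in a FIRECRAWL_API_KEY environment variable, as the api_key variable in the script suggests; the URL follows the script):

import os

from langchain_community.document_loaders import FireCrawlLoader

# mode="scrape" fetches only the given page; mode="crawl" would follow links
# and return one Document per crawled page.
loader = FireCrawlLoader(
    api_key=os.getenv("FIRECRAWL_API_KEY"),
    url="https://apple.com",
    mode="scrape",
)
docs = loader.load()

# Each Document carries the page text plus FireCrawl metadata (title,
# source URL, ...), all of which ends up in the Chroma store above.
for doc in docs:
    print(doc.metadata.get("title"), len(doc.page_content))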
