Commit: Rag 1 through 3 done
bhancockio committed Jun 6, 2024
1 parent b596f3c commit 666963d
Showing 26 changed files with 232,809 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,2 +1,2 @@
 .env
-chroma_db/
+db/
72 changes: 0 additions & 72 deletions 4_rag/1_rag_basics.py

This file was deleted.

49 changes: 49 additions & 0 deletions 4_rag/1a_rag_basics.py
@@ -0,0 +1,49 @@
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text file and the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(current_dir, "books/odyssey.txt")
persistent_directory = os.path.join(current_dir, "db", "chroma_db")

# Check if the Chroma vector store already exists
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the text file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The file {file_path} does not exist. Please check the path."
        )

    # Read the text content from the file
    loader = TextLoader(file_path)
    documents = loader.load()

    # Split the document into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")
    print(f"Sample chunk:\n{docs[0].page_content}\n")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it automatically
    print("\n--- Creating vector store ---")
    db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating vector store ---")

else:
    print("Vector store already exists. No need to initialize.")
31 changes: 31 additions & 0 deletions 4_rag/1b_rag_basics.py
@@ -0,0 +1,31 @@
import os

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
persistent_directory = os.path.join(current_dir, "db", "chroma_db")

# Define the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing vector store with the embedding function
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

# Define the user's question
query = "Who is Odysseus' wife?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.7},
)
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
5 changes: 0 additions & 5 deletions 4_rag/2_rag_basics_metadata.py

This file was deleted.

61 changes: 61 additions & 0 deletions 4_rag/2a_rag_basics_metadata.py
@@ -0,0 +1,61 @@
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text files and the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
books_dir = os.path.join(current_dir, "books")
persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")

print(f"Books directory: {books_dir}")
print(f"Persistent directory: {persistent_directory}")

# Check if the Chroma vector store already exists
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the books directory exists
    if not os.path.exists(books_dir):
        raise FileNotFoundError(
            f"The directory {books_dir} does not exist. Please check the path."
        )

    # List all text files in the directory
    book_files = [f for f in os.listdir(books_dir) if f.endswith(".txt")]

    # Read the text content from each file and store it with metadata
    documents = []
    for book_file in book_files:
        file_path = os.path.join(books_dir, book_file)
        loader = TextLoader(file_path)
        book_docs = loader.load()
        for doc in book_docs:
            # Add metadata to each document indicating its source
            doc.metadata = {"source": book_file}
            documents.append(doc)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it
    print("\n--- Creating and persisting vector store ---")
    db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating and persisting vector store ---")

else:
    print("Vector store already exists. No need to initialize.")
30 changes: 30 additions & 0 deletions 4_rag/2b_rag_basics_metadata.py
@@ -0,0 +1,30 @@
import os

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")

# Define the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Load the existing vector store with the embedding function
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

# Define the user's question
query = "Who is the main character in moby dick?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.7},
)
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    print(f"Source: {doc.metadata['source']}\n")
131 changes: 130 additions & 1 deletion 4_rag/3_rag_text_splitting_deep_dive.py
@@ -1 +1,130 @@
-# Multipe inputs + Metadata
import os

# Set environment variable to suppress tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text file
current_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(current_dir, "books/odyssey.txt")

# Check if the text file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"The file {file_path} does not exist. Please check the path."
    )

# Read the text content from the file
loader = TextLoader(file_path)
documents = loader.load()

# Define the embedding model
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002"
)  # Update to a valid embedding model if needed

# Function to create and persist vector store
def create_vector_store(docs, store_name):
    persistent_directory = os.path.join(current_dir, "db", store_name)
    if not os.path.exists(persistent_directory):
        print(f"\n--- Creating vector store {store_name} ---")
        db = Chroma.from_documents(
            docs, embeddings, persist_directory=persistent_directory
        )
        print(f"--- Finished creating vector store {store_name} ---")
    else:
        print(f"Vector store {store_name} already exists. No need to initialize.")


# 1. Character-based Splitting
# Splits text into chunks based on a specified number of characters.
# Useful for consistent chunk sizes regardless of content structure.
print("\n--- Using Character-based Splitting ---")
char_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
char_docs = char_splitter.split_documents(documents)
create_vector_store(char_docs, "chroma_db_char")
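
# Note: CharacterTextSplitter packs pieces produced by its separator (default
# "\n\n"), so a single paragraph longer than chunk_size yields an oversized
# chunk; LangChain logs a "longer than the specified" warning when that happens.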

# 2. Sentence-based Splitting
# Splits text into token-sized chunks using a sentence-transformers tokenizer;
# despite the name, chunks are bounded by token count, not sentence endings.
# A good fit when chunks will be embedded with a sentence-transformers model.
print("\n--- Using Sentence-based Splitting ---")
sent_splitter = SentenceTransformersTokenTextSplitter(chunk_size=1000)
sent_docs = sent_splitter.split_documents(documents)
create_vector_store(sent_docs, "chroma_db_sent")

# 3. Token-based Splitting
# Splits text into chunks based on tokens (words or subwords), using a
# GPT-2-style tokenizer.
# Useful for transformer models with strict token limits.
print("\n--- Using Token-based Splitting ---")
token_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=512)
token_docs = token_splitter.split_documents(documents)
create_vector_store(token_docs, "chroma_db_token")

# 4. Recursive Character-based Splitting
# Attempts to split text at natural boundaries (sentences, paragraphs) within character limit.
# Balances between maintaining coherence and adhering to character limits.
print("\n--- Using Recursive Character-based Splitting ---")
rec_char_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
rec_char_docs = rec_char_splitter.split_documents(documents)
create_vector_store(rec_char_docs, "chroma_db_rec_char")
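
# Under the hood, RecursiveCharacterTextSplitter tries a separator hierarchy
# (["\n\n", "\n", " ", ""] by default), only falling back to a finer separator
# when a piece still exceeds chunk_size, which is why its chunks tend to end
# at paragraph or sentence boundaries.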

# 5. Custom Splitting
# Allows creating custom splitting logic based on specific requirements.
# Useful for documents with unique structure that standard splitters can't handle.
print("\n--- Using Custom Splitting ---")


class CustomTextSplitter(TextSplitter):
    def split_text(self, text):
        # Custom logic for splitting text
        return text.split("\n\n")  # Example: split by paragraphs


custom_splitter = CustomTextSplitter()
custom_docs = custom_splitter.split_documents(documents)
create_vector_store(custom_docs, "chroma_db_custom")


# Function to query a vector store
def query_vector_store(store_name, query):
    persistent_directory = os.path.join(current_dir, "db", store_name)
    if os.path.exists(persistent_directory):
        print(f"\n--- Querying the Vector Store {store_name} ---")
        db = Chroma(
            persist_directory=persistent_directory, embedding_function=embeddings
        )
        retriever = db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 1, "score_threshold": 0.75},
        )
        relevant_docs = retriever.invoke(query)

        # Display the relevant results with metadata
        print(f"\n--- Relevant Documents for {store_name} ---")
        for i, doc in enumerate(relevant_docs, 1):
            print(f"Document {i}:\n{doc.page_content}\n")
            if doc.metadata:
                print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
    else:
        print(f"Vector store {store_name} does not exist.")


# Define the user's question
query = "Who is Odysseus' wife?"

# Query each vector store
query_vector_store("chroma_db_char", query)
query_vector_store("chroma_db_sent", query)
query_vector_store("chroma_db_token", query)
query_vector_store("chroma_db_rec_char", query)
query_vector_store("chroma_db_custom", query)