forked from bhancockio/langchain-crash-course
Commit
1 parent b596f3c · commit 666963d
Showing 26 changed files with 232,809 additions and 81 deletions.
@@ -1,2 +1,2 @@
 .env
-chroma_db/
+db/
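The ignored .env file is where these scripts expect the OpenAI API key to live. A minimal sketch of loading it before running any of the files below, assuming the python-dotenv package (an assumption; exporting OPENAI_API_KEY in the shell works just as well):

# Hypothetical setup snippet, not part of this commit: load OPENAI_API_KEY
# from the git-ignored .env file.
import os

from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from .env into os.environ
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY is not set"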
This file was deleted.
@@ -0,0 +1,49 @@
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text file and the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(current_dir, "books/odyssey.txt")
persistent_directory = os.path.join(current_dir, "db", "chroma_db")

# Check if the Chroma vector store already exists
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the text file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The file {file_path} does not exist. Please check the path."
        )

    # Read the text content from the file
    loader = TextLoader(file_path)
    documents = loader.load()

    # Split the document into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")
    print(f"Sample chunk:\n{docs[0].page_content}\n")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it automatically
    print("\n--- Creating vector store ---")
    db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating vector store ---")

else:
    print("Vector store already exists. No need to initialize.")
@@ -0,0 +1,31 @@
import os

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
persistent_directory = os.path.join(current_dir, "db", "chroma_db")

# Define the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing vector store with the embedding function
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

# Define the user's question
query = "Who is Odysseus' wife?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.7},
)
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
This file was deleted.
@@ -0,0 +1,61 @@
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text files and the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
books_dir = os.path.join(current_dir, "books")
persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")

print(f"Books directory: {books_dir}")
print(f"Persistent directory: {persistent_directory}")

# Check if the Chroma vector store already exists
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the books directory exists
    if not os.path.exists(books_dir):
        raise FileNotFoundError(
            f"The directory {books_dir} does not exist. Please check the path."
        )

    # List all text files in the directory
    book_files = [f for f in os.listdir(books_dir) if f.endswith(".txt")]

    # Read the text content from each file and store it with metadata
    documents = []
    for book_file in book_files:
        file_path = os.path.join(books_dir, book_file)
        loader = TextLoader(file_path)
        book_docs = loader.load()
        for doc in book_docs:
            # Add metadata to each document indicating its source
            doc.metadata = {"source": book_file}
            documents.append(doc)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it
    print("\n--- Creating and persisting vector store ---")
    db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating and persisting vector store ---")

else:
    print("Vector store already exists. No need to initialize.")
@@ -0,0 +1,30 @@
import os

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")

# Define the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Load the existing vector store with the embedding function
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

# Define the user's question
query = "Who is the main character in Moby Dick?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.7},
)
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
@@ -1 +1,130 @@
-# Multipe inputs + Metadata
import os

# Set environment variable to suppress tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text file
current_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(current_dir, "books/odyssey.txt")

# Check if the text file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"The file {file_path} does not exist. Please check the path."
    )

# Read the text content from the file
loader = TextLoader(file_path)
documents = loader.load()

# Define the embedding model
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002"
)  # Update to a valid embedding model if needed


# Function to create and persist a vector store
def create_vector_store(docs, store_name):
    persistent_directory = os.path.join(current_dir, "db", store_name)
    if not os.path.exists(persistent_directory):
        print(f"\n--- Creating vector store {store_name} ---")
        db = Chroma.from_documents(
            docs, embeddings, persist_directory=persistent_directory
        )
        print(f"--- Finished creating vector store {store_name} ---")
    else:
        print(f"Vector store {store_name} already exists. No need to initialize.")


# 1. Character-based Splitting
# Splits text into chunks based on a specified number of characters.
# Useful for consistent chunk sizes regardless of content structure.
print("\n--- Using Character-based Splitting ---")
char_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
char_docs = char_splitter.split_documents(documents)
create_vector_store(char_docs, "chroma_db_char")

# 2. Sentence-based Splitting
# Splits text into token-sized chunks using a sentence-transformers tokenizer,
# so chunks fit that embedding model's sequence limit.
# (Note: this splitter is controlled by tokens_per_chunk; the chunk_size kwarg
# below is consumed by the base class and does not change the token chunking.)
print("\n--- Using Sentence-based Splitting ---")
sent_splitter = SentenceTransformersTokenTextSplitter(chunk_size=1000)
sent_docs = sent_splitter.split_documents(documents)
create_vector_store(sent_docs, "chroma_db_sent")

# 3. Token-based Splitting
# Splits text into chunks based on tokens (words or subwords), using tokenizers like GPT-2's.
# Useful for transformer models with strict token limits.
print("\n--- Using Token-based Splitting ---")
token_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=512)
token_docs = token_splitter.split_documents(documents)
create_vector_store(token_docs, "chroma_db_token")

# 4. Recursive Character-based Splitting
# Attempts to split text at natural boundaries (sentences, paragraphs) within the character limit.
# Balances maintaining coherence with adhering to character limits.
print("\n--- Using Recursive Character-based Splitting ---")
rec_char_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
rec_char_docs = rec_char_splitter.split_documents(documents)
create_vector_store(rec_char_docs, "chroma_db_rec_char")

# 5. Custom Splitting
# Allows creating custom splitting logic based on specific requirements.
# Useful for documents with a unique structure that standard splitters can't handle.
print("\n--- Using Custom Splitting ---")


class CustomTextSplitter(TextSplitter):
    def split_text(self, text):
        # Custom logic for splitting text
        return text.split("\n\n")  # Example: split by paragraphs


custom_splitter = CustomTextSplitter()
custom_docs = custom_splitter.split_documents(documents)
create_vector_store(custom_docs, "chroma_db_custom")


# Function to query a vector store
def query_vector_store(store_name, query):
    persistent_directory = os.path.join(current_dir, "db", store_name)
    if os.path.exists(persistent_directory):
        print(f"\n--- Querying the Vector Store {store_name} ---")
        db = Chroma(
            persist_directory=persistent_directory, embedding_function=embeddings
        )
        retriever = db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 1, "score_threshold": 0.75},
        )
        relevant_docs = retriever.invoke(query)
        # Display the relevant results with metadata
        print(f"\n--- Relevant Documents for {store_name} ---")
        for i, doc in enumerate(relevant_docs, 1):
            print(f"Document {i}:\n{doc.page_content}\n")
            if doc.metadata:
                print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
    else:
        print(f"Vector store {store_name} does not exist.")


# Define the user's question
query = "Who is Odysseus' wife?"

# Query each vector store
query_vector_store("chroma_db_char", query)
query_vector_store("chroma_db_sent", query)
query_vector_store("chroma_db_token", query)
query_vector_store("chroma_db_rec_char", query)
query_vector_store("chroma_db_custom", query)