Commit: Rag 1 through 3 done
bhancockio committed Jun 6, 2024
1 parent b596f3c commit 666963d
Showing 26 changed files with 232,809 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,2 +1,2 @@
 .env
-chroma_db/
+db/
72 changes: 0 additions & 72 deletions 4_rag/1_rag_basics.py

This file was deleted.

49 changes: 49 additions & 0 deletions 4_rag/1a_rag_basics.py
@@ -0,0 +1,49 @@
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text file and the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(current_dir, "books/odyssey.txt")
persistent_directory = os.path.join(current_dir, "db", "chroma_db")

# Check if the Chroma vector store already exists
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the text file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The file {file_path} does not exist. Please check the path."
        )

    # Read the text content from the file
    loader = TextLoader(file_path)
    documents = loader.load()

    # Split the document into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")
    print(f"Sample chunk:\n{docs[0].page_content}\n")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it automatically
    print("\n--- Creating vector store ---")
    db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating vector store ---")

else:
    print("Vector store already exists. No need to initialize.")
31 changes: 31 additions & 0 deletions 4_rag/1b_rag_basics.py
@@ -0,0 +1,31 @@
import os

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
persistent_directory = os.path.join(current_dir, "db", "chroma_db")

# Define the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing vector store with the embedding function
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

# Define the user's question
query = "Who is Odysseus' wife?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.7},
)
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
5 changes: 0 additions & 5 deletions 4_rag/2_rag_basics_metadata.py

This file was deleted.

61 changes: 61 additions & 0 deletions 4_rag/2a_rag_basics_metadata.py
@@ -0,0 +1,61 @@
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text files and the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
books_dir = os.path.join(current_dir, "books")
persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")

print(f"Books directory: {books_dir}")
print(f"Persistent directory: {persistent_directory}")

# Check if the Chroma vector store already exists
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the books directory exists
    if not os.path.exists(books_dir):
        raise FileNotFoundError(
            f"The directory {books_dir} does not exist. Please check the path."
        )

    # List all text files in the directory
    book_files = [f for f in os.listdir(books_dir) if f.endswith(".txt")]

    # Read the text content from each file and store it with metadata
    documents = []
    for book_file in book_files:
        file_path = os.path.join(books_dir, book_file)
        loader = TextLoader(file_path)
        book_docs = loader.load()
        for doc in book_docs:
            # Add metadata to each document indicating its source
            doc.metadata = {"source": book_file}
            documents.append(doc)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it
    print("\n--- Creating and persisting vector store ---")
    db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating and persisting vector store ---")

else:
    print("Vector store already exists. No need to initialize.")
30 changes: 30 additions & 0 deletions 4_rag/2b_rag_basics_metadata.py
@@ -0,0 +1,30 @@
import os

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the persistent directory
current_dir = os.path.dirname(os.path.abspath(__file__))
persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")

# Define the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Load the existing vector store with the embedding function
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

# Define the user's question
query = "Who is the main character in moby dick?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.7},
)
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    print(f"Source: {doc.metadata['source']}\n")
131 changes: 130 additions & 1 deletion 4_rag/3_rag_text_splitting_deep_dive.py
@@ -1 +1,130 @@
-# Multipe inputs + Metadata
import os

# Set environment variable to suppress tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text file
current_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(current_dir, "books/odyssey.txt")

# Check if the text file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"The file {file_path} does not exist. Please check the path."
    )

# Read the text content from the file
loader = TextLoader(file_path)
documents = loader.load()

# Define the embedding model
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002"
)  # Update to a valid embedding model if needed

# Function to create and persist vector store
def create_vector_store(docs, store_name):
    persistent_directory = os.path.join(current_dir, "db", store_name)
    if not os.path.exists(persistent_directory):
        print(f"\n--- Creating vector store {store_name} ---")
        db = Chroma.from_documents(
            docs, embeddings, persist_directory=persistent_directory
        )
        print(f"--- Finished creating vector store {store_name} ---")
    else:
        print(f"Vector store {store_name} already exists. No need to initialize.")


# 1. Character-based Splitting
# Splits text into chunks based on a specified number of characters.
# Useful for consistent chunk sizes regardless of content structure.
print("\n--- Using Character-based Splitting ---")
char_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
char_docs = char_splitter.split_documents(documents)
create_vector_store(char_docs, "chroma_db_char")
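
# Note: CharacterTextSplitter packs pieces produced by its separator (default
# "\n\n"), so a single paragraph longer than chunk_size yields an oversized
# chunk; LangChain logs a "longer than the specified" warning when that happens.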

# 2. Sentence-based Splitting
# Splits text into token-sized chunks using a sentence-transformers tokenizer;
# despite the name, chunks are bounded by token count, not sentence endings.
# A good fit when chunks will be embedded with a sentence-transformers model.
print("\n--- Using Sentence-based Splitting ---")
sent_splitter = SentenceTransformersTokenTextSplitter(chunk_size=1000)
sent_docs = sent_splitter.split_documents(documents)
create_vector_store(sent_docs, "chroma_db_sent")

# 3. Token-based Splitting
# Splits text into chunks based on tokens (words or subwords), using a
# GPT-2-style tokenizer.
# Useful for transformer models with strict token limits.
print("\n--- Using Token-based Splitting ---")
token_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=512)
token_docs = token_splitter.split_documents(documents)
create_vector_store(token_docs, "chroma_db_token")

# 4. Recursive Character-based Splitting
# Attempts to split text at natural boundaries (sentences, paragraphs) within character limit.
# Balances between maintaining coherence and adhering to character limits.
print("\n--- Using Recursive Character-based Splitting ---")
rec_char_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
rec_char_docs = rec_char_splitter.split_documents(documents)
create_vector_store(rec_char_docs, "chroma_db_rec_char")
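
# Under the hood, RecursiveCharacterTextSplitter tries a separator hierarchy
# (["\n\n", "\n", " ", ""] by default), only falling back to a finer separator
# when a piece still exceeds chunk_size, which is why its chunks tend to end
# at paragraph or sentence boundaries.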

# 5. Custom Splitting
# Allows creating custom splitting logic based on specific requirements.
# Useful for documents with unique structure that standard splitters can't handle.
print("\n--- Using Custom Splitting ---")


class CustomTextSplitter(TextSplitter):
    def split_text(self, text):
        # Custom logic for splitting text
        return text.split("\n\n")  # Example: split by paragraphs


custom_splitter = CustomTextSplitter()
custom_docs = custom_splitter.split_documents(documents)
create_vector_store(custom_docs, "chroma_db_custom")


# Function to query a vector store
def query_vector_store(store_name, query):
    persistent_directory = os.path.join(current_dir, "db", store_name)
    if os.path.exists(persistent_directory):
        print(f"\n--- Querying the Vector Store {store_name} ---")
        db = Chroma(
            persist_directory=persistent_directory, embedding_function=embeddings
        )
        retriever = db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 1, "score_threshold": 0.75},
        )
        relevant_docs = retriever.invoke(query)

        # Display the relevant results with metadata
        print(f"\n--- Relevant Documents for {store_name} ---")
        for i, doc in enumerate(relevant_docs, 1):
            print(f"Document {i}:\n{doc.page_content}\n")
            if doc.metadata:
                print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
    else:
        print(f"Vector store {store_name} does not exist.")


# Define the user's question
query = "Who is Odysseus' wife?"

# Query each vector store
query_vector_store("chroma_db_char", query)
query_vector_store("chroma_db_sent", query)
query_vector_store("chroma_db_token", query)
query_vector_store("chroma_db_rec_char", query)
query_vector_store("chroma_db_custom", query)