Merge pull request #6 from pinecone-io/webapp
Merge Web App flow into Main
arjunpatel7 authored Nov 12, 2024
2 parents 47e988e + 206eed0 commit 86b82e6
Showing 10 changed files with 187 additions and 1,319 deletions.
27 changes: 5 additions & 22 deletions README.md
@@ -3,6 +3,9 @@ A simplified Contextual Video RAG implementation using Pinecone, AWS, and Claude

Ever wanted to ask questions over your video data, such as YouTube videos, Zoom webinars, or recorded meetings? This application builds a RAG chatbot over that content using contextual retrieval with Pinecone, AWS, and Claude.

This branch contains the **Streamlit Web App** version of the implementation. It lets you run a local web app to interact with the RAG chatbot, and uses a Makefile to streamline the data preprocessing. Please read the following section to ensure you have the appropriate prerequisites before proceeding.

If you'd rather work in a SageMaker notebook, use the webinar-notebook branch instead!
## Before you Begin

This repo presents the RAG solution in two ways: one using scripts and a Makefile to create a Streamlit application, and another using a notebook intended for use on SageMaker.
@@ -11,24 +14,6 @@ You'll also need access to AWS Bedrock, Pinecone (via an API Key), and Claude sp

Finally, add the videos you'd like to process under a folder called `data`, with a subfolder called `videos`. Leave them in .mp4 format. If you have access to your own YouTube channel, downloading videos from the console there works perfectly!
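For example, assuming a downloaded file named `my_webinar.mp4` (a hypothetical name), the expected layout looks like this:

```
mkdir -p data/videos
cp ~/Downloads/my_webinar.mp4 data/videos/
# resulting structure:
# data/
#   videos/
#     my_webinar.mp4
```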


### Using SageMaker Notebooks

First, ensure you have the appropriate permissions to use SageMaker, Bedrock, and Bedrock inside SageMaker.

Then, create a notebook instance with the following configurations:

- a powerful compute instance; we used ml.p3.2xlarge
- a link to this public repo, so you can import all scripts (you can also fork this repo and link that instead; in that case, you will need to authenticate your access)
- the lifecycle_configuration.sh script, which installs packages on notebook startup
- a 16 GB volume size, in case you add a lot of videos

**It's extremely important to use the lifecycle config script; otherwise, you may run into compatibility issues.**

When selecting the kernel, use the conda_python3 environment.

Next, upload your data as described above (video .mp4 files under ./data/videos).

### Running the Scripts Locally

Before beginning, authenticate your session with AWS using your preferred method. You can
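For instance (a sketch of one common approach, not something this repo prescribes), you can configure a profile with the AWS CLI or export credentials for the current shell:

```
# Option 1: interactive profile setup via the AWS CLI
aws configure

# Option 2: environment variables for the current session
export AWS_ACCESS_KEY_ID=<your-access-key>
export AWS_SECRET_ACCESS_KEY=<your-secret-key>
export AWS_DEFAULT_REGION=us-east-1   # the Bedrock calls in this repo use us-east-1
```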
@@ -102,10 +87,8 @@ For more information on available commands, you can use:
```
make help
```

It's easiest to run the whole pipeline (setup) and then run the streamlit app.
From there, the streamlit app should pop up locally and you can start querying!
It's easiest to run the whole pipeline (setup) and then run the Streamlit app.
From there, the Streamlit app should pop up locally and you can start querying!
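As a rough sketch of that flow (the `setup` target is named above; the exact Streamlit target name depends on the Makefile, so the direct `streamlit run` invocation is shown as a fallback):

```
make setup                            # preprocess videos, embed, and upsert to Pinecone
streamlit run preprocessing/app.py    # launch the local web app
```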
1 change: 1 addition & 0 deletions data/videos/example.txt
@@ -0,0 +1 @@
add your videos here! .mp4 format
26 changes: 0 additions & 26 deletions lifecycle_configuration.sh

This file was deleted.

23 changes: 12 additions & 11 deletions preprocessing/app.py
@@ -1,12 +1,15 @@
import streamlit as st
import os
#from boto_testing import titan_multimodal_embedding

# from boto_testing import titan_multimodal_embedding
from upsert_vectors import titan_text_embedding
from claude_utils import ask_claude_vqa_response
from config import index_name

# Initialize Pinecone
from pinecone import Pinecone
from dotenv import load_dotenv

load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
@@ -23,29 +26,27 @@
if st.button("Query"):
if query_text:
# Embed the query
query_embedding = titan_text_embedding(text=query_text)
query_embedding = titan_text_embedding(text=query_text)

response = index.query(vector=query_embedding["embedding"], top_k=5, include_metadata=True)
response = index.query(
vector=query_embedding["embedding"], top_k=5, include_metadata=True
)

#st.write(response)
# st.write(response)
for r in response["matches"]:
with st.expander(f"Match with Score: {r['score']}"):
st.markdown(f"**Score:** {r['score']}")
st.image(r["metadata"]["filepath"], caption="Matched Image")
st.markdown(f"**Transcript:** {r['metadata']['transcript']}")
st.markdown(f"**Contextual Frame Description:** {r['metadata']['contextual_frame_description']}")
st.markdown(
f"**Contextual Frame Description:** {r['metadata']['contextual_frame_description']}"
)
st.markdown(f"**Timestamp Start:** {r['metadata']['timestamp_start']}")
st.markdown(f"**Timestamp End:** {r['metadata']['timestamp_end']}")


# ask claude for an explanation of the returned results.

claude_explanation = ask_claude_vqa_response(query_text, response["matches"])
st.markdown(f"**Claude Explanation:** {claude_explanation}")
else:
st.write("Please enter text or image path to query.")


# good queries
# The interview spoke about mad libs for robots. What is that about?
# At some point, the interviewer spoke about marrying customs. What was that about?
136 changes: 66 additions & 70 deletions preprocessing/claude_utils.py
@@ -6,7 +6,7 @@

MODEL = "anthropic.claude-3-haiku-20240307-v1:0"
MAX_TOKENS = 256
system_prompt = '''
system_prompt = """
You are an expert in the field of AI, assisting employees of an AI company in searching over their webinars and
YouTube videos. You are tasked, given a set of video screencaps, context, and transcriptions, with responding
to the user's queries and questions for information.
@@ -17,7 +17,7 @@
You should provide a response that is informative and helpful to the user, and you should refer back to the source presentations
where that information came from.
'''
"""

from anthropic import AnthropicBedrock

@@ -26,23 +26,22 @@
logging.basicConfig(level=logging.INFO)



def convert_image_to_base64(image_path):
with open(image_path, "rb") as image_file:
binary_data = image_file.read()
base_64_encoded_data = base64.b64encode(binary_data)
base64_string = base_64_encoded_data.decode('utf-8')
base64_string = base_64_encoded_data.decode("utf-8")
return base64_string


def format_messages_for_claude(user_query, vdb_response):
"""
Formats the user's query and the vector database response into a structured message for Claude.
Args:
user_query (str): The user's query.
vdb_response (list): The response from the vector database, containing images and text.
Returns:
list: A list of messages formatted for Claude.
"""
@@ -54,46 +53,50 @@ def format_messages_for_claude(user_query, vdb_response):

for item in vdb_response:
img_b64 = convert_image_to_base64(item["metadata"]["filepath"])
new_content.extend([
{
"type": "text",
"text": "Image: " + item["metadata"]["filepath"],
},
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/png",
"data": img_b64
}
},
{
"type": "text",
"text": "Contextual description: " + item["metadata"]["contextual_frame_description"]
},
{
"type": "text",
"text": "Transcript: " + item["metadata"]["transcript"]
}
])
#reassign
new_content.extend(
[
{
"type": "text",
"text": "Image: " + item["metadata"]["filepath"],
},
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/png",
"data": img_b64,
},
},
{
"type": "text",
"text": "Contextual description: "
+ item["metadata"]["contextual_frame_description"],
},
{
"type": "text",
"text": "Transcript: " + item["metadata"]["transcript"],
},
]
)
# reassign
messages[0]["content"] = new_content
return messages


def ask_claude_vqa_response(user_query, vdb_response):
"""
Sends the user's query and the vector database response to Claude and gets a response.
Args:
user_query (str): The user's query.
vdb_response (list): The response from the vector database, containing images and text.
Returns:
str: The response from Claude.
"""
client = AnthropicBedrock()
messages = format_messages_for_claude(user_query, vdb_response)
system_prompt = '''
system_prompt = """
You are a friendly assistant helping people interpret their videos at their company.
@@ -103,79 +106,75 @@ def create_contextual_frame_description(frame_caption_index, frame_caption_pairs
Refer back to the images and text provided to guide the user to the appropriate slide, section, webinar, or talk
where the information they are looking for is located.
'''
"""
response = client.messages.create(
model=MODEL,
max_tokens=MAX_TOKENS * 10,
system=system_prompt,
messages=messages
)
model=MODEL, max_tokens=MAX_TOKENS * 10, system=system_prompt, messages=messages
)
return response.content[0].text



def ask_claude(img, text):
# best for one off queries
client = AnthropicBedrock(
aws_region="us-east-1")
client = AnthropicBedrock(aws_region="us-east-1")
if img:
img_b64 = convert_image_to_base64(img)
message = client.messages.create(
model=MODEL,
max_tokens=MAX_TOKENS,
messages=[
{
"role": "user",
"content": [
{"type": "image", "source":
{
"role": "user",
"content": [
{
"type": "base64",
"media_type": "image/png",
"data": img_b64
}
},
{"type": "text", "text": text}
]
}
]
"type": "image",
"source": {
"type": "base64",
"media_type": "image/png",
"data": img_b64,
},
},
{"type": "text", "text": text},
],
}
],
)
else:
message = client.messages.create(
model=MODEL,
max_tokens=MAX_TOKENS,
messages=[{"role": "user", "content": text}]
messages=[{"role": "user", "content": text}],
)
return message.content[0].text



def make_claude_transcript_summary(transcript):
client = AnthropicBedrock(
aws_region="us-east-1")
client = AnthropicBedrock(aws_region="us-east-1")

prompt = "Summarize the following transcript, being as concise as possible:"
message = client.messages.create(
model=MODEL,
messages=[{"role": "user", "content": prompt + ": " + transcript}],
max_tokens=MAX_TOKENS
max_tokens=MAX_TOKENS,
)
return message.content[0].text

def create_contextual_frame_description(frame_caption_index, frame_caption_pairs, transcript_summary, window=60, frame_width=15):

def create_contextual_frame_description(
frame_caption_index, frame_caption_pairs, transcript_summary
):
# frame caption pair will have an image, and a transcript. Window is in seconds
client = AnthropicBedrock(
aws_region="us-east-1")
client = AnthropicBedrock(aws_region="us-east-1")

# gather context, look 4 frame widths before and after. Make sure not to go out of bounds if near beginning or end of video.
surrounding_frames = frame_caption_pairs[max(0, frame_caption_index - 4 * frame_width):frame_caption_index + 1]

# surrounding_frames = frame_caption_pairs[max(0, frame_caption_index - 4 * frame_width):frame_caption_index + 1]

current_frame = frame_caption_pairs[frame_caption_index]

# summarize past frames
# removed for now
#past_frames_summary = make_claude_transcript_summary(" ".join([f["words"] for f in surrounding_frames]))
meta_prompt = f'''
# past_frames_summary = make_claude_transcript_summary(" ".join([f["words"] for f in surrounding_frames]))
meta_prompt = f"""
You are watching a video and trying to explain what has happened in the video using a global summary, some recent context, and the transcript of the current frame.
@@ -192,10 +191,7 @@ def create_contextual_frame_description(frame_caption_index, frame_caption_pairs
been talked about. If a question was asked, and answered, include the question and answer in the description as well.
Description:
'''
"""

rich_summary = ask_claude(img=current_frame["frame_path"], text=meta_prompt)
return rich_summary



4 changes: 2 additions & 2 deletions preprocessing/config.py
@@ -1,12 +1,12 @@
from pathlib import Path

package_dir = Path(__file__).parent
project_dir = package_dir.parent
project_dir = package_dir.parent
data_dir = project_dir / "data"

videos_dir = data_dir / "videos"


# Pinecone variables that are helpful

index_name = "test-vqa"
index_name = "test-vqa"