diff --git a/README.md b/README.md index b710c66..131c4ea 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,9 @@ A simplified Contextual Video RAG implementation using Pinecone, AWS, and Claude Ever wanted to ask questions over your video data, such as Youtube, Zoom webinars, recorded meetings, etc? This application aims to create a RAG chatbot over these content using contextual retrieval and Pinecone, AWS, and Claude. +This branch contains the **Streamlit Web App** version of the implementation. This allows you to run a local web app to interact with the RAG chatbot, and uses a makefile to make the data preprocessing smoother. Please read the following section to ensure you have the appropriate prerequisites before proceeding. + +If you'd rather work in Sagemaker Notebook, use the webinar-notebook branch above! ## Before you Begin This repo presents the RAG solution in two ways: one using scripting and makefiles, to create a Streamlit application, and another using a notebook intended for use on Sagemaker. @@ -11,24 +14,6 @@ You'll also need access to AWS Bedrock, Pinecone (via an API Key), and Claude sp Finally, you need to add the videos you'd like to process under a folder called data, with a subfolder called videos. Leave them in .mp4 format. If you have access to your own Youtube channel, downloading videos from the console there will be perfect! - -### Using Sagemaker Notebooks - -First, ensure you have the appropriate permissions to use Sagemaker, Bedrock, and Bedrock inside Sagemaker. - -Then, create a notebook instance with the following configurations: - -- a powerful compute instance, we used ml.p3.2xlarge. -- link to this public repo, so you can import all scripts (you can also fork this repo and link that instead, in that case you will need to auth your access) -- the lifecycle_configuration.sh script, which will install packages on notebooks startup -- 16gb volume size, in case you add a lot of videos - -**It's extremely important to use the lifecycle config script, otherwise you may run into compatibility issues** - -When selecting the kernel, use the conda_python3 environment. - -Next, upload your data as described above (video mp4 files under ./data/videos) - ### Running the Scripts Locally Before beginning, authenthicate your session with AWS using your preferred method. You can @@ -102,10 +87,8 @@ For more information on available commands, you can use: make help ``` -It's easiest to run the whole pipeline (setup) and then run the streamlit app. - -From there, the streamlit app should pop up locally and you can start querying! - +It's easiest to run the whole pipeline (setup) and then run the Streamlit app. +From there, the Streamlit app should pop up locally and you can start querying! diff --git a/data/videos/example.txt b/data/videos/example.txt new file mode 100644 index 0000000..2d04c24 --- /dev/null +++ b/data/videos/example.txt @@ -0,0 +1 @@ +add your videos here! 
.mp4 format \ No newline at end of file diff --git a/lifecycle_configuration.sh b/lifecycle_configuration.sh deleted file mode 100644 index 5e6ac64..0000000 --- a/lifecycle_configuration.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -set -e - -# OVERVIEW -# This script installs all necessary software for running the the notebook -# originally taken from https://github.com/aws-samples/aws-deepcomposer-samples/blob/master/gan/Lifecycle_configurations.sh - -sudo -u ec2-user -i <<'EOF' -ENVIRONMENT=python3 -source /home/ec2-user/anaconda3/bin/activate "$ENVIRONMENT" - -conda update --all --y -pip install anthropic==0.39.0 -pip install boto3==1.35.46 -pip install botocore==1.35.46 -pip install ffmpeg_python==0.2.0 -pip install pandas==2.2.3 -pip install pinecone==5.3.1 -pip install python-dotenv==1.0.1 -pip install streamlit==1.39.0 -pip install transformers==4.45.2 -pip install torch==2.5.0 -pip install tqdm==4.66.2 -source /home/ec2-user/anaconda3/bin/deactivate -EOF \ No newline at end of file diff --git a/preprocessing/app.py b/preprocessing/app.py index d56e871..051220e 100644 --- a/preprocessing/app.py +++ b/preprocessing/app.py @@ -1,12 +1,15 @@ import streamlit as st import os -#from boto_testing import titan_multimodal_embedding + +# from boto_testing import titan_multimodal_embedding from upsert_vectors import titan_text_embedding from claude_utils import ask_claude_vqa_response from config import index_name + # Initialize Pinecone from pinecone import Pinecone from dotenv import load_dotenv + load_dotenv() pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY")) @@ -23,29 +26,27 @@ if st.button("Query"): if query_text: # Embed the query - query_embedding = titan_text_embedding(text=query_text) + query_embedding = titan_text_embedding(text=query_text) - response = index.query(vector=query_embedding["embedding"], top_k=5, include_metadata=True) + response = index.query( + vector=query_embedding["embedding"], top_k=5, include_metadata=True + ) - #st.write(response) + # st.write(response) for r in response["matches"]: with st.expander(f"Match with Score: {r['score']}"): st.markdown(f"**Score:** {r['score']}") st.image(r["metadata"]["filepath"], caption="Matched Image") st.markdown(f"**Transcript:** {r['metadata']['transcript']}") - st.markdown(f"**Contextual Frame Description:** {r['metadata']['contextual_frame_description']}") + st.markdown( + f"**Contextual Frame Description:** {r['metadata']['contextual_frame_description']}" + ) st.markdown(f"**Timestamp Start:** {r['metadata']['timestamp_start']}") st.markdown(f"**Timestamp End:** {r['metadata']['timestamp_end']}") - # ask claude for an explanation of the returned results. claude_explanation = ask_claude_vqa_response(query_text, response["matches"]) st.markdown(f"**Claude Explanation:** {claude_explanation}") else: st.write("Please enter text or image path to query.") - - -# good queries -# The interview spoke about mad libs for robots. What is that about? -# At some point, the interviewer spoke about marrying customs. What was that about? \ No newline at end of file diff --git a/preprocessing/claude_utils.py b/preprocessing/claude_utils.py index 6f1c13a..12d6005 100644 --- a/preprocessing/claude_utils.py +++ b/preprocessing/claude_utils.py @@ -6,7 +6,7 @@ MODEL = "anthropic.claude-3-haiku-20240307-v1:0" MAX_TOKENS = 256 -system_prompt = ''' +system_prompt = """ You are an expert in the field of AI, and are assistant employees of an AI company on searching over their webinars and YouTube videos. 
You are tasked with, given a bunch of video screencaps, context, and transcriptions, to respond to the users query and questions for information. @@ -17,7 +17,7 @@ You should provide a response that is informative and helpful to the user, and you should refer back to the source presentations where that information came from. -''' +""" from anthropic import AnthropicBedrock @@ -26,23 +26,22 @@ logging.basicConfig(level=logging.INFO) - def convert_image_to_base64(image_path): with open(image_path, "rb") as image_file: binary_data = image_file.read() base_64_encoded_data = base64.b64encode(binary_data) - base64_string = base_64_encoded_data.decode('utf-8') + base64_string = base_64_encoded_data.decode("utf-8") return base64_string def format_messages_for_claude(user_query, vdb_response): """ Formats the user's query and the vector database response into a structured message for Claude. - + Args: user_query (str): The user's query. vdb_response (list): The response from the vector database, containing images and text. - + Returns: list: A list of messages formatted for Claude. """ @@ -54,46 +53,50 @@ def format_messages_for_claude(user_query, vdb_response): for item in vdb_response: img_b64 = convert_image_to_base64(item["metadata"]["filepath"]) - new_content.extend([ - { - "type": "text", - "text": "Image: " + item["metadata"]["filepath"], - }, - { - "type": "image", - "source": { - "type": "base64", - "media_type": "image/png", - "data": img_b64 - } - }, - { - "type": "text", - "text": "Contextual description: " + item["metadata"]["contextual_frame_description"] - }, - { - "type": "text", - "text": "Transcript: " + item["metadata"]["transcript"] - } - ]) - #reassign + new_content.extend( + [ + { + "type": "text", + "text": "Image: " + item["metadata"]["filepath"], + }, + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": img_b64, + }, + }, + { + "type": "text", + "text": "Contextual description: " + + item["metadata"]["contextual_frame_description"], + }, + { + "type": "text", + "text": "Transcript: " + item["metadata"]["transcript"], + }, + ] + ) + # reassign messages[0]["content"] = new_content return messages + def ask_claude_vqa_response(user_query, vdb_response): """ Sends the user's query and the vector database response to Claude and gets a response. - + Args: user_query (str): The user's query. vdb_response (list): The response from the vector database, containing images and text. - + Returns: str: The response from Claude. """ client = AnthropicBedrock() messages = format_messages_for_claude(user_query, vdb_response) - system_prompt = ''' + system_prompt = """ You are a friendly assistant helping people interpret their videos at their company. @@ -103,79 +106,75 @@ def ask_claude_vqa_response(user_query, vdb_response): Refer back to the images and text provided to guide the user to the appropriate slide, section, webinar, or talk where the information they are looking for is located. 
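As an aside on the `format_messages_for_claude` helper above: a minimal sketch of the match shape it expects, assuming the frame image referenced by `filepath` exists on disk. The metadata keys mirror those written out in `upsert_vectors.py`; the path, transcript, and description values here are hypothetical examples.

```python
# Illustrative only: a fake Pinecone match shaped like the metadata upserted in upsert_vectors.py
from claude_utils import format_messages_for_claude

example_match = {
    "metadata": {
        # hypothetical frame path following the preprocess_videos.py naming scheme
        "filepath": "./data/frames/mlsearch_webinar/frame_0001.png",
        "transcript": " All right, welcome everybody.",
        "contextual_frame_description": "Opening slide introducing the webinar topic.",
        "timestamp_start": 0.0,
        "timestamp_end": 45.0,
    }
}

messages = format_messages_for_claude(
    user_query="What was covered at the start of the webinar?",
    vdb_response=[example_match],
)

# messages[0]["content"] now interleaves text blocks (filepath, contextual description,
# transcript) with one base64-encoded image block per match, ready for client.messages.create
```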
- ''' + """ response = client.messages.create( - model=MODEL, - max_tokens=MAX_TOKENS * 10, - system=system_prompt, - messages=messages - ) + model=MODEL, max_tokens=MAX_TOKENS * 10, system=system_prompt, messages=messages + ) return response.content[0].text - def ask_claude(img, text): # best for one off queries - client = AnthropicBedrock( - aws_region="us-east-1") + client = AnthropicBedrock(aws_region="us-east-1") if img: img_b64 = convert_image_to_base64(img) message = client.messages.create( model=MODEL, max_tokens=MAX_TOKENS, messages=[ - { - "role": "user", - "content": [ - {"type": "image", "source": + { + "role": "user", + "content": [ { - "type": "base64", - "media_type": "image/png", - "data": img_b64 - } - }, - {"type": "text", "text": text} - ] - } - ] + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": img_b64, + }, + }, + {"type": "text", "text": text}, + ], + } + ], ) else: message = client.messages.create( model=MODEL, max_tokens=MAX_TOKENS, - messages=[{"role": "user", "content": text}] + messages=[{"role": "user", "content": text}], ) return message.content[0].text - def make_claude_transcript_summary(transcript): - client = AnthropicBedrock( - aws_region="us-east-1") + client = AnthropicBedrock(aws_region="us-east-1") prompt = "Summarize the following transcript, being as concise as possible:" message = client.messages.create( model=MODEL, messages=[{"role": "user", "content": prompt + ": " + transcript}], - max_tokens=MAX_TOKENS + max_tokens=MAX_TOKENS, ) return message.content[0].text -def create_contextual_frame_description(frame_caption_index, frame_caption_pairs, transcript_summary, window=60, frame_width=15): + +def create_contextual_frame_description( + frame_caption_index, frame_caption_pairs, transcript_summary +): # frame caption pair will have an image, and a transcript. Window is in seconds - client = AnthropicBedrock( - aws_region="us-east-1") + client = AnthropicBedrock(aws_region="us-east-1") # gather context, look 4 frame widths before and after. Make sure not to go out of bounds if near beginning or end of video. - - surrounding_frames = frame_caption_pairs[max(0, frame_caption_index - 4 * frame_width):frame_caption_index + 1] + + # surrounding_frames = frame_caption_pairs[max(0, frame_caption_index - 4 * frame_width):frame_caption_index + 1] current_frame = frame_caption_pairs[frame_caption_index] # summarize past frames # removed for now - #past_frames_summary = make_claude_transcript_summary(" ".join([f["words"] for f in surrounding_frames])) - meta_prompt = f''' + # past_frames_summary = make_claude_transcript_summary(" ".join([f["words"] for f in surrounding_frames])) + meta_prompt = f""" You are watching a video and trying to explain what has happened in the video using a global summary, some recent context, and the transcript of the current frame. @@ -192,10 +191,7 @@ def create_contextual_frame_description(frame_caption_index, frame_caption_pairs been talked about. If a question was asked, and answered, include the question and answer in the description as well. 
Description: - ''' + """ rich_summary = ask_claude(img=current_frame["frame_path"], text=meta_prompt) return rich_summary - - - diff --git a/preprocessing/config.py b/preprocessing/config.py index e0b6545..2b506fa 100644 --- a/preprocessing/config.py +++ b/preprocessing/config.py @@ -1,7 +1,7 @@ from pathlib import Path package_dir = Path(__file__).parent -project_dir = package_dir.parent +project_dir = package_dir.parent data_dir = project_dir / "data" videos_dir = data_dir / "videos" @@ -9,4 +9,4 @@ # Pinecone variables that are helpful -index_name = "test-vqa" \ No newline at end of file +index_name = "test-vqa" diff --git a/preprocessing/enrich_and_create_vectors.py b/preprocessing/enrich_and_create_vectors.py index 9c8e726..e9adf51 100644 --- a/preprocessing/enrich_and_create_vectors.py +++ b/preprocessing/enrich_and_create_vectors.py @@ -1,10 +1,14 @@ -#this script take sthe output from preprocessing the video (transcript summary, frame caption pairs) and creates the high quality +# this script take sthe output from preprocessing the video (transcript summary, frame caption pairs) and creates the high quality # contextual frame descriptions for our vector earch -from claude_utils import create_contextual_frame_description, make_claude_transcript_summary +from claude_utils import ( + create_contextual_frame_description, + make_claude_transcript_summary, +) from tqdm import tqdm import json + with open("./data/all_videos_data.json", "r") as f: all_videos_data = json.load(f) @@ -23,16 +27,17 @@ for i, pair in tqdm(enumerate(frame_caption_pairs)): contextual_frame_description = create_contextual_frame_description( - frame_caption_index = i, - frame_caption_pairs=frame_caption_pairs, - transcript_summary=transcript_summary) - # write out the updated frame caption pairs + frame_caption_index=i, + frame_caption_pairs=frame_caption_pairs, + transcript_summary=transcript_summary, + ) + # write out the updated frame caption pairs new_pair = { "frame_path": pair["frame_path"], "words": pair["words"], "timestamp": pair["timestamp"], "transcript_summary": transcript_summary, - "contextual_frame_description": contextual_frame_description + "contextual_frame_description": contextual_frame_description, } finalized_data.append(new_pair) diff --git a/preprocessing/preprocess_videos.py b/preprocessing/preprocess_videos.py index 2213440..03c00ef 100644 --- a/preprocessing/preprocess_videos.py +++ b/preprocessing/preprocess_videos.py @@ -2,7 +2,7 @@ from transformers import pipeline import os import torch -import json +import json from config import videos_dir, data_dir # Important Globals @@ -13,10 +13,13 @@ # Step 1: Transcribe Video def transcribe_video(video_path): device = "cuda:0" if torch.cuda.is_available() else "cpu" - transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny",device=device) + transcriber = pipeline( + "automatic-speech-recognition", model="openai/whisper-tiny", device=device + ) transcription = transcriber(video_path, return_timestamps="word") return transcription + # Step 2: Extract Frames and Pair with Dialogue def extract_frames(frames_output_path, video_path, interval): @@ -30,19 +33,24 @@ def extract_frames(frames_output_path, video_path, interval): except FileExistsError: print(f"Folder already exists for videofile {video_path}") print("Please delete it to start fresh or ensure it is empty.") - + # Use ffmpeg to extract frames ( - ffmpeg - .input(video_path) - .filter('fps', fps=1/interval) + ffmpeg.input(video_path) + .filter("fps", fps=1 / 
interval) # this sets the output to be a frame every Interval seconds - .output(f'{frame_output}/frame_%04d.png') + .output(f"{frame_output}/frame_%04d.png") .run() ) - + # Collect the frame file paths - frame_files = sorted([os.path.join(frame_output, f) for f in os.listdir(frame_output) if f.endswith('.png')]) + frame_files = sorted( + [ + os.path.join(frame_output, f) + for f in os.listdir(frame_output) + if f.endswith(".png") + ] + ) # create a dictionary of frame file paths and their corresponding timestamps according to interval and video length frames_and_intervals = [] @@ -50,17 +58,21 @@ def extract_frames(frames_output_path, video_path, interval): for i, frame in enumerate(frame_files): single_frame = { "frame_path": frame, - "timestamp": (interval_start, interval_start + (interval)) + "timestamp": (interval_start, interval_start + (interval)), } frames_and_intervals.append(single_frame) - interval_start+=interval - + interval_start += interval + # get video duration and make the last frame the same as the duration - video_duration = ffmpeg.probe(video_path)['format']['duration'] + video_duration = ffmpeg.probe(video_path)["format"]["duration"] final_frame_start_time = frames_and_intervals[-1]["timestamp"][0] - frames_and_intervals[-1]["timestamp"] = (final_frame_start_time, float(video_duration)) + frames_and_intervals[-1]["timestamp"] = ( + final_frame_start_time, + float(video_duration), + ) return frames_and_intervals + def assign_words_to_frames(transcription, frames, interval=INTERVAL): # given transcription word chunks and frames that occur w.r.t interval, assign words to frames # Transcription will be on word level, so for each frame, we find the words that occur in the time interval of the frame @@ -70,30 +82,37 @@ def assign_words_to_frames(transcription, frames, interval=INTERVAL): for f in frames: frame_start, frame_end = f["timestamp"][0], f["timestamp"][1] - + # filter for words that fall in frame - words_in_frame = list(filter(lambda w: w['timestamp'][0] > frame_start and w['timestamp'][1] < frame_end, transcription["chunks"])) - words = [w['text'] for w in words_in_frame] - words= "".join(words) + words_in_frame = list( + filter( + lambda w: w["timestamp"][0] > frame_start + and w["timestamp"][1] < frame_end, + transcription["chunks"], + ) + ) + words = [w["text"] for w in words_in_frame] + words = "".join(words) single_frame = { "frame_path": f["frame_path"], "words": words, - "timestamp": f["timestamp"] + "timestamp": f["timestamp"], } frames_and_words.append(single_frame) return frames_and_words + def align_frames_with_dialogue(frames, transcription): - #Pinecone for t + # Pinecone for t dialogue_frames = [] for frame, chunk in zip(frames, transcription["chunks"]): - dialogue_frames.append({ - "frame": frame, - "dialogue": chunk["text"], - "metadata": { - "timestamp": chunk["timestamp"] + dialogue_frames.append( + { + "frame": frame, + "dialogue": chunk["text"], + "metadata": {"timestamp": chunk["timestamp"]}, } - }) + ) return dialogue_frames @@ -114,34 +133,37 @@ def align_frames_with_dialogue(frames, transcription): os.mkdir(frames_and_words_dir) os.mkdir(frames_dir) except FileExistsError: - print("Folders already exist. Please delete them to start fresh or ensure they are empty.") + print( + "Folders already exist. Please delete them to start fresh or ensure they are empty." 
+ ) for video_path in video_files: transcription = transcribe_video(video_path) video_filename = os.path.splitext(os.path.basename(video_path))[0] - + # Write transcription out as json - transcription_filename = os.path.join(transcriptions_dir, video_filename + "_transcription.json") + transcription_filename = os.path.join( + transcriptions_dir, video_filename + "_transcription.json" + ) with open(transcription_filename, "w") as f: json.dump(transcription["text"], f) - + frames = extract_frames(frames_dir, video_path, INTERVAL) frames_and_words = assign_words_to_frames(transcription, frames) - - frames_and_words_filename = os.path.join(frames_and_words_dir, video_filename + "_frames_and_words.json") + + frames_and_words_filename = os.path.join( + frames_and_words_dir, video_filename + "_frames_and_words.json" + ) with open(frames_and_words_filename, "w") as f: json.dump(frames_and_words, f) - + all_videos_data[video_filename] = { "transcription": transcription_filename, - "frames_and_words": frames_and_words_filename + "frames_and_words": frames_and_words_filename, } # Optionally, write all_videos_data to a summary file all_videos_data_path = data_dir / "all_videos_data.json" with open(all_videos_data_path, "w") as f: json.dump(all_videos_data, f) - - - \ No newline at end of file diff --git a/preprocessing/upsert_vectors.py b/preprocessing/upsert_vectors.py index f16ce94..0d1d7b9 100644 --- a/preprocessing/upsert_vectors.py +++ b/preprocessing/upsert_vectors.py @@ -5,15 +5,13 @@ import json import boto3 import base64 +import pandas as pd load_dotenv() -# turn data into dataframe -import pandas as pd - # iterate over dataframe, and embed vectors using titan multimodal - + boto3_session = boto3.session.Session() region_name = boto3_session.region_name bedrock_client = boto3.client( @@ -24,25 +22,21 @@ # Embedding code + def titan_multimodal_embedding( - image_path:str=None, # maximum 2048 x 2048 pixels - description:str=None, # English only and max input tokens 128 - dimension:int=1024, # 1,024 (default), 384, 256 - model_id:str="amazon.titan-embed-image-v1" + image_path: str = None, # maximum 2048 x 2048 pixels + description: str = None, # English only and max input tokens 128 + dimension: int = 1024, # 1,024 (default), 384, 256 + model_id: str = "amazon.titan-embed-image-v1", ): - payload_body = {} - embedding_config = { - "embeddingConfig": { - "outputEmbeddingLength": dimension - } - } + embedding_config = {"embeddingConfig": {"outputEmbeddingLength": dimension}} # You can specify either text or image or both if image_path: with open(image_path, "rb") as image_file: - input_image = base64.b64encode(image_file.read()).decode('utf8') + input_image = base64.b64encode(image_file.read()).decode("utf8") payload_body["inputImage"] = input_image if description: payload_body["inputText"] = description @@ -51,21 +45,19 @@ def titan_multimodal_embedding( print("\n".join(payload_body.keys())) response = bedrock_client.invoke_model( - body=json.dumps({**payload_body, **embedding_config}), + body=json.dumps({**payload_body, **embedding_config}), modelId=model_id, - accept="application/json", - contentType="application/json" + accept="application/json", + contentType="application/json", ) return json.loads(response.get("body").read()) - - def titan_text_embedding( text: str, # English only and max input tokens 128 dimension: int = 1024, # 1,024 (default), 384, 256 - model_id: str = "amazon.titan-embed-text-v2:0" + model_id: str = "amazon.titan-embed-text-v2:0", ): payload_body = { "inputText": text, 
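A brief aside on the two Bedrock embedding helpers being reformatted here — a hypothetical usage sketch, assuming AWS credentials with Bedrock access (and a default region) are configured and that it is run from the `preprocessing/` directory:

```python
# Sketch only: exercise the Titan embedding helpers defined in upsert_vectors.py
from upsert_vectors import titan_multimodal_embedding, titan_text_embedding

# Text-only embedding with Titan Text v2, the model used for the contextual frame descriptions
text_vec = titan_text_embedding(text="A slide explaining contextual retrieval for video RAG")
print(len(text_vec["embedding"]))  # 1024 dimensions by default

# The multimodal helper accepts an image path, a short English description, or both;
# passing only a description keeps this sketch runnable without any extracted frames
mm_vec = titan_multimodal_embedding(description="Title slide of a webinar", dimension=1024)
print(len(mm_vec["embedding"]))
```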
@@ -75,7 +67,7 @@ def titan_text_embedding( body=json.dumps(payload_body), modelId=model_id, accept="application/json", - contentType="application/json" + contentType="application/json", ) response_body = json.loads(response.get("body").read()) @@ -88,7 +80,6 @@ def titan_text_embedding( return response_body - # transform dataframe into metadata, ids, and values (where values are the vectors) @@ -97,11 +88,10 @@ def titan_text_embedding( if __name__ == "__main__": # read in as json - file_path = './data/finalized_data.json' + file_path = "./data/finalized_data.json" with open(file_path, "r") as f: data = json.load(f) - values_to_embed = [item["contextual_frame_description"] for item in data] ids = [item["frame_path"] for item in data] @@ -111,43 +101,37 @@ def titan_text_embedding( embedding = titan_text_embedding(text=v) embeddings.append(embedding["embedding"]) - final_vectors = [] ids = [x for x in range(0, len(values_to_embed))] for v, e, id in tqdm(zip(data, embeddings, ids)): - final_vectors.append({ - "id": str(id), - "values": e, - "metadata": { - "transcript": v["words"], - "filepath": v["frame_path"], - "timestamp_start": v["timestamp"][0], - "timestamp_end": v["timestamp"][1], - "contextual_frame_description": v["contextual_frame_description"] + final_vectors.append( + { + "id": str(id), + "values": e, + "metadata": { + "transcript": v["words"], + "filepath": v["frame_path"], + "timestamp_start": v["timestamp"][0], + "timestamp_end": v["timestamp"][1], + "contextual_frame_description": v["contextual_frame_description"], + }, } - }) - + ) + # turn the list of dictionaries into a dataframe df = pd.DataFrame(final_vectors) print(df.head()) pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY")) - # create index if not pc.has_index(index_name): pc.create_index( name=index_name, dimension=1024, metric="cosine", - spec=ServerlessSpec( - cloud='aws', - region='us-east-1' - ) - ) + spec=ServerlessSpec(cloud="aws", region="us-east-1"), + ) index = pc.Index(index_name) index.upsert_from_dataframe(df) - - - diff --git a/vqa_workflow.ipynb b/vqa_workflow.ipynb deleted file mode 100644 index 9ac7a78..0000000 --- a/vqa_workflow.ipynb +++ /dev/null @@ -1,1098 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Contextual Video RAG over Webinars with Pinecone, Anthropic and AWS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In doing so, we'll convert a multimodal problem into a purely text one on search, and leave the complex multimodal ingestion to the Claude Bedrock API. This saves us time and a bit of complexity on the multimodal embedding front!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Welcome to the workshop! In this notebook, we'll setup a simple video RAG workflow using Pinecone, Claude and AWS. We'll take an input set of videos and ingest them (using Claude in pre and post processing) in order to allow for an contextual RAG experience over a traditionally vexing dataset. 
\n", - "\n", - "\n", - "Before running this notebook in Sagemaker, you'll need the following:\n", - "\n", - "\n", - "- A Sagemaker Instance with this Repo open\n", - "- Access to Claude Haiku and Sonnet via Bedrock\n", - "- A folder called \"data\" with a subfolder called \"videos\", with at least 1 video in .mp4 format there\n", - "- A Pinecone API Key, so we can create our index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## First, some dependency cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# install torch down, install ffmpeg-python" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: https://download.pytorch.org/whl/cu118\n", - "Requirement already satisfied: torch in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (2.5.0)\n", - "Collecting torchvision\n", - " Downloading https://download.pytorch.org/whl/cu118/torchvision-0.20.1%2Bcu118-cp310-cp310-linux_x86_64.whl (6.5 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.5/6.5 MB\u001b[0m \u001b[31m123.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting torchaudio\n", - " Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.5.1%2Bcu118-cp310-cp310-linux_x86_64.whl (3.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.3/3.3 MB\u001b[0m \u001b[31m144.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: filelock in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (3.16.1)\n", - "Requirement already satisfied: typing-extensions>=4.8.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (4.12.2)\n", - "Requirement already satisfied: networkx in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (3.4.2)\n", - "Requirement already satisfied: jinja2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (3.1.4)\n", - "Requirement already satisfied: fsspec in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (2024.10.0)\n", - "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (12.4.127)\n", - "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (12.4.127)\n", - "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (12.4.127)\n", - "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (9.1.0.70)\n", - "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (12.4.5.8)\n", - "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (11.2.1.3)\n", - "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (10.3.5.147)\n", - "Requirement already satisfied: 
nvidia-cusolver-cu12==11.6.1.9 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (11.6.1.9)\n", - "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (12.3.1.170)\n", - "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (2.21.5)\n", - "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (12.4.127)\n", - "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (12.4.127)\n", - "Requirement already satisfied: triton==3.1.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (3.1.0)\n", - "Requirement already satisfied: sympy==1.13.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torch) (1.13.1)\n", - "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from sympy==1.13.1->torch) (1.3.0)\n", - "Requirement already satisfied: numpy in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torchvision) (1.26.4)\n", - "Collecting torch\n", - " Downloading https://download.pytorch.org/whl/cu118/torch-2.5.1%2Bcu118-cp310-cp310-linux_x86_64.whl (838.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m838.3/838.3 MB\u001b[0m \u001b[31m35.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: pillow!=8.3.*,>=5.3.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from torchvision) (10.4.0)\n", - "Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (23.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.2/23.2 MB\u001b[0m \u001b[31m99.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cuda-runtime-cu11==11.8.89 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m875.6/875.6 kB\u001b[0m \u001b[31m78.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cuda-cupti-cu11==11.8.87 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux1_x86_64.whl (13.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m125.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cudnn-cu11==9.1.0.70 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_cudnn_cu11-9.1.0.70-py3-none-manylinux2014_x86_64.whl (663.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m663.9/663.9 MB\u001b[0m \u001b[31m51.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cublas-cu11==11.11.3.6 (from torch)\n", - " Downloading 
https://download.pytorch.org/whl/cu118/nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux1_x86_64.whl (417.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m417.9/417.9 MB\u001b[0m \u001b[31m86.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cufft-cu11==10.9.0.58 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m168.4/168.4 MB\u001b[0m \u001b[31m144.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-curand-cu11==10.3.0.86 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_curand_cu11-10.3.0.86-py3-none-manylinux1_x86_64.whl (58.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.1/58.1 MB\u001b[0m \u001b[31m120.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cusolver-cu11==11.4.1.48 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux1_x86_64.whl (128.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m128.2/128.2 MB\u001b[0m \u001b[31m98.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cusparse-cu11==11.7.5.86 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux1_x86_64.whl (204.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m204.1/204.1 MB\u001b[0m \u001b[31m92.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-nccl-cu11==2.21.5 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_nccl_cu11-2.21.5-py3-none-manylinux2014_x86_64.whl (147.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.8/147.8 MB\u001b[0m \u001b[31m106.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-nvtx-cu11==11.8.86 (from torch)\n", - " Downloading https://download.pytorch.org/whl/cu118/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux1_x86_64.whl (99 kB)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from jinja2->torch) (3.0.2)\n", - "Installing collected packages: nvidia-nvtx-cu11, nvidia-nccl-cu11, nvidia-cusparse-cu11, nvidia-curand-cu11, nvidia-cufft-cu11, nvidia-cuda-runtime-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-cupti-cu11, nvidia-cublas-cu11, nvidia-cusolver-cu11, nvidia-cudnn-cu11, torch, torchvision, torchaudio\n", - " Attempting uninstall: torch\n", - " Found existing installation: torch 2.5.0\n", - " Uninstalling torch-2.5.0:\n", - " Successfully uninstalled torch-2.5.0\n", - "Successfully installed nvidia-cublas-cu11-11.11.3.6 nvidia-cuda-cupti-cu11-11.8.87 nvidia-cuda-nvrtc-cu11-11.8.89 nvidia-cuda-runtime-cu11-11.8.89 nvidia-cudnn-cu11-9.1.0.70 nvidia-cufft-cu11-10.9.0.58 nvidia-curand-cu11-10.3.0.86 nvidia-cusolver-cu11-11.4.1.48 nvidia-cusparse-cu11-11.7.5.86 nvidia-nccl-cu11-2.21.5 nvidia-nvtx-cu11-11.8.86 torch-2.5.1+cu118 torchaudio-2.5.1+cu118 torchvision-0.20.1+cu118\n", - "Note: you may need to restart the 
kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# important Environmental Variables\n", - "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%conda install ffmpeg-python" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Video Data as Input" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The trickiest part about working with Video data is the multimodal nature of the content presented on screen.\n", - " \n", - "For content such as webinars (like this one!), you may have multiple speakers, diagrams on screen, mis-matched transcripts and audio, etc.\n", - "\n", - "Without some sort of end-to-end encoder, it can be quite difficult to encompass all of these attributes. \n", - "\n", - "We'll take a simplified approach where we process our video set into frame-transcript pairs, which will allow us to reduce the dimensionality of the data to images and pairs.\n", - "\n", - "Lets begin by transcribing and processing our video data.\n", - "\n", - "**Don't forget to upload your videos manually into the data folder!**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Test Title](./diagrams/Video_Preprocessing.png)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Video preprocessing: Transcription and Frames\n", - "\n", - "\n", - "\n", - "First, we'll do some housework to grab our video and setup some helper functions. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "from preprocessing.config import data_dir, videos_dir\n", - "from preprocessing.preprocess_videos import *\n", - "\n", - "\n", - "video_files = os.listdir(videos_dir)\n", - "print(video_files)\n", - "# add root dir to video files\n", - "video_files = [os.path.join(videos_dir, f) for f in video_files]\n", - "\n", - "all_videos_data = {}\n", - "transcriptions_dir = os.path.join(data_dir, \"transcriptions\")\n", - "frames_and_words_dir = os.path.join(data_dir, \"frames_and_words\")\n", - "frames_dir = os.path.join(data_dir, \"frames\")\n", - "\n", - "# folder setup\n", - "try:\n", - " os.mkdir(transcriptions_dir)\n", - " os.mkdir(frames_and_words_dir)\n", - " os.mkdir(frames_dir)\n", - "except FileExistsError:\n", - " print(\"Folders already exist. Please delete them to start fresh or ensure they are empty.\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we can iterate over the video files, for the following workflow:\n", - "\n", - "1. Transcribe the video and obtain the word-level timestamps (we do this in 45s intervals)\n", - "2. Walk over the video in INTERVAL length windows, and take the current frame on screen\n", - "3. 
Grab all words covering that frame, and save out along with the transcript and frames themselves\n", - "\n", - "**If you'd like to modify any of this code, take a look at the preprocess_videos.py script under preprocessing!**\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pandas/core/computation/expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.7.3' currently installed).\n", - " from pandas.core.computation.check import NUMEXPR_INSTALLED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['mlsearch_webinar.mp4']\n", - "Folders already exist. Please delete them to start fresh or ensure they are empty.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py:496: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead.\n", - " warnings.warn(\n", - "Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.\n", - "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.\n", - "WhisperModel is using WhisperSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation=\"eager\"` when loading the model.\n", - "From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). 
If you want to keep returning the legacy format, please set `return_legacy_cache=True`.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Folder already exists for videofile /home/ec2-user/SageMaker/pc-yt-rag/data/videos/mlsearch_webinar.mp4\n", - "Please delete it to start fresh or ensure it is empty.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers\n", - " built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)\n", - " configuration: --prefix=/home/ec2-user/anaconda3/envs/python3 --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1730671409690/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1730671409690/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1730671409690/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1730671409690/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-pic --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libopus --enable-librsvg --pkg-config=/home/conda/feedstock_root/build_artifacts/ffmpeg_1730671409690/_build_env/bin/pkg-config\n", - " libavutil 59. 39.100 / 59. 39.100\n", - " libavcodec 61. 19.100 / 61. 19.100\n", - " libavformat 61. 7.100 / 61. 7.100\n", - " libavdevice 61. 3.100 / 61. 3.100\n", - " libavfilter 10. 4.100 / 10. 4.100\n", - " libswscale 8. 3.100 / 8. 3.100\n", - " libswresample 5. 3.100 / 5. 3.100\n", - " libpostproc 58. 3.100 / 58. 3.100\n", - "Input #0, mov,mp4,m4a,3gp,3g2,mj2, from '/home/ec2-user/SageMaker/pc-yt-rag/data/videos/mlsearch_webinar.mp4':\n", - " Metadata:\n", - " major_brand : mp42\n", - " minor_version : 0\n", - " compatible_brands: isommp42\n", - " creation_time : 2024-09-07T05:52:15.000000Z\n", - " encoder : Google\n", - " Duration: 00:53:43.77, start: 0.000000, bitrate: 302 kb/s\n", - " Stream #0:0[0x1](und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, smpte170m/bt470bg/bt709, progressive), 1280x720 [SAR 1:1 DAR 16:9], 171 kb/s, 25 fps, 25 tbr, 12800 tbn (default)\n", - " Metadata:\n", - " creation_time : 2024-09-07T05:52:15.000000Z\n", - " handler_name : ISO Media file produced by Google Inc. Created on: 09/06/2024.\n", - " vendor_id : [0][0][0][0]\n", - " Stream #0:1[0x2](eng): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 127 kb/s (default)\n", - " Metadata:\n", - " creation_time : 2024-09-07T05:52:15.000000Z\n", - " handler_name : ISO Media file produced by Google Inc. Created on: 09/06/2024.\n", - " vendor_id : [0][0][0][0]\n", - "Stream mapping:\n", - " Stream #0:0 (h264) -> fps:default\n", - " fps:default -> Stream #0:0 (png)\n", - "Press [q] to stop, [?] 
for help\n", - "Output #0, image2, to '/home/ec2-user/SageMaker/pc-yt-rag/data/frames/mlsearch_webinar/frame_%04d.png':\n", - " Metadata:\n", - " major_brand : mp42\n", - " minor_version : 0\n", - " compatible_brands: isommp42\n", - " encoder : Lavf61.7.100\n", - " Stream #0:0: Video: png, rgb24(pc, gbr/bt470bg/bt709, progressive), 1280x720 [SAR 1:1 DAR 16:9], q=2-31, 200 kb/s, 0.02 fps, 0.02 tbn\n", - " Metadata:\n", - " encoder : Lavc61.19.100 png\n", - "[out#0/image2 @ 0x5592e25f5680] video:19798KiB audio:0KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: unknown\n", - "frame= 72 fps=1.7 q=-0.0 Lsize=N/A time=00:54:00.00 bitrate=N/A speed=77.7x \n" - ] - } - ], - "source": [ - "INTERVAL=45\n", - "\n", - "for video_path in video_files:\n", - " transcription = transcribe_video(video_path)\n", - "\n", - " video_filename = os.path.splitext(os.path.basename(video_path))[0]\n", - " \n", - " # Write transcription out as json, for use later in the pipeline\n", - " transcription_filename = os.path.join(transcriptions_dir, video_filename + \"_transcription.json\")\n", - " with open(transcription_filename, \"w\") as f:\n", - " json.dump(transcription[\"text\"], f)\n", - " \n", - " frames = extract_frames(frames_dir, video_path, INTERVAL)\n", - "\n", - " # We group the words into the frames they belong to, here\n", - " frames_and_words = assign_words_to_frames(transcription, frames)\n", - " \n", - " frames_and_words_filename = os.path.join(frames_and_words_dir, video_filename + \"_frames_and_words.json\")\n", - " with open(frames_and_words_filename, \"w\") as f:\n", - " json.dump(frames_and_words, f)\n", - " \n", - " all_videos_data[video_filename] = {\n", - " \"transcription\": transcription_filename,\n", - " \"frames_and_words\": frames_and_words_filename\n", - "}\n", - "\n", - "# This file helps manage all of the videos we make, if there is more than one\n", - "all_videos_data_path = data_dir / \"all_videos_data.json\"\n", - "with open(all_videos_data_path, \"w\") as f:\n", - " json.dump(all_videos_data, f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using Claude on Ingest\n", - "\n", - "### Contextual Retrieval\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As discussed earlier, the videos we desire to do RAG over have some properties that differentiate them from normal documents.\n", - "\n", - "Notably, these videos can be really long! So, how can we get high quality representations of each frame, if we just have the context of the transcripts?\n", - "\n", - "Lucky for us, we can use Claude's visual understanding capabilities to annotate each frame, conditioned on the **transcript, frame image, and overall transcript summary**. \n", - "\n", - "This is a basic form of **contextual retrieval**, where we enrich the initial text data with the context surrounding it. Anthropic [announced this technique](https://www.anthropic.com/news/contextual-retrieval) as a way to improve retrieval on texts where the context for chunked data is particularly important, and we extend it here to apply to video data!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we setup some helper functions to ingest the data we need to pass to Claude Haiku." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import base64\n", - "\n", - "def convert_image_to_base64(image_path):\n", - " with open(image_path, \"rb\") as image_file:\n", - " binary_data = image_file.read()\n", - " base_64_encoded_data = base64.b64encode(binary_data)\n", - " base64_string = base_64_encoded_data.decode('utf-8')\n", - " return base64_string\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we create some functions to obtain Claude's response given images and tex response from our vector database, but also in cases where we just want a response on a single image." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL = \"anthropic.claude-3-haiku-20240307-v1:0\"\n", - "MAX_TOKENS = 256\n", - "\n", - "from anthropic import AnthropicBedrock\n", - "\n", - "\n", - "\n", - "def ask_claude(img, text):\n", - " # best for one off queries\n", - " client = AnthropicBedrock(\n", - " aws_region=\"us-east-1\")\n", - " if img:\n", - " img_b64 = convert_image_to_base64(img)\n", - " message = client.messages.create(\n", - " model=MODEL,\n", - " max_tokens=MAX_TOKENS,\n", - " messages=[\n", - " {\n", - " \"role\": \"user\", \n", - " \"content\": [\n", - " {\"type\": \"image\", \"source\": \n", - " {\n", - " \"type\": \"base64\",\n", - " \"media_type\": \"image/png\",\n", - " \"data\": img_b64\n", - " }\n", - " },\n", - " {\"type\": \"text\", \"text\": text}\n", - " ]\n", - " }\n", - " ]\n", - " )\n", - " else:\n", - " message = client.messages.create(\n", - " model=MODEL,\n", - " max_tokens=MAX_TOKENS,\n", - " messages=[{\"role\": \"user\", \"content\": text}]\n", - " )\n", - " return message.content[0].text\n", - "\n", - "\n", - " \n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And, we make a helper function to make the transcript summaries specifically." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "def make_claude_transcript_summary(transcript):\n", - " client = AnthropicBedrock(\n", - " aws_region=\"us-east-1\")\n", - "\n", - " prompt = \"Summarize the following transcript, being as concise as possible:\"\n", - " message = client.messages.create(\n", - " model=MODEL,\n", - " messages=[{\"role\": \"user\", \"content\": prompt + \": \" + transcript}],\n", - " max_tokens=MAX_TOKENS\n", - " )\n", - " return message.content[0].text" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Creating the Contextual Descriptions using VQA, Transcripts, and Claude\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, the contextual rag step! We use the transcript summary, the current transcript in frame, in addition to the frame itself, in order for Claude to create a nice contextual description. 
This is what will be embedded, for search in Pinecone.\n", - "\n", - "To better understand how Claude deals with visual data, especially slides, take a look at Anthropic's cookbook [here](https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/reading_charts_graphs_powerpoints.ipynb):" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Our Contextual Embedding Workflow](./diagrams/Contextual_Retrieval_With_Video_RAG.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def create_contextual_frame_description(frame_caption_index, frame_caption_pairs, transcript_summary, window=60, frame_width=15):\n", - " # frame caption pair will have an image, and a transcript. Window is in seconds\n", - " client = AnthropicBedrock(\n", - " aws_region=\"us-east-1\")\n", - " \n", - " current_frame = frame_caption_pairs[frame_caption_index]\n", - "\n", - " meta_prompt = f'''\n", - "\n", - " You are watching a video and trying to explain what has\n", - " happened in the video using a global summary, some recent context, \n", - " and the transcript of the current frame.\n", - "\n", - " The video has been summarized as follows:\n", - " {transcript_summary}\n", - "\n", - " The current frame's transcript is as follows:\n", - " {current_frame[\"words\"]}\n", - "\n", - " You also want to provide a description of the current frame based on the context provided.\n", - "\n", - " Please describe this video snippet using the information above in addition to the frame visual. Explain any diagrams or code or important text that appears on screen,\n", - " especially if the snippet is of a slide or a code snippet. \n", - " If there are only people in the frame, focus on the transcript and the context provided to describe what has\n", - " been talked about. \n", - " If a question was asked, and answered, \n", - " include the question and answer in the description as well.\n", - "\n", - " Description:\n", - " '''\n", - "\n", - " rich_summary = ask_claude(img=current_frame[\"frame_path\"], text=meta_prompt)\n", - " return rich_summary" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Step 3: Putting it all Together" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Here is a concise summary of the key points from the transcript:\n", - "\n", - "- The webinar covers the magic of multilingual search, specifically multilingual semantic search. 
\n", - "\n", - "- It provides a crash course on vectors, vector embeddings, and how large language models can represent concepts across languages.\n", - "\n", - "- The focus is on using the multilingual E5 large model and Pinecone's vector database to enable efficient multilingual semantic search.\n", - "\n", - "- A demo is shown applying this approach to a language learning problem, allowing cross-lingual and model-lingual search over a dataset of English and Spanish sentence translations.\n", - "\n", - "- Key takeaways include embedding queries and passages differently, handling chunking and rate limiting, and evaluating performance with a domain-specific gold standard dataset.\n", - "\n", - "- The session covers theoretical aspects of multilingual embeddings as well as practical steps for implementing a multilingual semantic search application using Pinecone.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "72it [07:36, 6.34s/it]\n" - ] - } - ], - "source": [ - "from tqdm import tqdm\n", - "import json\n", - "with open(\"./data/all_videos_data.json\", \"r\") as f:\n", - " all_videos_data = json.load(f)\n", - "\n", - "\n", - "finalized_data = []\n", - "\n", - "for video, data in all_videos_data.items():\n", - " with open(data[\"transcription\"], \"r\") as f:\n", - " transcript = json.load(f)\n", - " with open(data[\"frames_and_words\"], \"r\") as f:\n", - " frame_caption_pairs = json.load(f)\n", - "\n", - " transcript_summary = make_claude_transcript_summary(transcript=transcript)\n", - "\n", - " print(transcript_summary)\n", - "\n", - " for i, pair in tqdm(enumerate(frame_caption_pairs)):\n", - " contextual_frame_description = create_contextual_frame_description(\n", - " frame_caption_index = i, \n", - " frame_caption_pairs=frame_caption_pairs, \n", - " transcript_summary=transcript_summary)\n", - " # write out the updated frame caption pairs\n", - " # this data will compose the metadata for the vector database\n", - " # Note that only the contextual frame description will be searched over\n", - " new_pair = {\n", - " \"frame_path\": pair[\"frame_path\"],\n", - " \"words\": pair[\"words\"],\n", - " \"timestamp\": pair[\"timestamp\"],\n", - " \"transcript_summary\": transcript_summary,\n", - " \"contextual_frame_description\": contextual_frame_description\n", - " }\n", - " finalized_data.append(new_pair)\n", - "\n", - "# write out the finalized data\n", - "with open(\"./data/finalized_data.json\", \"w\") as f:\n", - " json.dump(finalized_data, f)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using Pinecone\n", - "\n", - "Now that we've uploaded the \n", - "\n", - "\n", - "### What is Pinecone?\n", - "\n", - "### Using AWS Bedrock: Titan Text Embedding Models\n", - "\n", - "### Creating Index\n", - "\n", - "### A Note about Metadata\n", - "\n", - "\n", - "### And we're done!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Embedding the Data with Titan\n", - "\n", - "We'll be doing roughly the following:\n", - "\n", - "![Pinecone Embedding and Upsertion](./diagrams/Pinecone_Upsertion.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 72/72 [00:06<00:00, 10.84it/s]\n", - "72it [00:00, 77612.41it/s]\n" - ] - } - ], - "source": [ - "from pinecone import Pinecone, ServerlessSpec\n", - "import boto3\n", - "\n", - "boto3_session = boto3.session.Session()\n", - "region_name = boto3_session.region_name\n", - "bedrock_client = boto3.client(\n", - " \"bedrock-runtime\",\n", - " region_name,\n", - ")\n", - "\n", - "\n", - "# Embedding code\n", - "def titan_text_embedding(\n", - " text: str, # English only and max input tokens 128\n", - " dimension: int = 1024, # 1,024 (default), 384, 256\n", - " model_id: str = \"amazon.titan-embed-text-v2:0\"\n", - "):\n", - " payload_body = {\n", - " \"inputText\": text,\n", - " }\n", - "\n", - " response = bedrock_client.invoke_model(\n", - " body=json.dumps(payload_body),\n", - " modelId=model_id,\n", - " accept=\"application/json\",\n", - " contentType=\"application/json\"\n", - " )\n", - "\n", - " response_body = json.loads(response.get(\"body\").read())\n", - "\n", - " finish_reason = response_body.get(\"message\")\n", - "\n", - " if finish_reason is not None:\n", - " raise Exception(f\"Embeddings generation error: {finish_reason}\")\n", - "\n", - " return response_body\n", - "\n", - "\n", - "# read in as json\n", - "file_path = './data/finalized_data.json'\n", - "with open(file_path, \"r\") as f:\n", - " data = json.load(f)\n", - "\n", - "\n", - "values_to_embed = [item[\"contextual_frame_description\"] for item in data]\n", - "ids = [item[\"frame_path\"] for item in data]\n", - "\n", - "embeddings = []\n", - "\n", - "# For large number of embeddings, take care to respect rate limits!\n", - "for v in tqdm(values_to_embed):\n", - " embedding = titan_text_embedding(text=v)\n", - " embeddings.append(embedding[\"embedding\"])\n", - "\n", - "\n", - "final_vectors = []\n", - "# Easy way to assign ids. Be careful of overwriting these \n", - "ids = [x for x in range(0, len(values_to_embed))]\n", - "\n", - "for v, e, id in tqdm(zip(data, embeddings, ids)):\n", - " final_vectors.append({\n", - " \"id\": str(id),\n", - " \"values\": e,\n", - " \"metadata\": {\n", - " \"transcript\": v[\"words\"],\n", - " \"filepath\": v[\"frame_path\"],\n", - " \"timestamp_start\": v[\"timestamp\"][0],\n", - " \"timestamp_end\": v[\"timestamp\"][1],\n", - " \"contextual_frame_description\": v[\"contextual_frame_description\"]\n", - " }\n", - "})\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Creating an Index with Pinecone and upserting!\n", - "\n", - "\n", - "**Be sure to enter your Pinecone API Key here!**\n", - "\n", - "Don't have one? No problem, sign up [here](https://docs.pinecone.io/guides/get-started/quickstart):" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " id values \\\n", - "0 0 [-0.01581481657922268, 0.05500806123018265, -0... \n", - "1 1 [-0.007456343621015549, 0.04353542625904083, -... 
\n", - "2 2 [-0.030072269961237907, 0.045493949204683304, ... \n", - "3 3 [-0.030104508623480797, 0.06101180613040924, 0... \n", - "4 4 [-0.001742413965985179, 0.004221058916300535, ... \n", - "\n", - " metadata \n", - "0 {'transcript': ' All right, welcome everybody.... \n", - "1 {'transcript': ' take universal translation as... \n", - "2 {'transcript': ' a bit about vector embeddings... \n", - "3 {'transcript': ' about your weekend trip that ... \n", - "4 {'transcript': ' parking, which is completely ... \n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "328e6967ba4441438d8ec8c73234fea9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "sending upsert requests: 0%| | 0/72 [00:00