Commit 3c349bc: added youtube transcript and code base analyzer
manufy committed Jun 6, 2024 (1 parent: b57cc5e)
Showing 16 changed files with 744 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .gitignore
@@ -17,6 +17,6 @@ old
*photo*
*.png
*.pdf

*.mp4


@@ -0,0 +1,68 @@
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import OpenAI


prompt_template = "What is a word to replace the following: {word}?"

# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)

llm_chain = LLMChain(
    llm=llm,
    prompt=PromptTemplate.from_template(prompt_template)
)

result = llm_chain("artificial")
print(result)

# It is also possible to use the .apply() method to pass multiple inputs
# at once and receive a result for each input.
# The only difference is that the inputs themselves are not included
# in the returned list. Nonetheless, the returned list preserves
# the order of the inputs.

input_list = [
    {"word": "artificial"},
    {"word": "intelligence"},
    {"word": "robot"}
]

result = llm_chain.apply(input_list)
print(result)

# The .generate() method returns an instance of LLMResult,
# which provides more information.
# For example, the finish_reason key indicates why
# the generation process stopped.
# It could be "stop", meaning the model decided
# to finish, or "length", meaning it reached the length limit.
# There is other self-explanatory information,
# such as the total number of tokens used and the model name.

result = llm_chain.generate(input_list)
print(result)
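
# A sketch of inspecting that extra information (field names per LangChain's
# LLMResult as populated by OpenAI completion models):
for generations in result.generations:
    for generation in generations:
        print(generation.text, generation.generation_info.get("finish_reason"))
print(result.llm_output)  # e.g. total token usage and the model name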

# The next method we will discuss is .predict() (which can be used
# interchangeably with .run()).
# Its best use case is passing multiple input variables to a single prompt.
# However, it is possible to use it with one input variable as well.
# The following prompt passes both the word we want a substitute
# for and the context the model must consider.

prompt_template = "Looking at the context of '{context}'. What is an appropriate word to replace the following: {word}?"

llm_chain = LLMChain(
    llm=llm,
    prompt=PromptTemplate(template=prompt_template, input_variables=["word", "context"]))

result = llm_chain.predict(word="fan", context="object")
# or llm_chain.run(word="fan", context="object")
print(result)

# The model correctly suggested that "ventilator" would be a suitable replacement
# for the word "fan" in the context of objects. Furthermore,
# when we repeat the experiment with a different context, humans,
# the output changes to "admirer".

result = llm_chain.predict(word="fan", context="humans")
# or llm_chain.run(word="fan", context="humans")
print(result)
@@ -0,0 +1,20 @@

from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain_openai import OpenAI


# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)

output_parser = CommaSeparatedListOutputParser()
template = """List all possible words as substitutes for 'artificial', comma separated."""

llm_chain = LLMChain(
    llm=llm,
    prompt=PromptTemplate(template=template, output_parser=output_parser, input_variables=[]),
    output_parser=output_parser)

result = llm_chain.predict()
print(result)
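
# Note (an assumption based on classic LangChain behavior): because
# output_parser was passed to the LLMChain itself, predict() returns the
# parsed Python list rather than the raw comma-separated string.
print(type(result))  # expected: <class 'list'>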
@@ -0,0 +1,32 @@
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain_openai import OpenAI


# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)

# Depending on the application, memory is the next component
# that will complete a chain. LangChain provides a ConversationChain
# to track previous prompts and responses using the ConversationBufferMemory class.

from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory

output_parser = CommaSeparatedListOutputParser()
conversation = ConversationChain(
    llm=llm,
    memory=ConversationBufferMemory()
)

result = conversation.predict(input="List all possible words as substitutes for 'artificial', comma separated.")
print(result)

# Now, we can ask it to return the next four replacement words.
# It uses the memory to find the next options.

result = conversation.predict(input="And the next 4?")
print(result)
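
# For illustration, the accumulated conversation history can be inspected
# directly (the buffer attribute of ConversationBufferMemory):
print(conversation.memory.buffer)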


@@ -0,0 +1,56 @@

from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain_openai import OpenAI


# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)



# Another helpful feature is using a sequential chain that concatenates
# multiple chains into one. The following code shows a sample usage.


# poet
poet_template: str = """You are an American poet, your job is to come up with \
poems based on a given theme.
Here is the theme you have been asked to generate a poem on:
{input}\
"""

poet_prompt_template: PromptTemplate = PromptTemplate(
    input_variables=["input"], template=poet_template)

# creating the poet chain
poet_chain: LLMChain = LLMChain(
    llm=llm, output_key="poem", prompt=poet_prompt_template)

# critic
critic_template: str = """You are a critic of poems, you are tasked \
to inspect the themes of poems. Identify whether the poem includes romantic expressions or descriptions of nature.
Your response should be in the following format, as a Python dictionary.
poem: this should be the poem you received
Romantic_expressions: True or False
Nature_descriptions: True or False
Here is the poem submitted to you:
{poem}\
"""

critic_prompt_template: PromptTemplate = PromptTemplate(
    input_variables=["poem"], template=critic_template)

# creating the critic chain using the LCEL pipe syntax
critic_chain = critic_prompt_template | llm

# run the poet chain first, then feed the resulting poem to the critic chain
poem = poet_chain.run("The sun is shining bright")
result = critic_chain.invoke({"poem": poem})

print(poem)
print(result)
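
# Alternatively (a sketch, assuming the classic SimpleSequentialChain API,
# which pipes each chain's single output into the next chain's single input;
# both steps must be LLMChains for this to work):
from langchain.chains import SimpleSequentialChain

critic_llm_chain: LLMChain = LLMChain(llm=llm, prompt=critic_prompt_template)
overall_chain = SimpleSequentialChain(chains=[poet_chain, critic_llm_chain], verbose=True)
result = overall_chain.run("The sun is shining bright")
print(result)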
@@ -0,0 +1,29 @@

from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.output_parsers import CommaSeparatedListOutputParser


output_parser = CommaSeparatedListOutputParser()

# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)


template = """List all possible words as substitutes for 'artificial', comma separated.
Current conversation:
{history}
{input}"""

conversation = ConversationChain(
    llm=llm,
    prompt=PromptTemplate(template=template, input_variables=["history", "input"], output_parser=output_parser),
    memory=ConversationBufferMemory(),
    verbose=True)

result = conversation.predict(input="")
print(result)
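
# A follow-up turn (a sketch): thanks to the {history} placeholder and the
# buffer memory, the model sees the first exchange and can continue the list.
result = conversation.predict(input="And the next 4?")
print(result)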
@@ -0,0 +1,18 @@
import yt_dlp

def download_mp4_from_youtube(url):
    # Set the options for the download
    filename = 'lecuninterview.mp4'
    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
        'outtmpl': filename,
        'quiet': True,
    }

    # Download the video file
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(url, download=True)

url = "https://www.youtube.com/watch?v=mBjPyte2ZZo"

download_mp4_from_youtube(url)
@@ -0,0 +1,8 @@
import whisper

model = whisper.load_model("base")
result = model.transcribe("lecuninterview.mp4")
print(result['text'])

with open('text.txt', 'w') as file:
    file.write(result['text'])
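
# Whisper also returns per-segment timestamps; a sketch of printing them
# (keys per the openai-whisper result dictionary):
for segment in result['segments']:
    print(f"[{segment['start']:.1f}s -> {segment['end']:.1f}s] {segment['text']}")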
@@ -0,0 +1,103 @@
import textwrap

from termcolor import colored

from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI

llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)



# The following creates an instance of the RecursiveCharacterTextSplitter
# class, which is responsible for splitting input text into smaller chunks.

from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"]
)

# It is configured with a chunk_size of 1000 characters,
# no chunk_overlap, and uses spaces, commas, and newline characters as separators.
# This ensures that the input text is broken down into manageable pieces,
# allowing for efficient processing by the language model.

from langchain.docstore.document import Document

with open('text.txt') as f:
    text = f.read()

texts = text_splitter.split_text(text)
docs = [Document(page_content=t) for t in texts[:4]]

# Each Document object is initialized with the content of a chunk from the texts list.
# The [:4] slice notation indicates that only the first four chunks will be used
# to create the Document objects.


chain = load_summarize_chain(llm, chain_type="map_reduce")

output_summary = chain.run(docs)
wrapped_text = textwrap.fill(output_summary, width=100)
print("----- SUMMARY -----")
print(colored(wrapped_text, 'yellow'))

# With the following line of code, we can see the prompt template
# that is used with the map_reduce technique.

print ("------ PROMPT TEMPLATE ------")


print(colored(chain.llm_chain.prompt.template, 'yellow'))
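
# The reduce/combine step has its own prompt as well; the attribute path
# varies across LangChain versions (a sketch, assuming the 0.1.x layout):
# print(chain.reduce_documents_chain.combine_documents_chain.llm_chain.prompt.template)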

# The "stuff" approach is the simplest and most naive one,
# in which all the text from the transcribed video is used in a single prompt.
# This method may raise exceptions if all text is longer than the available
# context size of the LLM and may not be the most efficient way to handle large amounts of text.
# We’re going to experiment with the prompt below.
# This prompt will output the summary as bullet points.

prompt_template = """Write a concise bullet point summary of the following:
{text}
CONCISE SUMMARY IN BULLET POINTS:"""

BULLET_POINT_PROMPT = PromptTemplate(template=prompt_template,
                                     input_variables=["text"])

# Here, we initialize the summarization chain using "stuff" as the chain_type and the prompt above.

chain = load_summarize_chain(llm,
                             chain_type="stuff",
                             prompt=BULLET_POINT_PROMPT)

output_summary = chain.run(docs)

wrapped_text = textwrap.fill(output_summary,
                             width=1000,
                             break_long_words=False,
                             replace_whitespace=False)
print("----- CONCISE SUMMARY -----")
print(colored(wrapped_text, 'yellow'))

# The 'refine' summarization chain is a method for generating more accurate
# and context-aware summaries. This chain type is designed to iteratively
# refine the summary by providing additional context when needed.
# In other words, it generates a summary of the first chunk; then,
# for each successive chunk, the work-in-progress summary is
# combined with the new information from that chunk.

chain = load_summarize_chain(llm, chain_type="refine")

output_summary = chain.run(docs)
wrapped_text = textwrap.fill(output_summary, width=100)
print("----- REFINED SUMMARY -----")
print(colored(wrapped_text, 'yellow'))
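
# The refine chain also accepts custom prompts. A minimal sketch, assuming
# the classic load_summarize_chain parameters (refine_prompt must expose the
# existing_answer and text variables used by the default refine prompt):
refine_prompt = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=(
        "Your job is to refine an existing summary.\n"
        "Existing summary: {existing_answer}\n"
        "New context: {text}\n"
        "Refine the existing summary with the new context where it helps."
    ),
)
chain = load_summarize_chain(llm, chain_type="refine", refine_prompt=refine_prompt)
output_summary = chain.run(docs)
print(colored(textwrap.fill(output_summary, width=100), 'yellow'))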