Commit 3c349bc: added youtube transcript and code base analyzer
manufy committed Jun 6, 2024 (1 parent: b57cc5e)
Showing 16 changed files with 744 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .gitignore
@@ -17,6 +17,6 @@ old
*photo*
*.png
*.pdf

*.mp4


@@ -0,0 +1,68 @@
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import OpenAI


prompt_template = "What is a word to replace the following: {word}?"

# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)

llm_chain = LLMChain(
    llm=llm,
    prompt=PromptTemplate.from_template(prompt_template)
)

result = llm_chain("artificial")
print(result)

# It is also possible to use the .apply() method to pass multiple inputs
# at once and receive a result for each input.
# The only difference is that the inputs themselves are not included
# in the returned list. Nonetheless, the returned list preserves
# the order of the inputs.

input_list = [
    {"word": "artificial"},
    {"word": "intelligence"},
    {"word": "robot"}
]

result = llm_chain.apply(input_list)
print(result)

# The .generate() method returns an instance of LLMResult,
# which provides more information.
# For example, the finish_reason key indicates why
# the generation process stopped.
# It could be "stop", meaning the model decided
# to finish, or "length", meaning it reached the length limit.
# There is other self-explanatory information,
# such as the total number of tokens used and the model name.

result = llm_chain.generate(input_list)
print(result)
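
# A sketch of inspecting that extra information (field names per LangChain's
# LLMResult as populated by OpenAI completion models):
for generations in result.generations:
    for generation in generations:
        print(generation.text, generation.generation_info.get("finish_reason"))
print(result.llm_output)  # e.g. total token usage and the model name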

# The next method we will discuss is .predict() (which can be used
# interchangeably with .run()).
# Its best use case is passing multiple input variables to a single prompt.
# However, it is possible to use it with one input variable as well.
# The following prompt passes both the word we want a substitute
# for and the context the model must consider.

prompt_template = "Looking at the context of '{context}'. What is an appropriate word to replace the following: {word}?"

llm_chain = LLMChain(
    llm=llm,
    prompt=PromptTemplate(template=prompt_template, input_variables=["word", "context"]))

result = llm_chain.predict(word="fan", context="object")
# or llm_chain.run(word="fan", context="object")
print(result)

# The model correctly suggested that "ventilator" would be a suitable replacement
# for the word "fan" in the context of objects. Furthermore,
# when we repeat the experiment with a different context, humans,
# the output changes to "admirer".

result = llm_chain.predict(word="fan", context="humans")
# or llm_chain.run(word="fan", context="humans")
print(result)
@@ -0,0 +1,20 @@

from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain_openai import OpenAI


# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)

output_parser = CommaSeparatedListOutputParser()
template = """List all possible words as substitutes for 'artificial', comma separated."""

llm_chain = LLMChain(
    llm=llm,
    prompt=PromptTemplate(template=template, output_parser=output_parser, input_variables=[]),
    output_parser=output_parser)

result = llm_chain.predict()
print(result)
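
# Note (an assumption based on classic LangChain behavior): because
# output_parser was passed to the LLMChain itself, predict() returns the
# parsed Python list rather than the raw comma-separated string.
print(type(result))  # expected: <class 'list'>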
@@ -0,0 +1,32 @@
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain_openai import OpenAI


# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)

# Depending on the application, memory is the next component
# that will complete a chain. LangChain provides a ConversationChain
# to track previous prompts and responses using the ConversationBufferMemory class.

from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory

output_parser = CommaSeparatedListOutputParser()
conversation = ConversationChain(
    llm=llm,
    memory=ConversationBufferMemory()
)

result = conversation.predict(input="List all possible words as substitutes for 'artificial', comma separated.")
print(result)

# Now, we can ask it to return the next four replacement words.
# It uses the memory to find the next options.

result = conversation.predict(input="And the next 4?")
print(result)
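
# For illustration, the accumulated conversation history can be inspected
# directly (the buffer attribute of ConversationBufferMemory):
print(conversation.memory.buffer)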


@@ -0,0 +1,56 @@

from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain_openai import OpenAI


# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)



# Another helpful feature is using a sequential chain that concatenates
# multiple chains into one. The following code shows a sample usage.


# poet
poet_template: str = """You are an American poet, your job is to come up with \
poems based on a given theme.
Here is the theme you have been asked to generate a poem on:
{input}\
"""

poet_prompt_template: PromptTemplate = PromptTemplate(
    input_variables=["input"], template=poet_template)

# creating the poet chain
poet_chain: LLMChain = LLMChain(
    llm=llm, output_key="poem", prompt=poet_prompt_template)

# critic
critic_template: str = """You are a critic of poems, you are tasked \
to inspect the themes of poems. Identify whether the poem includes romantic expressions or descriptions of nature.
Your response should be in the following format, as a Python dictionary.
poem: this should be the poem you received
Romantic_expressions: True or False
Nature_descriptions: True or False
Here is the poem submitted to you:
{poem}\
"""

critic_prompt_template: PromptTemplate = PromptTemplate(
    input_variables=["poem"], template=critic_template)

# creating the critic chain using the LCEL pipe syntax
critic_chain = critic_prompt_template | llm

# run the poet chain first, then feed the resulting poem to the critic chain
poem = poet_chain.run("The sun is shining bright")
result = critic_chain.invoke({"poem": poem})

print(poem)
print(result)
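
# Alternatively (a sketch, assuming the classic SimpleSequentialChain API,
# which pipes each chain's single output into the next chain's single input;
# both steps must be LLMChains for this to work):
from langchain.chains import SimpleSequentialChain

critic_llm_chain: LLMChain = LLMChain(llm=llm, prompt=critic_prompt_template)
overall_chain = SimpleSequentialChain(chains=[poet_chain, critic_llm_chain], verbose=True)
result = overall_chain.run("The sun is shining bright")
print(result)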
@@ -0,0 +1,29 @@

from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.output_parsers import CommaSeparatedListOutputParser


output_parser = CommaSeparatedListOutputParser()

# Set the "OPENAI_API_KEY" environment variable before running the following line.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)


template = """List all possible words as substitutes for 'artificial', comma separated.
Current conversation:
{history}
{input}"""

conversation = ConversationChain(
    llm=llm,
    prompt=PromptTemplate(template=template, input_variables=["history", "input"], output_parser=output_parser),
    memory=ConversationBufferMemory(),
    verbose=True)

result = conversation.predict(input="")
print(result)
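
# A follow-up turn (a sketch): thanks to the {history} placeholder and the
# buffer memory, the model sees the first exchange and can continue the list.
result = conversation.predict(input="And the next 4?")
print(result)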
@@ -0,0 +1,18 @@
import yt_dlp

def download_mp4_from_youtube(url):
    # Set the options for the download
    filename = 'lecuninterview.mp4'
    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
        'outtmpl': filename,
        'quiet': True,
    }

    # Download the video file
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(url, download=True)

url = "https://www.youtube.com/watch?v=mBjPyte2ZZo"

download_mp4_from_youtube(url)
@@ -0,0 +1,8 @@
import whisper

model = whisper.load_model("base")
result = model.transcribe("lecuninterview.mp4")
print(result['text'])

with open('text.txt', 'w') as file:
    file.write(result['text'])
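
# Whisper also returns per-segment timestamps; a sketch of printing them
# (keys per the openai-whisper result dictionary):
for segment in result['segments']:
    print(f"[{segment['start']:.1f}s -> {segment['end']:.1f}s] {segment['text']}")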
@@ -0,0 +1,103 @@
import textwrap

from termcolor import colored

from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI

llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)



# The following creates an instance of the RecursiveCharacterTextSplitter
# class, which is responsible for splitting input text into smaller chunks.

from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"]
)

# It is configured with a chunk_size of 1000 characters,
# no chunk_overlap, and uses spaces, commas, and newline characters as separators.
# This ensures that the input text is broken down into manageable pieces,
# allowing for efficient processing by the language model.

from langchain.docstore.document import Document

with open('text.txt') as f:
    text = f.read()

texts = text_splitter.split_text(text)
docs = [Document(page_content=t) for t in texts[:4]]

# Each Document object is initialized with the content of a chunk from the texts list.
# The [:4] slice notation indicates that only the first four chunks will be used
# to create the Document objects.


chain = load_summarize_chain(llm, chain_type="map_reduce")

output_summary = chain.run(docs)
wrapped_text = textwrap.fill(output_summary, width=100)
print("----- SUMMARY -----")
print(colored(wrapped_text, 'yellow'))

# With the following line of code, we can see the prompt template
# that is used with the map_reduce technique.

print ("------ PROMPT TEMPLATE ------")


print(colored(chain.llm_chain.prompt.template, 'yellow'))
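
# The reduce/combine step has its own prompt as well; the attribute path
# varies across LangChain versions (a sketch, assuming the 0.1.x layout):
# print(chain.reduce_documents_chain.combine_documents_chain.llm_chain.prompt.template)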

# The "stuff" approach is the simplest and most naive one,
# in which all the text from the transcribed video is used in a single prompt.
# This method may raise exceptions if all text is longer than the available
# context size of the LLM and may not be the most efficient way to handle large amounts of text.
# We’re going to experiment with the prompt below.
# This prompt will output the summary as bullet points.

prompt_template = """Write a concise bullet point summary of the following:
{text}
CONCISE SUMMARY IN BULLET POINTS:"""

BULLET_POINT_PROMPT = PromptTemplate(template=prompt_template,
                                     input_variables=["text"])

# Here, we initialize the summarization chain using "stuff" as the chain_type and the prompt above.

chain = load_summarize_chain(llm,
                             chain_type="stuff",
                             prompt=BULLET_POINT_PROMPT)

output_summary = chain.run(docs)

wrapped_text = textwrap.fill(output_summary,
                             width=1000,
                             break_long_words=False,
                             replace_whitespace=False)
print("----- CONCISE SUMMARY -----")
print(colored(wrapped_text, 'yellow'))

# The 'refine' summarization chain is a method for generating more accurate
# and context-aware summaries. This chain type is designed to iteratively
# refine the summary by providing additional context when needed.
# In other words, it generates a summary of the first chunk; then,
# for each successive chunk, the work-in-progress summary is
# combined with the new information from that chunk.

chain = load_summarize_chain(llm, chain_type="refine")

output_summary = chain.run(docs)
wrapped_text = textwrap.fill(output_summary, width=100)
print("----- REFINED SUMMARY -----")
print(colored(wrapped_text, 'yellow'))
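
# The refine chain also accepts custom prompts. A minimal sketch, assuming
# the classic load_summarize_chain parameters (refine_prompt must expose the
# existing_answer and text variables used by the default refine prompt):
refine_prompt = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=(
        "Your job is to refine an existing summary.\n"
        "Existing summary: {existing_answer}\n"
        "New context: {text}\n"
        "Refine the existing summary with the new context where it helps."
    ),
)
chain = load_summarize_chain(llm, chain_type="refine", refine_prompt=refine_prompt)
output_summary = chain.run(docs)
print(colored(textwrap.fill(output_summary, width=100), 'yellow'))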