From 3c349bc5ecc072c0a97f6aff7cd488c35d2223c3 Mon Sep 17 00:00:00 2001 From: Manu Date: Thu, 6 Jun 2024 23:02:28 +0200 Subject: [PATCH] added youtube transcript and code base analyzer --- .gitignore | 2 +- .../1_llm_chain.py | 68 +++++++++ .../2_parsers.py | 20 +++ .../3_conversational_chain_memory.py | 32 +++++ .../4_sequential_chain.py | 56 ++++++++ .../1 Chains and Why They Are Used/5_debug.py | 29 ++++ .../1-download_mp4_from_youtuber.py | 18 +++ .../2-whisper_transcribe.py | 8 ++ .../3-summarization.py | 103 ++++++++++++++ ...videos_list_from_youtube_and_transcribe.py | 53 +++++++ .../5-summarization_videos_list.py | 92 ++++++++++++ .../requirements.txt | 3 + .../text.txt | 1 + .../1-scrape.py | 121 ++++++++++++++++ .../2-chat.py | 133 ++++++++++++++++++ .../content.txt | 6 + 16 files changed, 744 insertions(+), 1 deletion(-) create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/1_llm_chain.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/2_parsers.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/3_conversational_chain_memory.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/4_sequential_chain.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/5_debug.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /1-download_mp4_from_youtuber.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /2-whisper_transcribe.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /3-summarization.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /4-download_mp4_videos_list_from_youtube_and_transcribe.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /5-summarization_videos_list.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /requirements.txt create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /text.txt create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/1-scrape.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in 
Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/2-chat.py create mode 100644 1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/content.txt diff --git a/.gitignore b/.gitignore index 2dca553..c1bcb69 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,6 @@ old *photo* *.png *.pdf - +*.mp4 diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/1_llm_chain.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/1_llm_chain.py new file mode 100644 index 0000000..b40771c --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/1_llm_chain.py @@ -0,0 +1,68 @@ +from langchain_core.prompts import PromptTemplate +from langchain.chains import LLMChain +from langchain_openai import OpenAI + + +prompt_template = "What is a word to replace the following: {word}?" + +# Set the "OPENAI_API_KEY" environment variable before running following line. +llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0) + +llm_chain = LLMChain( + llm=llm, + prompt=PromptTemplate.from_template(prompt_template) +) + +result = llm_chain("artificial") +print(result) + +# It is also possible to use the .apply() method to pass multiple inputs +# at once and receive a list for each input. +# The sole difference lies in the exclusion of inputs within the returned list. +# Nonetheless, the returned list will maintain the identical order as the input. + +input_list = [ + {"word": "artificial"}, + {"word": "intelligence"}, + {"word": "robot"} +] + +result = llm_chain.apply(input_list) +print(result) + +# The .generate() method will return an instance of LLMResult, +# which provides more information. +# For example, the finish_reason key indicates the reason +# behind the stop of the generation process. +# It could be 'stop', meaning the model decided to finish, +# or 'length', meaning it reached the length limit. +# There is other self-explanatory information, +# like the total number of tokens used or the model used. + +result = llm_chain.generate(input_list) +print(result) + +# The next method we will discuss is .predict() (which can be used interchangeably with .run()). +# Its best use case is to pass multiple inputs for a single prompt. +# However, it is possible to use it with one input variable as well. +# The following prompt will pass both the word we want a substitute +# for and the context the model must consider. + +prompt_template = "Looking at the context of '{context}'. What is an appropriate word to replace the following: {word}?" + +llm_chain = LLMChain( + llm=llm, + prompt=PromptTemplate(template=prompt_template, input_variables=["word", "context"])) + +result = llm_chain.predict(word="fan", context="object") +# or llm_chain.run(word="fan", context="object") +print(result) + +# The model correctly suggested 'Ventilator' as a suitable replacement +# for the word 'fan' in the context of objects. Furthermore, +# when we repeat the experiment with a different context, 'humans', +# the output changes to 'Admirer'.
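+# (Aside, not part of the original lesson:) the same two-input call can also be made
+# by passing a dict of inputs to the chain itself; like the earlier llm_chain("artificial")
+# call, this returns a dict containing the inputs plus the generated "text" key.
+# A minimal sketch, kept commented so the script's behavior is unchanged:
+# print(llm_chain({"word": "fan", "context": "humans"}))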
+ +result = llm_chain.predict(word="fan", context="humans") +# or llm_chain.run(word="fan", context="humans") +print(result) diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/2_parsers.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/2_parsers.py new file mode 100644 index 0000000..806163d --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/2_parsers.py @@ -0,0 +1,20 @@ + +from langchain_core.prompts import PromptTemplate +from langchain.chains import LLMChain +from langchain.output_parsers import CommaSeparatedListOutputParser +from langchain_openai import OpenAI + + +# Set the "OPENAI_API_KEY" environment variable before running following line. +llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0) + +output_parser = CommaSeparatedListOutputParser() +template = """List all possible words as substitute for 'artificial' as comma separated.""" + +llm_chain = LLMChain( + llm=llm, + prompt=PromptTemplate(template=template, output_parser=output_parser, input_variables=[]), + output_parser=output_parser) + +result = llm_chain.predict() +print(result) \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/3_conversational_chain_memory.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/3_conversational_chain_memory.py new file mode 100644 index 0000000..733fa42 --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/3_conversational_chain_memory.py @@ -0,0 +1,32 @@ +from langchain_core.prompts import PromptTemplate +from langchain.chains import LLMChain +from langchain.output_parsers import CommaSeparatedListOutputParser +from langchain_openai import OpenAI + + +# Set the "OPENAI_API_KEY" environment variable before running following line. +llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0) + +# Depending on the application, memory is the next component +# that will complete a chain. LangChain provides a ConversationChain +# to track previous prompts and responses using the ConversationBufferMemory class. + +from langchain.chains import ConversationChain +from langchain.memory import ConversationBufferMemory + +output_parser = CommaSeparatedListOutputParser() +conversation = ConversationChain( + llm=llm, + memory=ConversationBufferMemory() +) + +result = conversation.predict(input="List all possible words as substitute for 'artificial' as comma separated.") +print(result) + +# Now, we can ask it to return the next four replacement words. +# It uses the memory to find the next options.
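+# (Optional aside, an addition on top of the lesson code:) before asking the follow-up,
+# we can peek at what the memory will prepend to the next prompt;
+# ConversationBufferMemory keeps the running transcript in its `buffer` attribute.
+print(conversation.memory.buffer)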
+ +result=conversation.predict(input="And the next 4?") +print(result) + + diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/4_sequential_chain.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/4_sequential_chain.py new file mode 100644 index 0000000..8c44caf --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/4_sequential_chain.py @@ -0,0 +1,56 @@ + +from langchain_core.prompts import PromptTemplate +from langchain.chains import LLMChain +from langchain.output_parsers import CommaSeparatedListOutputParser +from langchain_openai import OpenAI + + +# Set the "OPENAI_API_KEY" environment variable before running following line. +llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0) + + + +# Another helpful feature is using a sequential chain that concatenates +# multiple chains into one. The following code shows a sample usage. + + +# poet +poet_template: str = """You are an American poet, your job is to come up with\ +poems based on a given theme. + +Here is the theme you have been asked to generate a poem on: +{input}\ +""" + +poet_prompt_template: PromptTemplate = PromptTemplate( + input_variables=["input"], template=poet_template) + +# creating the poet chain +poet_chain: LLMChain = LLMChain( + llm=llm, output_key="poem", prompt=poet_prompt_template) + +# critic +critic_template: str = """You are a critic of poems, you are tasked\ +to inspect the themes of poems. Identify whether the poem includes romantic expressions or descriptions of nature. + +Your response should be in the following format, as a Python Dictionary. +poem: this should be the poem you received +Romantic_expressions: True or False +Nature_descriptions: True or False + +Here is the poem submitted to you: +{poem}\ +""" + +critic_prompt_template: PromptTemplate = PromptTemplate( + input_variables=["poem"], template=critic_template) + +# creating the critic chain +#critic_chain: LLMChain = LLMChain( +# llm=llm, output_key="critic_verified", prompt=critic_prompt_template) + +critic_chain = critic_prompt_template | llm +# Run the poet chain first, then hand its poem to the critic chain. +poem = poet_chain.run(input="The sun is shining bright") +result = critic_chain.invoke({"poem": poem}) + +print(poem) +print(result) \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/5_debug.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/5_debug.py new file mode 100644 index 0000000..e5786df --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/1 Chains and Why They Are Used/5_debug.py @@ -0,0 +1,29 @@ + +from langchain_core.prompts import PromptTemplate +from langchain.chains import LLMChain +from langchain_openai import OpenAI +from langchain.chains import ConversationChain +from langchain.memory import ConversationBufferMemory +from langchain.output_parsers import CommaSeparatedListOutputParser + + +output_parser = CommaSeparatedListOutputParser() + +# Set the "OPENAI_API_KEY" environment variable before running following line.
+llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0) + + +template = """List all possible words as substitute for 'artificial' as comma separated. + +Current conversation: +{history} + +{input}""" + +conversation = ConversationChain( + llm=llm, + prompt=PromptTemplate(template=template, input_variables=["history", "input"], output_parser=output_parser), + memory=ConversationBufferMemory(), + verbose=True) + +result = conversation.predict(input="") \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /1-download_mp4_from_youtuber.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /1-download_mp4_from_youtuber.py new file mode 100644 index 0000000..c6abe6a --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /1-download_mp4_from_youtuber.py @@ -0,0 +1,18 @@ +import yt_dlp + +def download_mp4_from_youtube(url): + # Set the options for the download + filename = 'lecuninterview.mp4' + ydl_opts = { + 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]', + 'outtmpl': filename, + 'quiet': True, + } + + # Download the video file + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + result = ydl.extract_info(url, download=True) + +url = "https://www.youtube.com/watch?v=mBjPyte2ZZo" + +download_mp4_from_youtube(url) \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /2-whisper_transcribe.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /2-whisper_transcribe.py new file mode 100644 index 0000000..ec92203 --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /2-whisper_transcribe.py @@ -0,0 +1,8 @@ +import whisper + +model = whisper.load_model("base") +result = model.transcribe("lecuninterview.mp4") +print(result['text']) + +with open ('text.txt', 'w') as file: + file.write(result['text']) \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /3-summarization.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /3-summarization.py new file mode 100644 index 0000000..6d245fb --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /3-summarization.py @@ -0,0 +1,103 @@ +#from langchain import OpenAI, LLMChain +from langchain.chains.mapreduce import MapReduceChain +#from langchain.prompts import PromptTemplate +from langchain.chains.summarize import load_summarize_chain + +from langchain_openai import OpenAI +from 
langchain_core.prompts import PromptTemplate + +llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0) + + + +# Create an instance of the RecursiveCharacterTextSplitter +# class, which is responsible for splitting input text into smaller chunks. + +from langchain.text_splitter import RecursiveCharacterTextSplitter +text_splitter = RecursiveCharacterTextSplitter( + chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"] +) + +# It is configured with a chunk_size of 1000 characters, +# no chunk_overlap, and uses spaces, commas, and newline characters as separators. +# This ensures that the input text is broken down into manageable pieces, +# allowing for efficient processing by the language model. + +from langchain.docstore.document import Document + +with open('text.txt') as f: + text = f.read() + +texts = text_splitter.split_text(text) +docs = [Document(page_content=t) for t in texts[:4]] + +# Each Document object is initialized with the content of a chunk from the texts list. +# The [:4] slice notation indicates that only the first four chunks will be used +# to create the Document objects. + +from langchain.chains.summarize import load_summarize_chain +import textwrap + +chain = load_summarize_chain(llm, chain_type="map_reduce") + +output_summary = chain.run(docs) +wrapped_text = textwrap.fill(output_summary, width=100) +from termcolor import colored +print ("----- SUMMARY -----") +print(colored(wrapped_text, 'yellow')) + +# With the following line of code, we can see the prompt template +# that is used with the map_reduce technique. +# Now we’re changing the prompt and using another summarization method + +print ("------ PROMPT TEMPLATE ------") + + +print(colored(chain.llm_chain.prompt.template, 'yellow')) + +# The "stuff" approach is the simplest and most naive one, +# in which all the text from the transcribed video is used in a single prompt. +# This method may raise exceptions if the combined text is longer than the available +# context size of the LLM and may not be the most efficient way to handle large amounts of text. +# We’re going to experiment with the prompt below. +# This prompt will output the summary as bullet points. + +prompt_template = """Write a concise bullet point summary of the following: + + +{text} + + +CONCISE SUMMARY IN BULLET POINTS:""" + +BULLET_POINT_PROMPT = PromptTemplate(template=prompt_template, + input_variables=["text"]) + +# Also, we initialize the summarization chain using "stuff" as the chain_type and the prompt above. + +chain = load_summarize_chain(llm, + chain_type="stuff", + prompt=BULLET_POINT_PROMPT) + +output_summary = chain.run(docs) + +wrapped_text = textwrap.fill(output_summary, + width=1000, + break_long_words=False, + replace_whitespace=False) +print ("----- CONCISE SUMMARY -----") +print(colored(wrapped_text, 'yellow')) + +# The 'refine' summarization chain is a method for generating more accurate +# and context-aware summaries. This chain type is designed to iteratively +# refine the summary by providing additional context when needed. +# That means: it generates the summary of the first chunk. +# Then, for each successive chunk, the work-in-progress +# summary is integrated with new information from the next chunk.
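+# (Hedged aside, not part of the course code:) the refine loader can also take custom
+# prompts. The parameter names below (question_prompt, refine_prompt) are the ones the
+# classic refine summarize chain expects, so treat this as a sketch rather than the
+# lesson's own recipe; the plain refine chain right below is what the lesson actually runs.
+# refine_template = (
+#     "We have an existing summary: {existing_answer}\n"
+#     "Refine it, only if needed, with the additional context below:\n{text}\n"
+#     "REFINED BULLET POINT SUMMARY:"
+# )
+# REFINE_PROMPT = PromptTemplate(template=refine_template, input_variables=["existing_answer", "text"])
+# chain = load_summarize_chain(llm, chain_type="refine",
+#                              question_prompt=BULLET_POINT_PROMPT, refine_prompt=REFINE_PROMPT)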
+ +chain = load_summarize_chain(llm, chain_type="refine") + +output_summary = chain.run(docs) +wrapped_text = textwrap.fill(output_summary, width=100) +print("----- REFINED SUMMARY -----") +print(colored(wrapped_text, 'yellow')) \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /4-download_mp4_videos_list_from_youtube_and_transcribe.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /4-download_mp4_videos_list_from_youtube_and_transcribe.py new file mode 100644 index 0000000..88bdad0 --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /4-download_mp4_videos_list_from_youtube_and_transcribe.py @@ -0,0 +1,53 @@ +import yt_dlp + +# Download a list of YouTube URLs as mp4 files + +def download_mp4_from_youtube(urls, job_id): + # This will hold the titles and authors of each downloaded video + video_info = [] + + for i, url in enumerate(urls): + # Set the options for the download + file_temp = f'./{job_id}_{i}.mp4' + ydl_opts = { + 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]', + 'outtmpl': file_temp, + 'quiet': True, + } + + # Download the video file + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + result = ydl.extract_info(url, download=True) + title = result.get('title', "") + author = result.get('uploader', "") + + # Add the title and author to our list + video_info.append((file_temp, title, author)) + + return video_info + +urls=["https://www.youtube.com/watch?v=mBjPyte2ZZo&t=78s", + "https://www.youtube.com/watch?v=cjs7QKJNVYM",] +videos_details = download_mp4_from_youtube(urls, 1) + +# transcribe the videos and save them to a text file + +import whisper + + +# load the model +model = whisper.load_model("base") + +# iterate through each video and transcribe +results = [] +for video in videos_details: + result = model.transcribe(video[0]) + results.append( result['text'] ) + print(f"Transcription for {video[0]}:\n{result['text']}\n") + +with open ('text.txt', 'w') as file: + file.write("\n\n".join(results)) + + + + \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /5-summarization_videos_list.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /5-summarization_videos_list.py new file mode 100644 index 0000000..b2a9352 --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /5-summarization_videos_list.py @@ -0,0 +1,92 @@ +from langchain.text_splitter import RecursiveCharacterTextSplitter + +text_splitter = RecursiveCharacterTextSplitter( + chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"] +) + +# Load the texts +with open('text.txt') as f: + text = f.read() +texts = text_splitter.split_text(text) + +# Split the documents +text_splitter = RecursiveCharacterTextSplitter( + chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"] + ) +texts =
text_splitter.split_text(text) + +# Pack the chunks into Document objects + +from langchain.docstore.document import Document + +docs = [Document(page_content=t) for t in texts[:4]] + +# import Deep Lake and build a database with embedded documents: + +from langchain.vectorstores import DeepLake +from langchain.embeddings.openai import OpenAIEmbeddings + +embeddings = OpenAIEmbeddings(model='text-embedding-ada-002') + +# create Deep Lake dataset +# TODO: use your organization id here. (by default, org id is your username) +import os +my_activeloop_org_id = os.environ["ACTIVELOOP_ORG_ID"] +my_activeloop_dataset_name = "langchain_course_youtube_summarizer" +dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}" + +db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings) +db.add_documents(docs) + +# use a retriever to get the documents + +retriever = db.as_retriever() +retriever.search_kwargs['distance_metric'] = 'cos' +retriever.search_kwargs['k'] = 4 + +#  The distance metric determines how the Retriever measures "distance" or similarity +# between different data points in the database. +# By setting distance_metric to 'cos', the Retriever will use +# cosine similarity as its distance metric. Cosine similarity is a +# measure of similarity between two non-zero vectors of an inner product space +# that measures the cosine of the angle between them. +# It's often used in information retrieval to measure the similarity +# between documents or pieces of text. Also, by setting 'k' to 4, +# the Retriever will return the 4 most similar or closest results +# according to the distance metric when a search is performed. + +# We can construct and use a custom prompt template with the QA chain. +# The RetrievalQA chain is useful to query similar contents from +# the database and use the returned records as context to answer questions. +# The custom prompt ability gives us the flexibility to define +# custom tasks like retrieving the documents and summarizing the results +# in a bullet-point style. + +from langchain.prompts import PromptTemplate +prompt_template = """Use the following pieces of transcripts from a video to answer the question in bullet points and summarized. If you don't know the answer, just say that you don't know, don't try to make up an answer. + +{context} + +Question: {question} +Summarized answer in bullet points:""" +PROMPT = PromptTemplate( + template=prompt_template, input_variables=["context", "question"] +) + + +# Lastly, we can use the chain_type_kwargs argument to define the +# custom prompt; for the chain type, the ‘stuff’ variation was picked. +# You can perform and test other types as well, as seen previously.
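+# (Aside, a hedged addition:) if you also want to see which transcript chunks the answer
+# was grounded in, RetrievalQA accepts return_source_documents=True; the chain is then
+# called with a dict and returns both "result" and "source_documents". A sketch, kept
+# commented because it reuses the qa objects that are only built just below:
+# qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever,
+#                                  chain_type_kwargs=chain_type_kwargs,
+#                                  return_source_documents=True)
+# response = qa({"query": "Summarize the mentions of google according to their AI program"})
+# print(response["result"])
+# print(len(response["source_documents"]))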
+ +from langchain.chains import RetrievalQA +from langchain_openai import OpenAI + +llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0) + +chain_type_kwargs = {"prompt": PROMPT} +qa = RetrievalQA.from_chain_type(llm=llm, + chain_type="stuff", + retriever=retriever, + chain_type_kwargs=chain_type_kwargs) + +print( qa.run("Summarize the mentions of google according to their AI program") ) \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /requirements.txt b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /requirements.txt new file mode 100644 index 0000000..743d36b --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /requirements.txt @@ -0,0 +1,3 @@ +#brew install ffmpeg + yt_dlp +openai-whisper \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /text.txt b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /text.txt new file mode 100644 index 0000000..c62a12b --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/2 Create a YouTube Video Summarizer Using Whisper and LangChain /text.txt @@ -0,0 +1 @@ + Hi, I'm Craig Smith and this is I on A On. This week I talked to Jan LeCoon, one of the seminal figures in deep learning development and a long time proponent of self-supervised learning. Jan spoke about what's missing in large language models and about his new joint embedding predictive architecture which may be a step toward filling that gap. He also talked about his theory of consciousness and the potential for AI systems to someday exhibit the features of consciousness. It's a fascinating conversation that I hope you'll enjoy. Okay, so Jan, it's great to see you again. I wanted to talk to you about where you've gone with so supervised learning since last week spoke. In particular, I'm interested in how it relates to large language models because the large language models really came on stream since we spoke. In fact, in your talk about JEPA, which is joint embedding predictive architecture. There you go. Thank you. You mentioned that large language models lack a world model. I wanted to talk first about where you've gone with self-supervised learning and where this latest paper stands in your trajectory. But to start, if you could just introduce yourself and we'll go from there. Okay, so my name is Jan Le Ka or Jan Le Koon who want to do it in Gilleswee and I'm a professor at New York University and at the Quarantine Institute in the Center for Data Science. And I'm also the chief AI scientist at Fair, which is the fundamental AI research lab. That's what Fair stands for. Admetta, Neil, Facebook. So tell me about where you've gone with self-supervised learning, how the joint embedding predictive architecture fits into your research. 
And then if you could talk about how that relates to what's lacking in large language models. Okay, self-supervised learning has been, has basically brought about a revolution in natural language processing because of their use for pre-training transformer architectures. And the fact that we use transformer architectures for that is somewhat orthogonal to the fact that we use self-supervised learning. But the way those systems are trained is that you take a piece of text, you remove some of the words, you replace them by black markers, and then you train the very large neural net to predict the words that are missing. That's a pre-training phase. And then in the process of training itself to do so, the system learns good representations of text that you can then use as input to its subsequent downstream task, I don't know, translation or Hitchbitch detection or something like that. So that's been a career revolution over the last three or four years. And including in sort of very practical applications, like every sort of type of performing contact moderation systems on Facebook, Google, YouTube, et cetera, use this kind of technique. And there's all kinds of other applications. Now, large language models are partially this, but also the idea that you can train those things to just predict the next word in a text. And if you use that, you can have those system generate text spontaneously. So there's a few issues with this. First of all, those things are what's called generative models in the sense that they predict the words, the information that is missing, words in this case. And the problem with generative models is that it's very difficult to represent uncertain predictions. So in the case of words, it's easy because we just have the system produce essentially what amounts to a score or a probability for every word in the dictionary. And so it cannot tell you if the word missing in a sentence like the blank chases the mouse in the kitchen. It's probably a cat, could be a dog, but it's probably a cat, right? So you have some distribution of probability over all words in the dictionary. And you can handle uncertainty in the prediction this way. But then what if you want to apply this to let's say video, right? So you show a video to the system. You remove some of the frames in that video and you train you to predict the frames that I'm missing. For example, predict what comes next in a video and that doesn't work. And it doesn't work because it's very difficult to train the system to predict an image or whole image. We have techniques for that for generating images before actually predicting good images that could fit in the video. It doesn't work very well. Or if it works, it doesn't produce internal representations that are particularly good for downstream task like object recognition or something of that time. So attempting to transfer those SSL method that are successful in LP into the realm of images has not been a big success. It's been somewhat of a success in audio. But really the only thing that works in the domain of images is those generating architectures where instead of predicting the image, you predict a representation of the image, right? So you feed. Let's say one view of a scene to the system. You run it to something on that that computes a representation of it. And then you take a different view of the same scene. 
You run it through the same network that produces another representation and you train the system in such a way that those two representations are as close to each other as possible. And the only thing the systems can agree on is the content of the image. So they end up including the content of the image independently of the viewpoint. The difficulty of making this work is to make sure that when you show two different images, it would produce different representations. So to make sure that there are informative value inputs and your system didn't collapse. I've just produced always the same representation for everything. But that's the reason why the techniques that have been generative architectures have been successful in LP aren't working so well. In images is their inability to represent complicated complicated uncertainties if you want. So now that's for training a system in SSL to learn representations of data. But what I've been proposing to do in the position paper I published a few months ago is the idea that we should use SSL to get machines to learn predictive world models. So basically to predict where the world world is going to evolve. So predict the continuation of a video, for example. Possibly predict how it's going to evolve as a consequence of an action that an intelligent agent might take. Because if we have such a world model in an agent, the agent being capable of predicting what's going to happen as a consequence of its action will be able to plan complex sequence of actions to arrive at a particular goal. And that's what's missing from all the pretty much all the AI systems that everybody has been working on. Or has been talking about loudly. Except for a few people who are working on robotics or it's absolutely necessary. So some of the interesting work there comes out of the robotics community, the sort of machine learning and robotics committee. Because there you need to have the skip ability for planning. And the work that you've been doing is it possible to build that into a large language model or is it incompatible with the architecture of large language models. It is compatible with large language models. And in fact, it might solve some of the problems that we're observing with large language models. One point is large language models is that when you use them to generate text, you initialize them with a prompt. So you type in an initial segment of a text, which could be in the form of a question or something. And then you hope that it will generate a consistent answer to that text. And the problem with that is that those systems generate text that sounds fine grammatically. But semantically, but sometimes they make various stupid mistakes. And those mistakes are due to two things. The first thing is that to generate that text, they don't really have some sort of objective. But then just satisfying the sort of statistical consistency with the prompt that was typed. So there is no way to control the type of answer that will produce. At least no direct way, if you want. That's the first problem. And then the second problem, which is much more acute, is the fact that those large language models have no idea of the underlying reality that language describes. And so there is a limit to how smart it can be and how accurate it can be because they have no experience of the real world, which is really the underlying reality of language. 
So their understanding of reality is extremely superficial and only contained in whatever is contained in language that they've been trained on. And that's very shallow. Most of human knowledge is completely non-linguistic. It's very difficult for us to realize that's the case. But most of what we learn is nothing to do with language. Language is built on top of a massive amount of background knowledge that we all have in common, that we could come in sense. And those machines don't have that, but a cat has it, a dog has it. So we're able to reproduce some of the linguistic abilities of humans without having all the basics that a cat or dog has about how the world works. And that's why the systems are. I have failures is actually. So I think what we would need is an ability for machines to learn how the world works by observation in the manner of babies and infants and young animals. Accumulate all the background knowledge about the world that constitutes the basis of common sense if you want. And then use this word model as the tool for being able to plan sequence of actions to arrive at a goal. So setting goals is also an ability that humans and many animals have. The settings of goals for arriving at an overall goal and then planning sequences of actions to satisfy those goals. And those my goals don't have any of that. They don't have an understanding of the learning world. They don't have a capability of planning for planning. They don't have goals. They can just send send themselves goals other than through typing a point, which is a very weak way. Where are you in your experimentation with this? Jet architecture. So pretty early. So we have forms of it simplified form of them that we call joint-time meeting architectures without the P without the predictive. And they work quite well for learning representations of images. So you take an image you destroy it a little bit and you train on that to produce. Essentially we're also identical representations for those two distorted versions of the same image. And then you have some mechanism for making sure that it produces different representations for different images. And so that works really well. And we have simple forms of. Jet out the predictive version where the representation of one image is predicted from the representation of the other one. One version of this was actually presented that nervous this is called V. Craig L. For local and. It works very well for training that neural net to learn representations are good for image cementation for example. But we're still working on. It recipe if you want for a system that would be able to learn. The properties of the world by watching videos understanding for example very basic concepts like the word is three dimensional. The system could discover that the world is three dimensional by being shown video with the moving camera. And the best way to explain how the view of the world changes as the camera moves is that every pixel does a depth that explaining products motion, et cetera. Once that concept is learned then the notion of objects and. Occlusion objects are in front of others naturally emerges because objects are. Part of the image that moved together with products motion. At least in animated objects. Animate objects are objects that move by themselves so that could be also a natural distinction. This ability to spontaneously form the categories the babies do this at the age of a few months. They have an idea without having the names of anything they know right. 
They can tell a car from a bicycle chair table a tree excited. And then on top of this you can build. Notions of into the physics the fact that objects are not supported with all for example the babies on this at the age of nine months roughly it's pretty late and inertia six things are that type. And then after you've acquired those basic. Knowledge background knowledge about how the world works then. You have pretty good ability to predict and you can also predict perhaps the consequence of your actions when you start acting in the world. And then that gives you the ability to plan perhaps it gives you some basis for common sense. So that's the progression that we need to do we don't know how to do any of this yet. We don't have a good recipe for training system to predict. What's going to happen in the video for example within any degree of usefulness. Just for the training portion how much data would you need it seems to me you would need a tremendous amount of data. We need a couple hours of Instagram or YouTube that would be enough really the amount of data of raw video data that's available. It's incredibly large if you think about. Let's say five year old child and let's imagine that this five year old child can. Usually analyze. Visual percept maybe ten times a second. Okay so that's ten frames per second. And if you count how many seconds they are in five years it's something like 80 millions. So the child is in a hundred eight hundred million frames right or something like that if you yeah it's an approximation is a billion. It's not that much data we can have that tomorrow by just recording like saving a YouTube video or something. So I don't think it's an issue of data I think it's more an issue of architecture training paradigm. Principles mathematics and principles on which to base this one thing of cities. If you want to solve that problem. Abandon five major pillars of machine learning one of which is those generative models and to replace them with those joint embedding architectures. A lot of people envision already convinced of that. Then to abandon the idea of doing probabilistic modeling so we're not going to be able to predict to represent usefully the probability of the continuation of a video from. Condition on what we have already observed we have to be less ambitious about or mathematical framework if you want. So I've been advocating for many years to use something called energy based models which is a weaker form of modeling under a certain tea if you want. Then there is another concept that has been popular for training joint embedding architectures over the last few years. Which had the first paper on in the early 90s actually on something called same is networks. So it's called contrastive running and I'm actually advocating against that to. So used to this idea that once in a while you have to cover up new ideas and. And it's going to be very difficult to convince people who are very attached to those ideas to abandon them, but I think it's time for that to happen. Once you've trained one of these networks and you've established a world model. How do you transfer that to the equivalent of a large language model. One of the things that's fascinating about the development of LLM's in the last couple of years is that they're now multi model. They're not purely text and language. So how do you combine these two ideas or can you or do you need to. Yeah, so there's two or three different questions in that one question. One of them is. 
Can we usually transform existing language models. Whose purpose is only to produce text in such a way that they have. They can do the planning and objectives and things like that. The answer is yes, that's probably fairly simple to do. Can we can we train language model purely on language and expected to understand the underlying reality and the answer is no. And in fact. I have a paper on this in a. All places a philosophy magazine called noina, which I co-wrote with. A car carrying philosopher who is a post document about NYU. Where we say that there is a limit to what we can do with this because most of. A human knowledge is non linguistic and. If we only train systems on language, they will have a very superficial understanding of what they're talking about. So if you want systems that are robust and work, we need them to be grounded in reality. And it's an old debate whether they are actually being grounded or not. And so the approach that some people have taken at the moment is to basically. Turn everything including images and audio. Into text or something similar to text. So you take an image, you cut it into little squares, you turn those squares into vectors that's called tokenization. And now the image is just a sequence of tokens. The text is a sequence of words, right? You do this with everything and you get those multiple systems and they do something. Okay, now clear. That's the right approach long term, but they do something. I think the ingredients that I'm missing there is the fact that I think if we're dealing with sort of continuous type data like video. We should use the joint embedding architecture, not the generative architectures that large language models currently use. First of all, I don't think we should tokenize them because a lot of it get lost in translation when we tokenizing edges and videos. There's a problem also which is that those systems don't scale very well with the number of tokens you feed them with. So it works when you have a text and you need a context. To predict the next word that is maybe the 4000 last words, it's fine. But a 4000 tokens for an image or video is tiny like you need way more than that and those systems scale horribly with the number of tokens you feed them. So we're going to need to do a lot of new innovations in architectures there. And my guess is that we can't do it with generative models. I have to do the joint embedding. How does a computer recognize an image without tokenization? So, conditional nets for example, don't tokenize. They take an image as pixels, they extract local features, they detect local motifs on different windows on the image that overlap. And then those motifs get combined into other slightly less local motifs. And it's just kind of hierarchy where representations of larger and larger parts of the image are constructed as we go up in the layers. But there's no point where you cut the image into squares and you turn them into individual vectors. It's more sort of progressive. So there's been a bit of a back and forth competition between the transformer architectures that tend to rely on this tokenization and convolutional nets which which don't or in different ways. And my guess is that ultimately what would be the best solution is a combination of the two where the first few layers are more like convolutional nets. They exploit the structure of images and video certainly. And then by the time you get to up to several layers, they are the representation is more object based. 
And there you have an advantage in using those those transformers. But currently basically the image transformers only have one layer of conclusions at the bottom. And I think it's a bit of a waste and it doesn't scale very well when you want to apply the video. On the timeline, this is all moving very fast. It's moving very fast. How long do you think before you'll be able to scale this new architecture? It's not just scale is actually coming up with a good recipe that works that would allow us to just plug a large neural net or the smaller one. On on on YouTube and then learn how the work works by watching in a video. We don't have that recipe. We don't have probably don't have the architecture other than some vague idea, which I call hierarchical japa. But there's a lot of details to figure out that we haven't figured out this probably failure mode that we haven't yet encountered that we need to find solutions for. And so I can't give you a recipe and I can't tell you if welcome up with the recipe in the next six months year, two years, five years, ten years. It could be quick or it could be much more difficult than we think. But I think we're on the right path in searching for a solution in that direction. So once we come up with a good recipe. Then it will open the door to new breed of AI systems. Essentially that can. They can plan they can reason. And. It will be much more capable of having some level common sense perhaps. And have forms of intelligence that are more similar to what we observe being in animals and humans. Our work is inspired by the cognitive processes of the brain. Yeah. And that process of perception and then informing a world model. Is that confirmed in neuroscience? It's a hypothesis that is based on some evidence from both neuroscience and cognitive science. So I what I showed is proposal for what's called a cognitive architecture, which is some sort of modular architectures that. Would be capable of the things like like planning and reasoning that we observe in capabilities that we observe in animals and humans. And that the current most current AI systems except for a few robotics systems don't have. So I think that's important in that respect. But it's more of an inspiration really than a sort of direct copy. Interested in understanding the principles behind intelligence, but I would be perfectly happy to come up with some particular that is that uses backpropadial level but. At a higher level kind of does something different from the supervise running or something like that, which is why I work on self-supervised. And so I'm not necessarily convinced that the path towards the satisfying the goal of talking about of learning world models, etc. Necessarily goes through finding biological, plausible. Running procedures. What did you think of the forward forward algorithm and were you involved in that research? I was not involved, although I've thought about things that are somewhat similar for many decades, but very few of which is actually published. It's in the direct line of a series of work that Jeff has been very passionate about for four years of new learning procedures of different types for basically local learning worlds that can train fairly complex neural nets to learn good representations. And things like that. So he started with the boss machine, which was a really interesting concept that turned out to be somewhat in practical, but very interesting concept that a lot of people started. 
Backprop, which of course, he and I both had in developing something I worked on also simultaneously with backprop in the 1980s, the called target prop, where it's an attempt at making backpropadial local by computing a virtual target for a new model. Every neuron in a large neural net that can be locally optimized. Unfortunately, the way to compute this target is normal calls. And I haven't worked on this particular type of procedure for a long time, but Yoshua Benjou has published a few papers on this over the last 10 years or so. Yoshua Jeff and I when we started the deep learning conspiracy in the early 2000 to renew the interest of the community in deep learning. We focused largely on forms of kind of local self supervised learning methods. So things like in just case that was focused on restricted boss machines. Yoshua settled on something called denozing auto encoders, which is the basis for a lot of the large language model type training that we're using today. I was focusing more on what's called sparsato encoders. So this is different ways of doing training a layer if you want in the neural net to learn something useful without being it without it being focused on any particular task. So you don't need label data. And a lot of that work has been put aside a little bit by the incredible success of just pure supervised learning with very deep model we found ways to train very large neural nets with with very many layers with just back prop and so we put those techniques on the side and Jeff basically is coming back to them. I'm coming back to them in different form a little bit with this so the japa architecture. And he also had ideas in the past something called recirculation. A lot of informax method which actually the japa use this thing ideas are similar. He's a very productive source of ideas that are that sometimes seems out of the left field. And where the community is attention and then doesn't quite figure it right away and then it takes a few years for those things to disseminate and sometimes they don't just a minute. Hello. Beauregard I'm recording right now. Who? Rasmus I'll answer when I get back. Yeah, you'll be famous someday. Okay, okay, great. Thanks very much. Yeah, bye bye. Sorry about that. There was a very interesting talk by David Chalmers at some level. It was not a very serious talk because everyone knows as you described earlier that large language models are not reasoning. They don't have common sense. He doesn't claim that they do. No, that's right. What you're describing with this Japa architecture, if you could develop a large language model that is based on a world model. You'll be a large language model. You'll be a world model. At first, it would not be based on language. You'll be based on visual perception, maybe audio perception. If you have a machine that can do what a cat does, you don't need language. Language can be put on top of this. Language is easy, which is why we have those large language models. We don't have systems that run how they work. Yeah, but let's say that you build this world model and you put language on top of it so that you can interrogate it, communicate with it. Does that take you a step toward what Chalmers was talking about? I don't want to get into the theory of consciousness, but at least an AI model that would exhibit a lot of the features of consciousness. David actually has two different definitions for sentience and consciousness. You can have sentience with our consciousness. It's simple anymore, our sentience. 
In the sense that they have experience, emotions, and drives and things like that. But they may have the type of consciousness that we think we have. At least the illusion of consciousness. So, sentience, I think, can be achieved by the type of architecture I propose if we can make them work, which is a big if. And the reason I think that is that what those systems would be able to do is have objectives that they need to satisfy. Think of them as drives. And having the system compute those drives, which would be basically predictions of the outcome of a situation or a sequence of actions that the agent might take. Basically, those would be indistinguishable from emotions. So, if you have your own situation where you can take a sequence of actions to arrive at a result, and the outcomes that you're predicting is terrible results in your destruction. Okay, that creates fear. You try to figure out, I can say, oh, now the sequence of action I take, that would not result in the same outcome. And make those predictions, but there's a huge uncertainty in the prediction. One of which with probability half, maybe is that you get destroyed, it creates even more fear. And then on the contrary, if the outcome is going to be good, then it's more like elation. So, those are long term prediction of outcomes, which systems that use the architecture I'm proposing, I think we'll have. So, they will have some level of experience and they will have emotions that will drive the behavior because they would be able to anticipate outcomes. And that has act on them. Now, consciousness is a different story. So, my full theory of consciousness, which I've talked to David about, thinking it was going to tell me I'm crazy. But he said, no, actually that overlaps with some pretty common theories of consciousness among philosophers is the idea that we have essentially a single world model in our head. Somewhere in a prefrontal cortex. And that world model is configurable to the situation we're facing at the moment. And so, we're configuring our brain, including our world model, for solving the problem that, you know, satisfying the objective that we currently set to ourselves. And because we only have a civil world model engine, we can only solve one such task at any one time. This is a characteristic of humans and many animals, which is that when we focus on the task, we can't do anything else. We can do subconscious tasks simultaneously, but we can only do one conscious deliberate task at any one time. And it's because we have a single world model engine. Now, why would evolution build us in a way that we have a single world model engine? There's two reasons for this. One reason is that single world model engine can be configured for the situation at hand, but only the part that changes from one situation to another. And so it can share knowledge between different situations. The physics of the world doesn't change if you are building a table or trying to jump over a river or something. And so, your sort of basic knowledge about how the world works doesn't need to be reconfigured. It's only the thing that depends on the situation at hand. So that's one reason. And the second reason is that if we had multiple models of the world, they would have to be individually less powerful because you have to all fit them within your brain and that's an immediate size. So I think that's probably the reason why we only have one. 
And so, if you have only one world model that needs to be configured for the situation at hand, you need some sort of meta-module that configures it: what situation am I in? What sub-goals should I set myself? How should I configure the rest of my brain to solve that problem? That module would have to be able to observe the state and capabilities of the rest of the agent; it would have to have a model of the rest of itself. And that, perhaps, is something that gives us the illusion of consciousness. I must say this is very speculative. Okay, I'm not saying this is exactly what happens, but it fits with a few things that we know about consciousness. You were saying that this architecture is inspired by cognitive science or neuroscience. How much do you think your work, Geoff's work, other people's work at the leading edge of deep learning or machine learning research is informing neuroscience? Or is it more the other way around? Certainly in the beginning it was the other way around, but at this point it seems that a lot of information is reflecting back to those fields, so it's been a bit of a feedback loop. New concepts in machine learning have driven people in neuroscience and cognitive science to use computational models, if you want, for whatever they are studying, and many of my colleagues, my favorite colleagues, work on this. The whole field of computational neuroscience basically is built around this. And what we're seeing today is a big influence, or rather a wide use, of deep learning models such as convolutional nets and transformers as explanatory models of what goes on in the visual cortex, for example. So for a number of years now people have done fMRI experiments where they show the same image to a subject in the fMRI machine and to a convolutional net, and then try to explain the variance observed in the activity of various areas of the brain with the activity observed in the corresponding neural net. And what comes out of those studies is that the notion of multilayer hierarchy that we have in convolutional nets matches the type of hierarchy that we observe in the ventral pathway of the visual system. So V1 corresponds to the first few layers of the convolutional net, V2 to some of the following layers, V4 to more of them, and then the inferotemporal cortex to the top layers; they are the best explanation of each other if you try to do the matching. One of my colleagues at FAIR Paris, who has a dual affiliation with NeuroSpin, an academic lab near Paris, has done the same type of experiment using transformer architectures, language models essentially, observing the activity of people who are listening to stories and attempting to understand them so that they can answer questions about the story or give a summary of it. And there the matching is not that great, in the sense that there is some sort of correspondence between the type of activity you observe in those large transformers and the type of activity in the brain, but the hierarchy is not nearly as clear. And what is clear is that the brain is capable of making much longer-term predictions than those language models are capable of today. So that begs the question of what we are missing in terms of architecture, and to some extent it jibes with the idea that the models we should have should build hierarchical representations of the percept at different levels of abstraction,
so that the highest levels of abstraction can make long-term predictions that are less accurate than the lower levels, but longer-term. We don't seem to have that in current models. I had a question I wanted to ask you since our last conversation. You have a lot of things going on: you teach, you have your role at Facebook, your role, I think, at CVPR. How do you work on this? Do you have, like, three days a week or two hours a day where you're just focused? Are you tinkering with code or with diagrams, or is it in iterations with some of your graduate students, or is it something that's always in your mind, and you're in the shower and you think, yeah, that might work? I'm just curious how you handle all of it. Okay, so first of all, one thing to understand is that my position at Meta, at FAIR, is not a position of management. I don't manage anything. I'm chief scientist, which means I try to inspire others to work on things that I think are promising, and I advise several projects that I'm not personally involved in. I work on strategy and orientations and things like this, but I don't do day-to-day management. I'm very thankful that others are doing this for FAIR, and doing a very, very good job; I'm not very good at it either, so it's better for everyone if I don't do it. That allows me to spend quite a bit of time on research itself. And I don't have a group of engineers and scientists working with me; I have a group of more junior people working with me, students and postdocs, both at FAIR and at NYU, both in New York and in Paris. And working with students and postdocs is wonderful, because they are fearless, they're creative. Many of them have amazing talents, in theoretical ability, or implementation ability, or in making things work. So what happens very often is that one of them will come up with an idea whose results surprise me, when I was thinking it was wrong, and that's the best thing that can happen. Or sometimes I come up with an idea and it turns out to work, which is great, usually not in the form that I formulated it; normally there are a lot of contributions that have to be brought to an idea to make it work. And then, what has also happened quite a bit in the last few years, is that I come up with an idea that I'm sure is going to work, and students and postdocs try to make it work, and they come back to me and say, sorry, it doesn't work, and here is why. Oh yeah, we should have thought about this. Okay, so here's a new idea to get around this problem. So for example, several years ago I was advocating for the use of generative models with latent variables to handle uncertainty, and I have completely changed my mind about this. Now I'm advocating for those joint embedding architectures that do not actually predict the input. And I more or less invented those contrastive methods that a lot of people are talking about and using at this point; now I'm advocating against them, in favor of methods such as VICReg or Barlow Twins that, instead of using contrastive methods, try to maximize the information content of representations. And that idea of information maximization has been around for decades, because Geoff was working on this in the 1980s when I was a postdoc. He abandoned the idea pretty much; he had a couple of papers with one of his students, Sue Becker, in the early 90s showing that it could work, but only in small dimensions, and he pretty much abandoned it.
And the reason he abandoned it is because of a major flaw with those methods, due to the fact that we don't have any good measures of information content; the measures that we have are upper bounds, not lower bounds, so we can't really guarantee that we're maximizing information content. And so I never thought those methods could ever work, because of my experience with that. Then one of my postdocs, Stéphane Deny, actually kind of revived the idea and showed that it worked; that was the Barlow Twins paper. So we changed our minds. And now that we had a new tool, information maximization applied to joint embedding architectures, we came up with an improvement of it called VICReg, and now we're working on that. But there are other ideas we're working on to solve the same problem with other groups of people at the moment, which will probably come out in the next few months. So again, we don't have a perfect recipe yet; we're looking for one, and hopefully one of the things we are working on will stick. Now, are you coding models and then training them and running them, or are you conceptualizing and turning it over to someone else? It's mostly conceptualizing, and mostly letting the students and postdocs do the implementation, although I do a little bit of coding myself, but not enough to my taste. I wish I could do more. I have a lot of postdocs and students, so I have to devote a sufficient amount of my time to interacting with them. Sure. And then leave them some breathing room to do the work they do best. It's an interesting question, because that question was asked of Geoff as well, right. Yeah. And he said he was using MATLAB, and he said you have to do those things yourself, because if you give a project to a student and the project comes back saying it doesn't work, you don't know whether it's because there is a conceptual problem with the idea or whether it's just some stupid detail that wasn't done right. When I'm faced with this, that's when I start looking at the code and perhaps experimenting with it myself. Yeah. Or I get multiple students to collaborate on the project, so that if one makes an error, perhaps the other one will detect what it is. I love coding, I just don't do as much of it as I would like to. Yeah. So, whether it's JEPA or the forward-forward algorithm: things have moved so quickly. You think back to when transformers were introduced, or at least the attention mechanism, and that kind of shifted the field. It's difficult for an outsider to judge. When I hear about JEPA, is this one of those moments where, wow, this idea is going to transform the field? Or have you been through many of these moments, and they contribute to some extent, but they're not the answer that shifts the paradigm? It's hard to tell at first, but whenever I keep pursuing an idea and promote it, it's because I have a good hunch that it's going to have a relatively big impact. And it was easy for me to do before I was as famous as I am now, because I wasn't listened to that much, so I could make some claims; now I have to be careful what I claim, because a lot of people listen to me. Yeah. And it's the same issue with Geoff. So Geoff, for example, a few years ago was promoting this idea of capsules. Yeah. Everybody was thinking this was going to be a big thing, and a lot of people started working on it. It turns out it's very hard to make it work, and it didn't have the impact that many people thought it would have, including Geoff.
And they turned out to be limited by implementation issues and things like that. The underlying idea behind it is good, but, as very often happens, the practical side of it kills it. That was the case also with Boltzmann machines. They're conceptually super interesting; they just don't work that well, they don't scale very well, and they're very slow to train. But it's actually a very interesting idea that everybody should know about. So there are a lot of those ideas that are conceptual tools, mental objects that allow us to think differently about what we do, but that may not actually have that much practical impact. Forward-forward, we don't know yet. Okay, it could be like the wake-sleep algorithm that Geoff talked about 20 years ago or something, or it could be the new backprop. We don't know. Or the new target prop, which is interesting but not really mainstream, because it has some advantages in some situations, but it doesn't bring improved performance on some standard benchmark that people are interested in, so it doesn't see the light of day, perhaps. So it's hard to figure out. But what I can tell you is that if we figure out how to train one of those JEPA-style architectures from video, and the representations that it learns are good, and the predictive model that it learns is good, this is going to open the door to a new breed of AI systems. I have no doubt about that. It's exciting, the speed at which things have been moving, in particular in the last three years. About transformers, and the history of transformers: one thing I want to say about this is that we see the most visible progress, but we don't realize how much history there was behind it. Even the people who actually came up with some of those ideas don't realize that the ideas had roots in other things. For example, back in the 90s people were already working on things that we would now call mixtures of experts, and also multiplicative interactions, which at the time were called sigma-pi units or things like that. This is the idea that instead of having two variables that you add together with weights, you multiply them, and then you have weights before you multiply; it doesn't matter, the idea goes back a very long time, to the 1980s. And then you had the idea of linearly combining multiple inputs with weights that are between 0 and 1, sum to 1, and are data-dependent. Now we call this attention, but this is a circuit that was used in mixture-of-experts models back in the early 90s also. So that's old. Then there were ideas of neural networks that have separate modules for computation and memory: one module that is a classical neural net, whose output is an address into an associative memory, which itself is a different type of neural net. And those associative-memory neural nets use what we now call attention: they compute the similarity, or the dot product, between a query vector and a bunch of key vectors, then they normalize it so it sums to one, and the output of the memory is a weighted sum of the value vectors. There was a series of papers by my colleagues in the early days of FAIR, actually in 2014-15, one called Memory Networks, one called End-to-End Memory Networks, one called the Stack-Augmented Memory Network, another one called Key-Value Memory Networks, and then a whole bunch of things.
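The associative-memory readout just described (dot-product similarity between a query and the keys, normalization so the weights sum to one, then a weighted sum of the value vectors) fits in a few lines. The following is a minimal illustrative NumPy sketch added for clarity, not code from any of the papers mentioned:

import numpy as np

def attention_readout(query, keys, values):
    # Similarity between the query and every key (dot products).
    scores = keys @ query
    # Normalize with a softmax so the weights are positive and sum to one.
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()
    # The memory output is the weighted sum of the value vectors.
    return weights @ values

# Toy usage: an associative memory with 4 slots and 3-dimensional keys/values.
rng = np.random.default_rng(0)
keys, values = rng.normal(size=(4, 3)), rng.normal(size=(4, 3))
query = rng.normal(size=3)
print(attention_readout(query, keys, values))

In the self-attention discussed next, the same input tokens supply the queries, the keys, and the values.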
And those use those associative memories that are basically the basic modules used inside transformers. Attention mechanisms like this were popularized around 2015 by a paper from Yoshua Bengio's group, which demonstrated that they are extremely powerful for things like language translation in NLP, and that really started the craze on attention. So you combine all those ideas and you get a transformer, which uses something called self-attention, where the input tokens are used both as queries and keys in an associative memory, very much like a memory network. Then you use this as a layer, if you want; you put several of those in a layer, and then you stack those layers, and that's what a transformer is. The filiation is not obvious, but those ideas have been around and people have been talking about them. There was similar work around 2015-16 from DeepMind, called the Neural Turing Machine or the Differentiable Neural Computer: the idea that you have a separate module for computation and another one for memory. And there's a paper by Sepp Hochreiter and his group also on neural nets that have a separate associative-memory-type system. They are the same type of thing. I think this idea is very powerful. The big advantage of transformers is that, in the same way convolutional nets are equivariant to shifts (if you shift the input of a convolutional net, the output shifts but otherwise doesn't change), in a transformer, if you permute the input tokens, the output tokens get permuted the same way but are otherwise unchanged. So convnets are equivariant to shifts, transformers are equivariant to permutations, and the combination of the two is great. That's why I think the combination of convolutions at the low level and transformers at the top is, for natural data like images and video, a very good combination. Is there a combinatorial effect as the field progresses, where all of these ideas create a cascade of new ideas? Is that why the field is speeding up? It's not the only reason; there are a number of reasons. One of the reasons is that you build on each other's ideas and so on, which of course is the hallmark of science in general, and also of art. But there are a number of characteristics that help this to a large extent. One in particular is the fact that most research work in this area now comes with code that other people can use and build upon. The habit of distributing your code in open source, I think, is an enormous contributor to the acceleration of progress. The other one is the availability of sophisticated tools like PyTorch, for example, or TensorFlow, or JAX, or things like that, with which researchers can build on top of each other's code base to come up with really complex concepts. And all of this is helped by the fact that some of the main contributors to those ideas from industry don't seem to be too obsessive-compulsive about IP protection. Meta in particular is very open. We may occasionally file patents, but we're not going to sue you for infringing them unless you sue us. Google has a similar policy. You don't see this as much from companies that tend to be a little more secretive about their research, like Apple and Amazon, although I just talked to Samy Bengio. Yeah, he's trying to implement that openness. More power to him, good luck. It's a culture change for a company like Apple, so this is not a battle I would want to fight, but if he can win it, good for him. Yeah.
It's a difficult battle. Also, I think another contributor is that there are real practical, commercial applications of all of this. They're not just imagined; they are real. And so that creates a market, and that increases the size of the community, and that creates more appeal, more outlets if you want, for new ideas. Do you think that this hockey-stick curve is going to continue for a while, or do you think we'll hit a plateau? It's difficult to say. Nothing ever keeps growing like an exponential; it's always the beginning of a sigmoid, because every natural process has to saturate at some point. The question is when. And I don't see any obvious wall being hit by research at the moment; quite the opposite, there seems in fact to be an acceleration of progress. And there's no question that we need new concepts and new ideas; in fact, that's the purpose of my research at the moment, because I think there are limitations to current approaches. This is not to say that we just need to scale up deep learning and turn the crank and we'll get to human-level intelligence. I don't believe that. I don't believe that it's just a matter of making reinforcement learning more efficient; I don't think that's possible with the current way reinforcement learning is formulated. And we're not going to get there with supervised learning either. I think we definitely need new, innovative concepts. But I don't see any slowdown yet. I don't see people turning away and saying it's obviously not going to work, despite the screams of various critics, right. Yeah. I think to some extent those critics are, at the moment, fighting a rearguard battle. Yeah, because they plant a flag and say you're never going to be able to do this, and then it turns out you can do this, and then they plant the flag a little further down: now you're not going to be able to do that. So it's a retreat. Yeah. Okay, my last question: are you still doing music? I am. And are you still building instruments? I'm building electronic wind instruments, yes, and I'm in the process of designing a new one. Wow. Yeah, okay, maybe, I think I said this last time, maybe I could get some recordings and put them into the podcast or something. I probably told you, I'm not that great a performer. I'm probably better at conceptualizing and building those instruments than at playing them. But yeah, it's possible. That's it for this episode. I want to thank Yann for his time. If you want to read a transcript of today's conversation, you can find one on our website, eye-on.ai, that's E-Y-E hyphen O-N dot A-I. Feel free to drop us a line with comments or suggestions at craig at eye-on.ai, that's C-R-A-I-G at E-Y-E hyphen O-N dot A-I. And remember, the singularity may not be near, but AI is about to change your world, so pay attention. Thank you.
\ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/1-scrape.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/1-scrape.py new file mode 100644 index 0000000..5065d73 --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/1-scrape.py @@ -0,0 +1,121 @@ +import os +import requests +from bs4 import BeautifulSoup +from dotenv import load_dotenv +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.vectorstores import DeepLake +from langchain.text_splitter import CharacterTextSplitter +from langchain.document_loaders import TextLoader +import re +# Load environment variables from the .env file +load_dotenv() +# Get the dataset path from the environment variable +dataset_path= os.environ.get('DEEPLAKE_DATASET_PATH') + +embeddings = OpenAIEmbeddings() + +def get_documentation_urls(): + # List of relative URLs for Hugging Face documentation pages, commented a lot of these because it would take too long to scrape all of them + return [ + '/docs/huggingface_hub/guides/overview', + '/docs/huggingface_hub/guides/download', + '/docs/huggingface_hub/guides/upload', + '/docs/huggingface_hub/guides/hf_file_system', + '/docs/huggingface_hub/guides/repository', + '/docs/huggingface_hub/guides/search', + # '/docs/huggingface_hub/guides/inference', + # '/docs/huggingface_hub/guides/community', + # '/docs/huggingface_hub/guides/manage-cache', + # '/docs/huggingface_hub/guides/model-cards', + # '/docs/huggingface_hub/guides/manage-spaces', + # '/docs/huggingface_hub/guides/integrations', + # '/docs/huggingface_hub/guides/webhooks_server', + # Add the rest of the URLs here + ] + + +def construct_full_url(base_url, relative_url): + # Construct the full URL by appending the relative URL to the base URL + return base_url + relative_url + + +def scrape_page_content(url): + # Send a GET request to the URL and parse the HTML response using BeautifulSoup + response = requests.get(url) + soup = BeautifulSoup(response.text, 'html.parser') + # Extract the desired content from the page (in this case, the body text) + text=soup.body.text.strip() + # Remove non-ASCII characters + text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\xff]', '', text) + # Remove extra whitespace and newlines + text = re.sub(r'\s+', ' ', text) + return text.strip() + +def scrape_all_content(base_url, relative_urls, filename): + # Loop through the list of URLs, scrape content, and add it to the content list + content = [] + for relative_url in relative_urls: + full_url = construct_full_url(base_url, relative_url) + scraped_content = scrape_page_content(full_url) + content.append(scraped_content.rstrip('\n')) + + # Write the scraped content to a file + with open(filename, 'w', encoding='utf-8') as file: + for item in content: + file.write("%s\n" % item) + + return content + +# Define a function to load documents from a file +def load_docs(root_dir,filename): + # Create an empty list to hold the documents + docs = [] + try: + # Load the file using the TextLoader class and UTF-8 encoding + loader = TextLoader(os.path.join( + root_dir, filename), encoding='utf-8') + # Split the loaded file into separate documents and add them to the list of 
documents + docs.extend(loader.load_and_split()) + except Exception as e: + # If an error occurs during loading, ignore it and return an empty list of documents + pass + # Return the list of documents + return docs + + +def split_docs(docs): + text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + return text_splitter.split_documents(docs) + +def load_vectors_into_deeplake(dataset_path, source_chunks): + # Initialize the DeepLake database with the dataset path and embedding function + deeplake_db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings) + # Add the text chunks to the database + deeplakedb=deeplake_db.add_texts(source_chunks) + return deeplakedb + + +# Define the main function +def main(): + base_url = 'https://huggingface.co' + # Set the name of the file to which the scraped content will be saved + filename='content.txt' + # Set the root directory where the content file will be saved + root_dir ='./' + relative_urls = get_documentation_urls() + # Scrape all the content from the relative urls and save it to the content file + content = scrape_all_content(base_url, relative_urls,filename) + # Load the content from the file + docs = load_docs(root_dir,filename) + # Split the content into individual documents + texts = split_docs(docs) + # Create a DeepLake database with the given dataset path and embedding function + db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings) + # Add the individual documents to the database + db.add_documents(texts) + # Clean up by deleting the content file + os.remove(filename) + +# Call the main function if this script is being run as the main program +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/2-chat.py b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/2-chat.py new file mode 100644 index 0000000..daa15f2 --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/2-chat.py @@ -0,0 +1,133 @@ +import os +import openai +import streamlit as st +from audio_recorder_streamlit import audio_recorder +from elevenlabs import generate +from langchain.chains import RetrievalQA +from langchain.chat_models import ChatOpenAI +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.vectorstores import DeepLake +from streamlit_chat import message +from dotenv import load_dotenv + +# Load environment variables from the .env file +load_dotenv() + +# Constants +TEMP_AUDIO_PATH = "temp_audio.wav" +AUDIO_FORMAT = "audio/wav" + +# Load environment variables from .env file and return the keys +openai.api_key = os.environ.get('OPENAI_API_KEY') +eleven_api_key = os.environ.get('ELEVEN_API_KEY') +active_loop_data_set_path = os.environ.get('DEEPLAKE_DATASET_PATH') + +# Load embeddings and DeepLake database +def load_embeddings_and_database(active_loop_data_set_path): + embeddings = OpenAIEmbeddings() + db = DeepLake( + dataset_path=active_loop_data_set_path, + read_only=True, + embedding_function=embeddings + ) + return db + +# Transcribe audio using OpenAI Whisper API +def transcribe_audio(audio_file_path, openai_key): + openai.api_key = openai_key + try: + with 
open(audio_file_path, "rb") as audio_file: + response = openai.Audio.transcribe("whisper-1", audio_file) + return response["text"] + except Exception as e: + print(f"Error calling Whisper API: {str(e)}") + return None + +# Record audio using audio_recorder and transcribe using transcribe_audio +def record_and_transcribe_audio(): + audio_bytes = audio_recorder() + transcription = None + if audio_bytes: + st.audio(audio_bytes, format=AUDIO_FORMAT) + + with open(TEMP_AUDIO_PATH, "wb") as f: + f.write(audio_bytes) + + if st.button("Transcribe"): + transcription = transcribe_audio(TEMP_AUDIO_PATH, openai.api_key) + os.remove(TEMP_AUDIO_PATH) + display_transcription(transcription) + + return transcription + +# Display the transcription of the audio on the app +def display_transcription(transcription): + if transcription: + st.write(f"Transcription: {transcription}") + with open("audio_transcription.txt", "w+") as f: + f.write(transcription) + else: + st.write("Error transcribing audio.") + +# Get user input from Streamlit text input field +def get_user_input(transcription): + return st.text_input("", value=transcription if transcription else "", key="input") + +# Search the database for a response based on the user's query +def search_db(user_input, db): + print(user_input) + retriever = db.as_retriever() + retriever.search_kwargs['distance_metric'] = 'cos' + retriever.search_kwargs['fetch_k'] = 100 + retriever.search_kwargs['maximal_marginal_relevance'] = True + retriever.search_kwargs['k'] = 10 + model = ChatOpenAI(model='gpt-3.5-turbo') + qa = RetrievalQA.from_llm(model, retriever=retriever, return_source_documents=True) + return qa({'query': user_input}) + +# Display conversation history using Streamlit messages +def display_conversation(history): + for i in range(len(history["generated"])): + message(history["past"][i], is_user=True, key=str(i) + "_user") + message(history["generated"][i],key=str(i)) + #Voice using Eleven API + voice= "Bella" + text= history["generated"][i] + audio = generate(text=text, voice=voice,api_key=eleven_api_key) + st.audio(audio, format='audio/mp3') + +# Main function to run the app +def main(): + # Initialize Streamlit app with a title + st.write("# JarvisBase 🧙") + + # Load embeddings and the DeepLake database + db = load_embeddings_and_database(active_loop_data_set_path) + + # Record and transcribe audio + transcription = record_and_transcribe_audio() + + # Get user input from text input or audio transcription + user_input = get_user_input(transcription) + + # Initialize session state for generated responses and past messages + if "generated" not in st.session_state: + st.session_state["generated"] = ["I am ready to help you"] + if "past" not in st.session_state: + st.session_state["past"] = ["Hey there!"] + + # Search the database for a response based on user input and update session state + if user_input: + output = search_db(user_input, db) + print(output['source_documents']) + st.session_state.past.append(user_input) + response = str(output["result"]) + st.session_state.generated.append(response) + + # Display conversation history using Streamlit messages + if st.session_state["generated"]: + display_conversation(st.session_state) + +# Run the main function when the script is executed +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/content.txt b/1 ActiveLoop Courses/1 
LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/content.txt new file mode 100644 index 0000000..95e478c --- /dev/null +++ b/1 ActiveLoop Courses/1 LangChain & Vector Databases in Production/5 Combining Components Together with Chains/3 Creating a Voice Assistant for your Knowledge Base/content.txt @@ -0,0 +1,6 @@ +How-to guides In this section, you will find practical guides to help you achieve a specific goal. Take a look at these guides to learn how to use huggingface_hub to solve real-world problems: Repository How to create a repository on the Hub? How to configure it? How to interact with it? Download files How do I download a file from the Hub? How do I download a repository? Upload files How to upload a file or a folder? How to make changes to an existing repository on the Hub? Search How to efficiently search through the 200k+ public models, datasets and spaces? HfFileSystem How to interact with the Hub through a convenient interface that mimics Python's file interface? Inference How to make predictions using the accelerated Inference API? Community Tab How to interact with the Community tab (Discussions and Pull Requests)? Collections How to programmatically build collections? Cache How does the cache-system work? How to benefit from it? Model Cards How to create and share Model Cards? Manage your Space How to manage your Space hardware and configuration? Integrate a library What does it mean to integrate a library with the Hub? And how to do it? Webhooks server How to create a server to receive Webhooks and deploy it as a Space? 
+Download files from the Hub The huggingface_hub library provides functions to download files from the repositories stored on the Hub. You can use these functions independently or integrate them into your own library, making it more convenient for your users to interact with the Hub. This guide will show you how to: Download and cache a single file. Download and cache an entire repository. Download files to a local folder. Download a single file The hf_hub_download() function is the main function for downloading files from the Hub. It downloads the remote file, caches it on disk (in a version-aware way), and returns its local file path. The returned filepath is a pointer to the HF local cache. Therefore, it is important to not modify the file to avoid having a corrupted cache. If you are interested in getting to know more about how files are cached, please refer to our caching guide. From latest version Select the file to download using the repo_id, repo_type and filename parameters. By default, the file will be considered as being part of a model repo. Copied >>> from huggingface_hub import hf_hub_download >>> hf_hub_download(repo_id="lysandre/arxiv-nlp", filename="config.json") '/root/.cache/huggingface/hub/models--lysandre--arxiv-nlp/snapshots/894a9adde21d9a3e3843e6d5aeaaf01875c7fade/config.json' # Download from a dataset >>> hf_hub_download(repo_id="google/fleurs", filename="fleurs.py", repo_type="dataset") '/root/.cache/huggingface/hub/datasets--google--fleurs/snapshots/199e4ae37915137c555b1765c01477c216287d34/fleurs.py' From specific version By default, the latest version from the main branch is downloaded. However, in some cases you want to download a file at a particular version (e.g. from a specific branch, a PR, a tag or a commit hash). 
To do so, use the revision parameter: Copied # Download from the `v1.0` tag >>> hf_hub_download(repo_id="lysandre/arxiv-nlp", filename="config.json", revision="v1.0") # Download from the `test-branch` branch >>> hf_hub_download(repo_id="lysandre/arxiv-nlp", filename="config.json", revision="test-branch") # Download from Pull Request #3 >>> hf_hub_download(repo_id="lysandre/arxiv-nlp", filename="config.json", revision="refs/pr/3") # Download from a specific commit hash >>> hf_hub_download(repo_id="lysandre/arxiv-nlp", filename="config.json", revision="877b84a8f93f2d619faa2a6e514a32beef88ab0a") Note: When using the commit hash, it must be the full-length hash instead of a 7-character commit hash. Construct a download URL In case you want to construct the URL used to download a file from a repo, you can use hf_hub_url() which returns a URL. Note that it is used internally by hf_hub_download(). Download an entire repository snapshot_download() downloads an entire repository at a given revision. It uses internally hf_hub_download() which means all downloaded files are also cached on your local disk. Downloads are made concurrently to speed-up the process. To download a whole repository, just pass the repo_id and repo_type: Copied >>> from huggingface_hub import snapshot_download >>> snapshot_download(repo_id="lysandre/arxiv-nlp") '/home/lysandre/.cache/huggingface/hub/models--lysandre--arxiv-nlp/snapshots/894a9adde21d9a3e3843e6d5aeaaf01875c7fade' # Or from a dataset >>> snapshot_download(repo_id="google/fleurs", repo_type="dataset") '/home/lysandre/.cache/huggingface/hub/datasets--google--fleurs/snapshots/199e4ae37915137c555b1765c01477c216287d34' snapshot_download() downloads the latest revision by default. If you want a specific repository revision, use the revision parameter: Copied >>> from huggingface_hub import snapshot_download >>> snapshot_download(repo_id="lysandre/arxiv-nlp", revision="refs/pr/1") Filter files to download snapshot_download() provides an easy way to download a repository. However, you don’t always want to download the entire content of a repository. For example, you might want to prevent downloading all .bin files if you know you’ll only use the .safetensors weights. You can do that using allow_patterns and ignore_patterns parameters. These parameters accept either a single pattern or a list of patterns. Patterns are Standard Wildcards (globbing patterns) as documented here. The pattern matching is based on fnmatch. For example, you can use allow_patterns to only download JSON configuration files: Copied >>> from huggingface_hub import snapshot_download >>> snapshot_download(repo_id="lysandre/arxiv-nlp", allow_patterns="*.json") On the other hand, ignore_patterns can exclude certain files from being downloaded. The following example ignores the .msgpack and .h5 file extensions: Copied >>> from huggingface_hub import snapshot_download >>> snapshot_download(repo_id="lysandre/arxiv-nlp", ignore_patterns=["*.msgpack", "*.h5"]) Finally, you can combine both to precisely filter your download. Here is an example to download all json and markdown files except vocab.json. Copied >>> from huggingface_hub import snapshot_download >>> snapshot_download(repo_id="gpt2", allow_patterns=["*.md", "*.json"], ignore_patterns="vocab.json") Download file(s) to a local folder By default, we recommend using the cache system to download files from the Hub. 
You can specify a custom cache location using the cache_dir parameter in hf_hub_download() and snapshot_download(), or by setting the HF_HOME environment variable. However, if you need to download files to a specific folder, you can pass a local_dir parameter to the download function. This is useful to get a workflow closer to what the git command offers. The downloaded files will maintain their original file structure within the specified folder. For example, if filename="data/train.csv" and local_dir="path/to/folder", the resulting filepath will be "path/to/folder/data/train.csv". A ./huggingface/ folder is created at the root of your local directory containing metadata about the downloaded files. This prevents re-downloading files if they’re already up-to-date. If the metadata has changed, then the new file version is downloaded. This makes the local_dir optimized for pulling only the latest changes. After completing the download, you can safely remove the .huggingface/ folder if you no longer need it. However, be aware that re-running your script without this folder may result in longer recovery times, as metadata will be lost. Rest assured that your local data will remain intact and unaffected. Don’t worry about the .huggingface/ folder when committing changes to the Hub! This folder is automatically ignored by both git and upload_folder(). Download from the CLI You can use the huggingface-cli download command from the terminal to directly download files from the Hub. Internally, it uses the same hf_hub_download() and snapshot_download() helpers described above and prints the returned path to the terminal. Copied >>> huggingface-cli download gpt2 config.json /home/wauplin/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json You can download multiple files at once which displays a progress bar and returns the snapshot path in which the files are located: Copied >>> huggingface-cli download gpt2 config.json model.safetensors Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 23831.27it/s] /home/wauplin/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10 For more details about the CLI download command, please refer to the CLI guide. Faster downloads If you are running on a machine with high bandwidth, you can increase your download speed with hf_transfer, a Rust-based library developed to speed up file transfers with the Hub. To enable it: Specify the hf_transfer extra when installing huggingface_hub (e.g. pip install huggingface_hub[hf_transfer]). Set HF_HUB_ENABLE_HF_TRANSFER=1 as an environment variable. hf_transfer is a power user tool! It is tested and production-ready, but it lacks user-friendly features like advanced error handling or proxies. For more details, please take a look at this section. 
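As a minimal sketch of the two hf_transfer steps just described (assuming the hf_transfer extra has already been installed with pip), the environment variable can also be set from Python, as long as it is set before huggingface_hub is imported, since the library reads the flag when it is first imported:

import os

# Must be set before importing huggingface_hub (the flag is read at import time).
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download

# Same download call as in the examples above; the faster Rust-based backend
# is used transparently when hf_transfer is installed and the flag is enabled.
print(hf_hub_download(repo_id="gpt2", filename="config.json"))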
+Upload files to the Hub Sharing your files and work is an important aspect of the Hub. The huggingface_hub offers several options for uploading your files to the Hub. You can use these functions independently or integrate them into your library, making it more convenient for your users to interact with the Hub. This guide will show you how to push files: without using Git. that are very large with Git LFS. with the commit context manager. with the push_to_hub() function. Whenever you want to upload files to the Hub, you need to log in to your Hugging Face account. For more details about authentication, check out this section. Upload a file Once you’ve created a repository with create_repo(), you can upload a file to your repository using upload_file(). Specify the path of the file to upload, where you want to upload the file to in the repository, and the name of the repository you want to add the file to. Depending on your repository type, you can optionally set the repository type as a dataset, model, or space. Copied >>> from huggingface_hub import HfApi >>> api = HfApi() >>> api.upload_file( ... path_or_fileobj="/path/to/local/folder/README.md", ... path_in_repo="README.md", ... repo_id="username/test-dataset", ... repo_type="dataset", ... ) Upload a folder Use the upload_folder() function to upload a local folder to an existing repository. Specify the path of the local folder to upload, where you want to upload the folder to in the repository, and the name of the repository you want to add the folder to. 
Depending on your repository type, you can optionally set the repository type as a dataset, model, or space. Copied >>> from huggingface_hub import HfApi >>> api = HfApi() # Upload all the content from the local folder to your remote Space. # By default, files are uploaded at the root of the repo >>> api.upload_folder( ... folder_path="/path/to/local/space", ... repo_id="username/my-cool-space", ... repo_type="space", ... ) By default, the .gitignore file will be taken into account to know which files should be committed or not. By default we check if a .gitignore file is present in a commit, and if not, we check if it exists on the Hub. Please be aware that only a .gitignore file present at the root of the directory with be used. We do not check for .gitignore files in subdirectories. If you don’t want to use an hardcoded .gitignore file, you can use the allow_patterns and ignore_patterns arguments to filter which files to upload. These parameters accept either a single pattern or a list of patterns. Patterns are Standard Wildcards (globbing patterns) as documented here. If both allow_patterns and ignore_patterns are provided, both constraints apply. Beside the .gitignore file and allow/ignore patterns, any .git/ folder present in any subdirectory will be ignored. Copied >>> api.upload_folder( ... folder_path="/path/to/local/folder", ... path_in_repo="my-dataset/train", # Upload to a specific folder ... repo_id="username/test-dataset", ... repo_type="dataset", ... ignore_patterns="**/logs/*.txt", # Ignore all text logs ... ) You can also use the delete_patterns argument to specify files you want to delete from the repo in the same commit. This can prove useful if you want to clean a remote folder before pushing files in it and you don’t know which files already exists. The example below uploads the local ./logs folder to the remote /experiment/logs/ folder. Only txt files are uploaded but before that, all previous logs on the repo on deleted. All of this in a single commit. Copied >>> api.upload_folder( ... folder_path="/path/to/local/folder/logs", ... repo_id="username/trained-model", ... path_in_repo="experiment/logs/", ... allow_patterns="*.txt", # Upload all local text files ... delete_patterns="*.txt", # Delete all remote text files before ... ) Upload from the CLI You can use the huggingface-cli upload command from the terminal to directly upload files to the Hub. Internally it uses the same upload_file() and upload_folder() helpers described above. You can either upload a single file or an entire folder: Copied # Usage: huggingface-cli upload [repo_id] [local_path] [path_in_repo] >>> huggingface-cli upload Wauplin/my-cool-model ./models/model.safetensors model.safetensors https://huggingface.co/Wauplin/my-cool-model/blob/main/model.safetensors >>> huggingface-cli upload Wauplin/my-cool-model ./models . https://huggingface.co/Wauplin/my-cool-model/tree/main local_path and path_in_repo are optional and can be implicitly inferred. If local_path is not set, the tool will check if a local folder or file has the same name as the repo_id. If that’s the case, its content will be uploaded. Otherwise, an exception is raised asking the user to explicitly set local_path. In any case, if path_in_repo is not set, files are uploaded at the root of the repo. For more details about the CLI upload command, please refer to the CLI guide. Advanced features In most cases, you won’t need more than upload_file() and upload_folder() to upload your files to the Hub. 
However, huggingface_hub has more advanced features to make things easier. Let’s have a look at them! Non-blocking uploads In some cases, you want to push data without blocking your main thread. This is particularly useful to upload logs and artifacts while continuing a training. To do so, you can use the run_as_future argument in both upload_file() and upload_folder(). This will return a concurrent.futures.Future object that you can use to check the status of the upload. Copied >>> from huggingface_hub import HfApi >>> api = HfApi() >>> future = api.upload_folder( # Upload in the background (non-blocking action) ... repo_id="username/my-model", ... folder_path="checkpoints-001", ... run_as_future=True, ... ) >>> future Future(...) >>> future.done() False >>> future.result() # Wait for the upload to complete (blocking action) ... Background jobs are queued when using run_as_future=True. This means that you are guaranteed that the jobs will be executed in the correct order. Even though background jobs are mostly useful to upload data/create commits, you can queue any method you like using run_as_future(). For instance, you can use it to create a repo and then upload data to it in the background. The built-in run_as_future argument in upload methods is just an alias around it. Copied >>> from huggingface_hub import HfApi >>> api = HfApi() >>> api.run_as_future(api.create_repo, "username/my-model", exists_ok=True) Future(...) >>> api.upload_file( ... repo_id="username/my-model", ... path_in_repo="file.txt", ... path_or_fileobj=b"file content", ... run_as_future=True, ... ) Future(...) Upload a folder by chunks upload_folder() makes it easy to upload an entire folder to the Hub. However, for large folders (thousands of files or hundreds of GB), it can still be challenging. If you have a folder with a lot of files, you might want to upload it in several commits. If you experience an error or a connection issue during the upload, you would not have to resume the process from the beginning. To upload a folder in multiple commits, just pass multi_commits=True as argument. Under the hood, huggingface_hub will list the files to upload/delete and split them in several commits. The “strategy” (i.e. how to split the commits) is based on the number and size of the files to upload. A PR is open on the Hub to push all the commits. Once the PR is ready, the commits are squashed into a single commit. If the process is interrupted before completing, you can rerun your script to resume the upload. The created PR will be automatically detected and the upload will resume from where it stopped. It is recommended to pass multi_commits_verbose=True to get a better understanding of the upload and its progress. The example below will upload the checkpoints folder to a dataset in multiple commits. A PR will be created on the Hub and merged automatically once the upload is complete. If you prefer the PR to stay open and review it manually, you can pass create_pr=True. Copied >>> upload_folder( ... folder_path="local/checkpoints", ... repo_id="username/my-dataset", ... repo_type="dataset", ... multi_commits=True, ... multi_commits_verbose=True, ... ) If you want a better control on the upload strategy (i.e. the commits that are created), you can have a look at the low-level plan_multi_commits() and create_commits_on_pr() methods. multi_commits is still an experimental feature. Its API and behavior is subject to change in the future without prior notice. 
Scheduled uploads The Hugging Face Hub makes it easy to save and version data. However, there are some limitations when updating the same file thousands of times. For instance, you might want to save logs of a training process or user feedback on a deployed Space. In these cases, uploading the data as a dataset on the Hub makes sense, but it can be hard to do properly. The main reason is that you don’t want to version every update of your data because it’ll make the git repository unusable. The CommitScheduler class offers a solution to this problem. The idea is to run a background job that regularly pushes a local folder to the Hub. Let’s assume you have a Gradio Space that takes as input some text and generates two translations of it. Then, the user can select their preferred translation. For each run, you want to save the input, output, and user preference to analyze the results. This is a perfect use case for CommitScheduler; you want to save data to the Hub (potentially millions of user feedback), but you don’t need to save in real-time each user’s input. Instead, you can save the data locally in a JSON file and upload it every 10 minutes. For example: Copied >>> import json >>> import uuid >>> from pathlib import Path >>> import gradio as gr >>> from huggingface_hub import CommitScheduler # Define the file where to save the data. Use UUID to make sure not to overwrite existing data from a previous run. >>> feedback_file = Path("user_feedback/") / f"data_{uuid.uuid4()}.json" >>> feedback_folder = feedback_file.parent # Schedule regular uploads. Remote repo and local folder are created if they don't already exist. >>> scheduler = CommitScheduler( ... repo_id="report-translation-feedback", ... repo_type="dataset", ... folder_path=feedback_folder, ... path_in_repo="data", ... every=10, ... ) # Define the function that will be called when the user submits its feedback (to be called in Gradio) >>> def save_feedback(input_text:str, output_1: str, output_2:str, user_choice: int) -> None: ... """ ... Append input/outputs and user feedback to a JSON Lines file using a thread lock to avoid concurrent writes from different users. ... """ ... with scheduler.lock: ... with feedback_file.open("a") as f: ... f.write(json.dumps({"input": input_text, "output_1": output_1, "output_2": output_2, "user_choice": user_choice})) ... f.write("\n") # Start Gradio >>> with gr.Blocks() as demo: >>> ... # define Gradio demo + use `save_feedback` >>> demo.launch() And that’s it! User input/outputs and feedback will be available as a dataset on the Hub. By using a unique JSON file name, you are guaranteed you won’t overwrite data from a previous run or data from another Spaces/replicas pushing concurrently to the same repository. For more details about the CommitScheduler, here is what you need to know: append-only: It is assumed that you will only add content to the folder. You must only append data to existing files or create new files. Deleting or overwriting a file might corrupt your repository. git history: The scheduler will commit the folder every every minutes. To avoid polluting the git repository too much, it is recommended to set a minimal value of 5 minutes. Besides, the scheduler is designed to avoid empty commits. If no new content is detected in the folder, the scheduled commit is dropped. errors: The scheduler run as background thread. It is started when you instantiate the class and never stops. 
In particular, if an error occurs during the upload (for example, a connection issue), the scheduler will silently ignore it and retry at the next scheduled commit.
thread-safety: In most cases it is safe to assume that you can write to a file without having to worry about a lock file. The scheduler will not crash or be corrupted if you write content to the folder while it's uploading. In practice, concurrency issues can happen for heavily loaded apps. In this case, we advise using the scheduler.lock lock to ensure thread-safety. The lock is blocked only when the scheduler scans the folder for changes, not when it uploads data. You can safely assume that it will not affect the user experience on your Space.

Space persistence demo

Persisting data from a Space to a Dataset on the Hub is the main use case for CommitScheduler. Depending on the use case, you might want to structure your data differently. The structure has to be robust to concurrent users and restarts, which often implies generating UUIDs. Besides robustness, you should upload data in a format readable by the 🤗 Datasets library for later reuse. We created a Space that demonstrates how to save several different data formats (you may need to adapt it to your own specific needs).

Custom uploads

CommitScheduler assumes your data is append-only and should be uploaded "as is". However, you might want to customize the way data is uploaded. You can do that by creating a class that inherits from CommitScheduler and overwrites the push_to_hub method (feel free to overwrite it any way you want). You are guaranteed it will be called at the interval defined by every, in a background thread. You don't have to worry about concurrency and errors, but you must be careful about other aspects, such as pushing empty commits or duplicated data.

In the (simplified) example below, we overwrite push_to_hub to zip all PNG files in a single archive to avoid overloading the repo on the Hub:

import tempfile
import zipfile
from pathlib import Path

from huggingface_hub import CommitScheduler


class ZipScheduler(CommitScheduler):
    def push_to_hub(self):
        # 1. List PNG files
        png_files = list(self.folder_path.glob("*.png"))
        if len(png_files) == 0:
            return None  # return early if nothing to commit

        # 2. Zip png files in a single archive
        with tempfile.TemporaryDirectory() as tmpdir:
            archive_path = Path(tmpdir) / "train.zip"
            with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as zip:
                for png_file in png_files:
                    zip.write(filename=png_file, arcname=png_file.name)

            # 3. Upload archive
            self.api.upload_file(..., path_or_fileobj=archive_path)

        # 4. Delete local png files to avoid re-uploading them later
        for png_file in png_files:
            png_file.unlink()

When you overwrite push_to_hub, you have access to the attributes of CommitScheduler and especially:
HfApi client: api
Folder parameters: folder_path and path_in_repo
Repo parameters: repo_id, repo_type, revision
The thread lock: lock

For more examples of custom schedulers, check out our demo Space containing different implementations depending on your use cases.
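To illustrate how these attributes can be used, here is a minimal sketch (not an official recipe) of a custom scheduler that simply pushes the watched folder with upload_folder(), reusing whatever parameters were passed to the constructor. The class name is hypothetical:

from huggingface_hub import CommitScheduler


class FolderSnapshotScheduler(CommitScheduler):
    def push_to_hub(self):
        # Push the watched folder using the parameters passed to the constructor.
        self.api.upload_folder(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            revision=self.revision,
            folder_path=self.folder_path,
            path_in_repo=self.path_in_repo,
        )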
create_commit

The upload_file() and upload_folder() functions are high-level APIs that are generally convenient to use. We recommend trying these functions first if you don't need to work at a lower level. However, if you want to work at the commit level, you can use the create_commit() function directly.

There are three types of operations supported by create_commit():

CommitOperationAdd uploads a file to the Hub. If the file already exists, the file contents are overwritten. This operation accepts two arguments:
path_in_repo: the repository path to upload a file to.
path_or_fileobj: either a path to a file on your filesystem or a file-like object. This is the content of the file to upload to the Hub.

CommitOperationDelete removes a file or a folder from a repository. This operation accepts path_in_repo as an argument.

CommitOperationCopy copies a file within a repository. This operation accepts three arguments:
src_path_in_repo: the repository path of the file to copy.
path_in_repo: the repository path where the file should be copied.
src_revision: optional - the revision of the file to copy if you want to copy a file from a different branch/revision.

For example, if you want to upload two files and delete a file in a Hub repository:

Use the appropriate CommitOperation to add or delete a file and to delete a folder:

>>> from huggingface_hub import HfApi, CommitOperationAdd, CommitOperationCopy, CommitOperationDelete
>>> api = HfApi()
>>> operations = [
...     CommitOperationAdd(path_in_repo="LICENSE.md", path_or_fileobj="~/repo/LICENSE.md"),
...     CommitOperationAdd(path_in_repo="weights.h5", path_or_fileobj="~/repo/weights-final.h5"),
...     CommitOperationDelete(path_in_repo="old-weights.h5"),
...     CommitOperationDelete(path_in_repo="logs/"),
...     CommitOperationCopy(src_path_in_repo="image.png", path_in_repo="duplicate_image.png"),
... ]

Pass your operations to create_commit():

>>> api.create_commit(
...     repo_id="lysandre/test-model",
...     operations=operations,
...     commit_message="Upload my model weights and license",
... )

In addition to upload_file() and upload_folder(), the following functions also use create_commit() under the hood:

delete_file() deletes a single file from a repository on the Hub.
delete_folder() deletes an entire folder from a repository on the Hub.
metadata_update() updates a repository's metadata.

For more detailed information, take a look at the HfApi reference.
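For instance, here is a quick sketch of delete_file(); the repo_id is the same illustrative one used above and the commit message is arbitrary:

>>> from huggingface_hub import HfApi
>>> api = HfApi()
>>> api.delete_file(
...     path_in_repo="old-weights.h5",
...     repo_id="lysandre/test-model",  # hypothetical repo from the example above
...     commit_message="Remove obsolete weights",
... )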
Preupload LFS files before commit

In some cases, you might want to upload huge files to S3 before making the commit call. For example, if you are committing a dataset in several shards that are generated in-memory, you would need to upload the shards one by one to avoid an out-of-memory issue. A solution is to upload each shard as a separate commit on the repo. While perfectly valid, this solution has the drawback of potentially messing up the git history by generating tens of commits. To overcome this issue, you can upload your files one by one to S3 and then create a single commit at the end. This is possible using preupload_lfs_files() in combination with create_commit().

This is a power-user method. Directly using upload_file(), upload_folder() or create_commit() instead of handling the low-level logic of pre-uploading files is the way to go in the vast majority of cases. The main caveat of preupload_lfs_files() is that until the commit is actually made, the uploaded files are not accessible on the repo on the Hub. If you have a question, feel free to ping us on our Discord or in a GitHub issue.

Here is a simple example illustrating how to pre-upload files:

>>> from huggingface_hub import CommitOperationAdd, preupload_lfs_files, create_commit, create_repo

>>> repo_id = create_repo("test_preupload").repo_id

>>> operations = [] # List of all `CommitOperationAdd` objects that will be generated
>>> for i in range(5):
...     content = ... # generate binary content
...     addition = CommitOperationAdd(path_in_repo=f"shard_{i}_of_5.bin", path_or_fileobj=content)
...     preupload_lfs_files(repo_id, additions=[addition])
...     operations.append(addition)

>>> # Create commit
>>> create_commit(repo_id, operations=operations, commit_message="Commit all shards")

First, we create the CommitOperationAdd objects one by one. In a real-world example, those would contain the generated shards. Each file is uploaded before the next one is generated. During the preupload_lfs_files() step, the CommitOperationAdd object is mutated. You should only use it to pass it directly to create_commit(). The main update to the object is that the binary content is removed, meaning that it will be garbage-collected if you don't store another reference to it. This is expected, as we don't want to keep in memory content that has already been uploaded. Finally, we create the commit by passing all the operations to create_commit(). You can pass additional operations (add, delete or copy) that have not been processed yet and they will be handled correctly.

Tips and tricks for large uploads

There are some limitations to be aware of when dealing with a large amount of data in your repo. Given the time it takes to stream the data, getting an upload/push to fail at the end of the process or encountering a degraded experience, be it on hf.co or when working locally, can be very annoying. Check out our Repository limitations and recommendations guide for best practices on how to structure your repositories on the Hub. Next, let's move on with some practical tips to make your upload process as smooth as possible.

Start small: We recommend starting with a small amount of data to test your upload script. It's easier to iterate on a script when failing takes only a little time.
Expect failures: Streaming large amounts of data is challenging. You don't know what can happen, but it's always best to consider that something will fail at least once, whether it's due to your machine, your connection, or our servers. For example, if you plan to upload a large number of files, it's best to keep track locally of which files you have already uploaded before uploading the next batch (see the sketch below). You are ensured that an LFS file that is already committed will never be re-uploaded, but checking it client-side can still save some time.
Use hf_transfer: this is a Rust-based library meant to speed up uploads on machines with very high bandwidth. To use hf_transfer, specify the hf_transfer extra when installing huggingface_hub (e.g. pip install huggingface_hub[hf_transfer]), then set HF_HUB_ENABLE_HF_TRANSFER=1 as an environment variable.

hf_transfer is a power user tool! It is tested and production-ready, but it lacks user-friendly features like advanced error handling or proxies. For more details, please take a look at this section.
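As an illustration of the "expect failures" tip, here is a minimal sketch that remembers which files were already pushed so an interrupted script can simply be re-run. The repo_id, folder name and tracking file are placeholders, not part of the official API:

import json
from pathlib import Path

from huggingface_hub import HfApi

api = HfApi()
repo_id = "username/my-large-dataset"    # hypothetical repo
local_dir = Path("data_to_upload")       # hypothetical folder with many files
done_file = Path("uploaded_files.json")  # local record of completed uploads

done = set(json.loads(done_file.read_text())) if done_file.exists() else set()

for path in sorted(local_dir.glob("**/*")):
    if path.is_dir() or str(path) in done:
        continue  # skip folders and files uploaded in a previous run
    api.upload_file(
        repo_id=repo_id,
        repo_type="dataset",
        path_or_fileobj=path,
        path_in_repo=str(path.relative_to(local_dir)),
    )
    done.add(str(path))
    done_file.write_text(json.dumps(sorted(done)))  # persist progress after each file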
(legacy) Upload files with Git LFS

All the methods described above use the Hub's API to upload files. This is the recommended way to upload files to the Hub. However, we also provide Repository, a wrapper around the git tool to manage a local repository.

Although Repository is not formally deprecated, we recommend using the HTTP-based methods described above instead. For more details about this recommendation, please have a look at this guide explaining the core differences between HTTP-based and Git-based approaches.

Git LFS automatically handles files larger than 10MB. But for very large files (>5GB), you need to install a custom transfer agent for Git LFS:

huggingface-cli lfs-enable-largefiles

You should install this for each repository that has a very large file. Once installed, you'll be able to push files larger than 5GB.

commit context manager

The commit context manager handles four of the most common Git commands: pull, add, commit, and push. git-lfs automatically tracks any file larger than 10MB. In the following example, the commit context manager:

Pulls from the text-files repository.
Adds a change made to file.txt.
Commits the change.
Pushes the change to the text-files repository.

>>> import json
>>> from huggingface_hub import Repository
>>> with Repository(local_dir="text-files", clone_from="<user>/text-files").commit(commit_message="My first file :)"):
...     with open("file.txt", "w+") as f:
...         f.write(json.dumps({"hey": 8}))

Here is another example of how to use the commit context manager to save and upload a file to a repository:

>>> import torch
>>> model = torch.nn.Transformer()
>>> with Repository("torch-model", clone_from="<user>/torch-model", token=True).commit(commit_message="My cool model :)"):
...     torch.save(model.state_dict(), "model.pt")

Set blocking=False if you would like to push your commits asynchronously. Non-blocking behavior is helpful when you want to continue running your script while your commits are being pushed.

>>> with repo.commit(commit_message="My cool model :)", blocking=False):
...     ...

You can check the status of your push with the command_queue attribute:

>>> last_command = repo.command_queue[-1]
>>> last_command.status

Refer to the table below for the possible statuses:

Status    Description
-1        The push is ongoing.
0         The push has completed successfully.
Non-zero  An error has occurred.

When blocking=False, commands are tracked, and your script will only exit when all pushes are completed, even if other errors occur in your script. Some additional useful commands for checking the status of a push include:

# Inspect an error.
>>> last_command.stderr

# Check whether a push is completed or ongoing.
>>> last_command.is_done

# Check whether a push command has errored.
>>> last_command.failed

push_to_hub

The Repository class has a push_to_hub() function to add files, make a commit, and push them to a repository. Unlike the commit context manager, you'll need to pull from a repository first before calling push_to_hub().
For example, if you've already cloned a repository from the Hub, you can initialize the repo from the local directory:

>>> from huggingface_hub import Repository
>>> repo = Repository(local_dir="path/to/local/repo")

Update your local clone with git_pull() and then push your file to the Hub:

>>> repo.git_pull()
>>> repo.push_to_hub(commit_message="Commit my-awesome-file to the Hub")

However, if you aren't ready to push a file yet, you can use git_add() and git_commit() to only add and commit your file:

>>> repo.git_add("path/to/file")
>>> repo.git_commit(commit_message="add my first model config file :)")

When you're ready, push the file to your repository with git_push():

>>> repo.git_push()

Interact with the Hub through the Filesystem API

In addition to the HfApi, the huggingface_hub library provides HfFileSystem, a pythonic fsspec-compatible file interface to the Hugging Face Hub. HfFileSystem builds on top of HfApi and offers typical filesystem-style operations like cp, mv, ls, du, glob, get_file, and put_file.
Usage

>>> from huggingface_hub import HfFileSystem
>>> fs = HfFileSystem()

>>> # List all files in a directory
>>> fs.ls("datasets/my-username/my-dataset-repo/data", detail=False)
['datasets/my-username/my-dataset-repo/data/train.csv', 'datasets/my-username/my-dataset-repo/data/test.csv']

>>> # List all ".csv" files in a repo
>>> fs.glob("datasets/my-username/my-dataset-repo/**.csv")
['datasets/my-username/my-dataset-repo/data/train.csv', 'datasets/my-username/my-dataset-repo/data/test.csv']

>>> # Read a remote file
>>> with fs.open("datasets/my-username/my-dataset-repo/data/train.csv", "r") as f:
...     train_data = f.readlines()

>>> # Read the content of a remote file as a string
>>> train_data = fs.read_text("datasets/my-username/my-dataset-repo/data/train.csv", revision="dev")

>>> # Write a remote file
>>> with fs.open("datasets/my-username/my-dataset-repo/data/validation.csv", "w") as f:
...     f.write("text,label")
...     f.write("Fantastic movie!,good")

The optional revision argument can be passed to run an operation from a specific commit, such as a branch, tag name, or commit hash.

Unlike Python's built-in open, fsspec's open defaults to binary mode, "rb". This means you must explicitly set the mode to "r" for reading and "w" for writing in text mode. Appending to a file (modes "a" and "ab") is not supported yet.

Integrations

The HfFileSystem can be used with any library that integrates fsspec, provided the URL follows the scheme:

hf://[<repo_type_prefix>]<repo_id>[@<revision>]/<path/in/repo>

The repo_type_prefix is datasets/ for datasets and spaces/ for spaces; models don't need a prefix in the URL.

Some interesting integrations where HfFileSystem simplifies interacting with the Hub are listed below:

Reading/writing a Pandas DataFrame from/to a Hub repository:

>>> import pandas as pd

>>> # Read a remote CSV file into a dataframe
>>> df = pd.read_csv("hf://datasets/my-username/my-dataset-repo/train.csv")

>>> # Write a dataframe to a remote CSV file
>>> df.to_csv("hf://datasets/my-username/my-dataset-repo/test.csv")

The same workflow can also be used for Dask and Polars DataFrames.

Querying (remote) Hub files with DuckDB:

>>> from huggingface_hub import HfFileSystem
>>> import duckdb

>>> fs = HfFileSystem()
>>> duckdb.register_filesystem(fs)

>>> # Query a remote file and get the result back as a dataframe
>>> fs_query_file = "hf://datasets/my-username/my-dataset-repo/data_dir/data.parquet"
>>> df = duckdb.query(f"SELECT * FROM '{fs_query_file}' LIMIT 10").df()

Using the Hub as an array store with Zarr:

>>> import numpy as np
>>> import zarr

>>> embeddings = np.random.randn(50000, 1000).astype("float32")

>>> # Write an array to a repo
>>> with zarr.open_group("hf://my-username/my-model-repo/array-store", mode="w") as root:
...     foo = root.create_group("embeddings")
...     foobar = foo.zeros('experiment_0', shape=(50000, 1000), chunks=(10000, 1000), dtype='f4')
...     foobar[:] = embeddings

>>> # Read an array from a repo
>>> with zarr.open_group("hf://my-username/my-model-repo/array-store", mode="r") as root:
...     first_row = root["embeddings/experiment_0"][0]

Authentication

In many cases, you must be logged in with a Hugging Face account to interact with the Hub. Refer to the Authentication section of the documentation to learn more about authentication methods on the Hub.
It is also possible to log in programmatically by passing your token as an argument to HfFileSystem:

>>> from huggingface_hub import HfFileSystem
>>> fs = HfFileSystem(token=token)

If you log in this way, be careful not to accidentally leak the token when sharing your source code!

Create and manage a repository

The Hugging Face Hub is a collection of git repositories. Git is a widely used tool in software development to easily version projects when working collaboratively. This guide will show you how to interact with the repositories on the Hub, especially:

Create and delete a repository.
Manage branches and tags.
Rename your repository.
Update your repository visibility.
Manage a local copy of your repository.

If you are used to working with platforms such as GitLab/GitHub/Bitbucket, your first instinct might be to use the git CLI to clone your repo (git clone), commit changes (git add, git commit) and push them (git push). This is valid when using the Hugging Face Hub. However, software engineering and machine learning do not share the same requirements and workflows. Model repositories might maintain large model weight files for different frameworks and tools, so cloning the repository can lead to you maintaining large local folders with massive sizes. As a result, it may be more efficient to use our custom HTTP methods. You can read our Git vs HTTP paradigm explanation page for more details.

If you want to create and manage a repository on the Hub, your machine must be logged in. If you are not, please refer to this section. In the rest of this guide, we will assume that your machine is logged in.

Repo creation and deletion

The first step is to know how to create and delete repositories.
You can only manage repositories that you own (under your username namespace) or from organizations in which you have write permissions.

Create a repository

Create an empty repository with create_repo() and give it a name with the repo_id parameter. The repo_id is your namespace followed by the repository name: username_or_org/repo_name.

>>> from huggingface_hub import create_repo
>>> create_repo("lysandre/test-model")
'https://huggingface.co/lysandre/test-model'

By default, create_repo() creates a model repository. But you can use the repo_type parameter to specify another repository type. For example, if you want to create a dataset repository:

>>> from huggingface_hub import create_repo
>>> create_repo("lysandre/test-dataset", repo_type="dataset")
'https://huggingface.co/datasets/lysandre/test-dataset'

When you create a repository, you can set your repository visibility with the private parameter.

>>> from huggingface_hub import create_repo
>>> create_repo("lysandre/test-private", private=True)

If you want to change the repository visibility at a later time, you can use the update_repo_visibility() function.

Delete a repository

Delete a repository with delete_repo(). Make sure you want to delete a repository because this is an irreversible process! Specify the repo_id of the repository you want to delete:

>>> from huggingface_hub import delete_repo
>>> delete_repo(repo_id="lysandre/my-corrupted-dataset", repo_type="dataset")

Duplicate a repository (only for Spaces)

In some cases, you want to copy someone else's repo to adapt it to your use case. This is possible for Spaces using the duplicate_space() method. It will duplicate the whole repository. You will still need to configure your own settings (hardware, sleep-time, storage, variables and secrets). Check out our Manage your Space guide for more details.

>>> from huggingface_hub import duplicate_space
>>> duplicate_space("multimodalart/dreambooth-training", private=False)
RepoUrl('https://huggingface.co/spaces/nateraw/dreambooth-training',...)

Upload and download files

Now that you have created your repository, you will want to push changes to it and download files from it. These two topics deserve their own guides. Please refer to the upload and download guides to learn how to use your repository.

Branches and tags

Git repositories often make use of branches to store different versions of the same repository. Tags can also be used to flag a specific state of your repository, for example when releasing a version. More generally, branches and tags are referred to as git references.

Create branches and tags

You can create new branches and tags using create_branch() and create_tag():

>>> from huggingface_hub import create_branch, create_tag

# Create a branch on a Space repo from the `main` branch
>>> create_branch("Matthijs/speecht5-tts-demo", repo_type="space", branch="handle-dog-speaker")

# Create a tag on a Dataset repo from the `v0.1-release` branch
>>> create_tag("bigcode/the-stack", repo_type="dataset", revision="v0.1-release", tag="v0.1.1", tag_message="Bump release version.")

You can use the delete_branch() and delete_tag() functions in the same way to delete a branch or a tag, as sketched below.
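For instance, a quick sketch that removes the branch and tag created above (the repo ids are the same illustrative ones; adapt them to repos you actually own):

>>> from huggingface_hub import delete_branch, delete_tag

# Delete the branch created on the Space repo
>>> delete_branch("Matthijs/speecht5-tts-demo", repo_type="space", branch="handle-dog-speaker")

# Delete the tag created on the Dataset repo
>>> delete_tag("bigcode/the-stack", repo_type="dataset", tag="v0.1.1")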
List all branches and tags

You can also list the existing git refs from a repository using list_repo_refs():

>>> from huggingface_hub import list_repo_refs
>>> list_repo_refs("bigcode/the-stack", repo_type="dataset")
GitRefs(
    branches=[
        GitRefInfo(name='main', ref='refs/heads/main', target_commit='18edc1591d9ce72aa82f56c4431b3c969b210ae3'),
        GitRefInfo(name='v1.1.a1', ref='refs/heads/v1.1.a1', target_commit='f9826b862d1567f3822d3d25649b0d6d22ace714')
    ],
    converts=[],
    tags=[
        GitRefInfo(name='v1.0', ref='refs/tags/v1.0', target_commit='c37a8cd1e382064d8aced5e05543c5f7753834da')
    ]
)

Change repository settings

Repositories come with some settings that you can configure. Most of the time, you will want to do that manually in the repo settings page in your browser. You must have write access to a repo to configure it (either own it or be part of an organization). In this section, we will see the settings that you can also configure programmatically using huggingface_hub.

Some settings are specific to Spaces (hardware, environment variables, ...). To configure those, please refer to our Manage your Spaces guide.

Update visibility

A repository can be public or private. A private repository is only visible to you or to members of the organization in which the repository is located. Change a repository to private as shown in the following:

>>> from huggingface_hub import update_repo_visibility
>>> update_repo_visibility(repo_id=repo_id, private=True)

Rename your repository

You can rename your repository on the Hub using move_repo(). Using this method, you can also move the repo from a user to an organization. When doing so, there are a few limitations that you should be aware of. For example, you can't transfer your repo to another user.

>>> from huggingface_hub import move_repo
>>> move_repo(from_id="Wauplin/cool-model", to_id="huggingface/cool-model")

Manage a local copy of your repository

All the actions described above can be done using HTTP requests. However, in some cases you might be interested in having a local copy of your repository and interacting with it using the Git commands you are familiar with. The Repository class allows you to interact with files and repositories on the Hub with functions similar to Git commands. It is a wrapper over Git and Git-LFS methods to use the Git commands you already know and love. Before starting, please make sure you have Git-LFS installed (see here for installation instructions).

Repository is deprecated in favor of the HTTP-based alternatives implemented in HfApi. Given its large adoption in legacy code, the complete removal of Repository will only happen in release v1.0. For more details, please read this explanation page.
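If you are migrating away from Repository, a rough sketch of the HTTP-based equivalent of a clone/edit/push cycle could look like the following (the repo_id, local folder and file names are placeholders, not an official migration recipe):

from pathlib import Path

from huggingface_hub import HfApi, snapshot_download

api = HfApi()
repo_id = "username/my-model"  # hypothetical repo

# "Clone": download the current state of the repo to a local folder
local_dir = snapshot_download(repo_id=repo_id, local_dir="my-model")

# Edit files locally...
Path(local_dir, "README.md").write_text("# My model\n")

# "Push": upload the folder back as a single commit
api.upload_folder(repo_id=repo_id, folder_path=local_dir, commit_message="Update README")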
Use a local repository

Instantiate a Repository object with a path to a local repository:

>>> from huggingface_hub import Repository
>>> repo = Repository(local_dir="<path>/<to>/<folder>")

Clone

The clone_from parameter clones a repository from a Hugging Face repository ID to a local directory specified by the local_dir argument:

>>> from huggingface_hub import Repository
>>> repo = Repository(local_dir="w2v2", clone_from="facebook/wav2vec2-large-960h-lv60")

clone_from can also clone a repository using a URL:

>>> repo = Repository(local_dir="huggingface-hub", clone_from="https://huggingface.co/facebook/wav2vec2-large-960h-lv60")

You can combine the clone_from parameter with create_repo() to create and clone a repository:

>>> repo_url = create_repo(repo_id="repo_name")
>>> repo = Repository(local_dir="repo_local_path", clone_from=repo_url)

You can also configure a Git username and email for a cloned repository by specifying the git_user and git_email parameters when you clone it. When users commit to that repository, Git will be aware of the commit author.

>>> repo = Repository(
...     "my-dataset",
...     clone_from="<user>/<dataset_id>",
...     token=True,
...     repo_type="dataset",
...     git_user="MyName",
...     git_email="me@cool.mail"
... )

Branch

Branches are important for collaboration and experimentation without impacting your current files and code. Switch between branches with git_checkout(). For example, if you want to switch from branch1 to branch2:

>>> from huggingface_hub import Repository
>>> repo = Repository(local_dir="huggingface-hub", clone_from="<user>/<repo_id>", revision='branch1')
>>> repo.git_checkout("branch2")

Pull

git_pull() allows you to update a current local branch with changes from a remote repository:

>>> from huggingface_hub import Repository
>>> repo.git_pull()

Set rebase=True if you want your local commits to occur after your branch is updated with the new commits from the remote:

>>> repo.git_pull(rebase=True)
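Putting these pieces together, a minimal sketch of a full local workflow might look like this (the repo, branch and file names are placeholders):

>>> from huggingface_hub import Repository

>>> repo = Repository(local_dir="my-model", clone_from="<user>/<repo_id>")
>>> repo.git_checkout("dev")        # switch to an existing branch (hypothetical name)
>>> repo.git_pull(rebase=True)      # make sure the local clone is up to date
# ... edit files in the local folder ...
>>> repo.git_add("config.json")     # stage a (hypothetical) modified file
>>> repo.git_commit(commit_message="Update config")
>>> repo.git_push()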
Search the Hub

In this tutorial, you will learn how to search models, datasets and spaces on the Hub using huggingface_hub.

How to list repositories?

The huggingface_hub library includes an HTTP client, HfApi, to interact with the Hub. Among other things, it can list models, datasets and spaces stored on the Hub:

>>> from huggingface_hub import HfApi
>>> api = HfApi()
>>> models = api.list_models()

The output of list_models() is an iterator over the models stored on the Hub. Similarly, you can use list_datasets() to list datasets and list_spaces() to list Spaces.

How to filter repositories?

Listing repositories is great, but now you might want to filter your search. The list helpers have several attributes like:

filter
author
search
...

Two of these parameters are intuitive (author and search), but what about that filter? filter takes as input a ModelFilter object (or DatasetFilter). You can instantiate it by specifying which models you want to filter.

Let's see an example to get all models on the Hub that do image classification, have been trained on the imagenet dataset and run with PyTorch. That can be done with a single ModelFilter. Attributes are combined as a "logical AND".

from huggingface_hub import HfApi, ModelFilter

hf_api = HfApi()
models = hf_api.list_models(
    filter=ModelFilter(
        task="image-classification",
        library="pytorch",
        trained_dataset="imagenet"
    )
)

While filtering, you can also sort the models and take only the top results. For example, the following example fetches the top 5 most downloaded datasets on the Hub:

>>> from huggingface_hub import list_datasets
>>> list(list_datasets(sort="downloads", direction=-1, limit=5))
[DatasetInfo(
    id='argilla/databricks-dolly-15k-curated-en',
    author='argilla',
    sha='4dcd1dedbe148307a833c931b21ca456a1fc4281',
    last_modified=datetime.datetime(2023, 10, 2, 12, 32, 53, tzinfo=datetime.timezone.utc),
    private=False,
    downloads=8889377,
    (...)

To explore the available filters on the Hub, visit the models and datasets pages in your browser, search for some parameters and look at the values in the URL.
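The intuitive parameters can also be combined with sorting. Here is a small illustrative sketch; the query values are arbitrary:

>>> from huggingface_hub import HfApi
>>> api = HfApi()
>>> # Top 5 most downloaded models matching "bert" and authored by google
>>> models = list(api.list_models(search="bert", author="google", sort="downloads", direction=-1, limit=5))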