From 86d75ba93af9d48576331950314b7ae252b245b3 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Sat, 31 Aug 2024 23:44:04 +0300 Subject: [PATCH 01/15] Add video-text-to-text task page --- .../src/tasks/image-text-to-text/data.ts | 2 +- .../src/tasks/video-text-to-text/about.md | 99 +++++++++++++++++++ .../src/tasks/video-text-to-text/data.ts | 59 +++++++++++ 3 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 packages/tasks/src/tasks/video-text-to-text/about.md create mode 100644 packages/tasks/src/tasks/video-text-to-text/data.ts diff --git a/packages/tasks/src/tasks/image-text-to-text/data.ts b/packages/tasks/src/tasks/image-text-to-text/data.ts index 888dc3ada..4e19cf646 100644 --- a/packages/tasks/src/tasks/image-text-to-text/data.ts +++ b/packages/tasks/src/tasks/image-text-to-text/data.ts @@ -60,7 +60,7 @@ const taskData: TaskDataCustom = { }, { description: "Strong image-text-to-text model.", - id: "llava-hf/llava-v1.6-mistral-7b-hf", + id: "microsoft/Phi-3.5-vision-instruct", }, ], spaces: [ diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md new file mode 100644 index 000000000..62e9007fc --- /dev/null +++ b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -0,0 +1,99 @@ +Most of the video language models can take in videos, multiple videos, images and multiple images. Some of these models can also take interleaved inputs, which can have images and videos inside the text, where you can refer to the input image and input video within the text prompt. + +## Different Types of Vide Language Models + +Video language models come in three types: + +- **Base:** Pre-trained models that can be fine-tuned. +- **Instruction:** Base models fine-tuned on video-instruction pairs and answers. +- **Chatty/Conversational:** Base models fine-tuned on video conversation datasets. + + +## Use Cases + +### Video Question Answering + +Video language models trained on video-question-answer pairs can be used for video question answering and generating captions for videos. + +### Video Chat + +Video language models can be used to have a dialogue about a video. + +### Video Recognition with Instructions + +Video language models can recognize images through descriptions. When given detailed descriptions of specific entities, it can classify the entities in a video. + +## Inference + +You can use the Transformers library to interact with video-language models. +Below we load [a video language model](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf), write a simple utility to sample videos, use chat template to format the text prompt, process the video and the text prompt and infer. 
+

```python
import uuid
import requests
import cv2
import torch
from PIL import Image  # needed for Image.fromarray below
from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"

model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(device)

processor = LlavaNextVideoProcessor.from_pretrained(model_id)

def sample_frames(url, num_frames):

    response = requests.get(url)
    path_id = str(uuid.uuid4())

    path = f"./{path_id}.mp4"

    with open(path, "wb") as f:
         f.write(response.content)

    video = cv2.VideoCapture(path)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = total_frames // num_frames
    frames = []
    for i in range(total_frames):
        ret, frame = video.read()
        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        if not ret:
            continue
        if i % interval == 0:
            frames.append(pil_img)
    video.release()
    return frames

conversation = [
    {

        "role": "user",
        "content": [
            {"type": "text", "text": "Why is this video funny?"},
            {"type": "video"},
        ],
    },
]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

video_url = "https://huggingface.co/spaces/merve/llava-interleave/resolve/main/cats_1.mp4"
video = sample_frames(video_url, 8)  # sample 8 frames from the downloaded video

inputs = processor(text=prompt, videos=video, padding=True, return_tensors="pt").to(model.device)

output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))

# Why is this video funny? ASSISTANT: The humor in this video comes from the cat's facial expression and body language. The cat appears to be making a funny face, with its eyes squinted and mouth open, which can be interpreted as a playful or mischievous expression. Cats often make such faces when they are in a good mood or are playful, and this can be amusing to people who are familiar with their behavior.
The combination of the cat's expression and the close- + +``` + +## Useful Resources +- [Transformers task guide on video-text-to-text](https://huggingface.co/docs/transformers/tasks/video_text_to_text) \ No newline at end of file diff --git a/packages/tasks/src/tasks/video-text-to-text/data.ts b/packages/tasks/src/tasks/video-text-to-text/data.ts new file mode 100644 index 000000000..22bf5e828 --- /dev/null +++ b/packages/tasks/src/tasks/video-text-to-text/data.ts @@ -0,0 +1,59 @@ +import type { TaskDataCustom } from ".."; + +const taskData: TaskDataCustom = { + datasets: [ + { + description: "Multiple-choice questions and answers about videos.", + id: "lmms-lab/Video-MME", + }, + { + description: "A dataset of instructions and question-answer pairs about videos.", + id: "lmms-lab/VideoChatGPT", + }, + + ], + demo: { + inputs: [ + { + filename: "video-text-to-text-input.gif", + type: "video", + }, + { + label: "Text Prompt", + content: "What is happening in this video?", + type: "text", + }, + ], + outputs: [ + { + label: "Answer", + content: + "The video shows a series of images showing a fountain with water jets and a variety of colorful flowers and butterflies in the background.", + type: "text", + }, + ], + }, + metrics: [], + models: [ + { + description: "A robust video-text-to-text model that can take in image and video inputs.", + id: "llava-hf/llava-interleave-qwen-7b-hf", + }, + { + description: "Large and powerful video-text-to-text model that can take in image and video inputs.", + id: "llava-hf/LLaVA-NeXT-Video-34B-hf", + }, + ], + spaces: [ + { + description: "An application to chat with a video-text-to-text model.", + id: "llava-hf/video-llava", + }, + ], + summary: + "Video-text-to-text models take in an video and text prompt and output text. These models are also called video-language models.", + widgetModels: [""], + youtubeId: "", +}; + +export default taskData; From cbd2e3eb0f145407a82ee7b8f10f388030aa87a2 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Mon, 2 Sep 2024 15:14:45 +0300 Subject: [PATCH 02/15] replace model --- packages/tasks/src/tasks/video-text-to-text/data.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/data.ts b/packages/tasks/src/tasks/video-text-to-text/data.ts index 22bf5e828..a0dccfa11 100644 --- a/packages/tasks/src/tasks/video-text-to-text/data.ts +++ b/packages/tasks/src/tasks/video-text-to-text/data.ts @@ -37,7 +37,7 @@ const taskData: TaskDataCustom = { models: [ { description: "A robust video-text-to-text model that can take in image and video inputs.", - id: "llava-hf/llava-interleave-qwen-7b-hf", + id: "llava-hf/llava-onevision-qwen2-72b-ov-hf", }, { description: "Large and powerful video-text-to-text model that can take in image and video inputs.", From 73485b0a1db6925246e0adcbce84ea6eb5527aec Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Wed, 4 Sep 2024 13:03:16 +0300 Subject: [PATCH 03/15] Update packages/tasks/src/tasks/video-text-to-text/about.md Co-authored-by: Julien Chaumond --- packages/tasks/src/tasks/video-text-to-text/about.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md index 62e9007fc..31fb1c0fb 100644 --- a/packages/tasks/src/tasks/video-text-to-text/about.md +++ b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -1,6 +1,6 @@ Most of the video language models can take in videos, multiple videos, images and multiple images. 
Some of these models can also take interleaved inputs, which can have images and videos inside the text, where you can refer to the input image and input video within the text prompt. -## Different Types of Vide Language Models +## Different Types of Video Language Models Video language models come in three types: From 9960f90d1e482f7f231f8f0920305fc6aaf53ccf Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Wed, 4 Sep 2024 13:53:34 +0300 Subject: [PATCH 04/15] Update packages/tasks/src/tasks/video-text-to-text/about.md Co-authored-by: Pedro Cuenca --- packages/tasks/src/tasks/video-text-to-text/about.md | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md index 31fb1c0fb..940a38f82 100644 --- a/packages/tasks/src/tasks/video-text-to-text/about.md +++ b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -47,7 +47,6 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained( processor = LlavaNextVideoProcessor.from_pretrained(model_id) def sample_frames(url, num_frames): - response = requests.get(url) path_id = str(uuid.uuid4()) From fa4d91d6621735d150804ba7eabe32ead4b26370 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Wed, 4 Sep 2024 13:53:39 +0300 Subject: [PATCH 05/15] Update packages/tasks/src/tasks/video-text-to-text/about.md Co-authored-by: Pedro Cuenca --- packages/tasks/src/tasks/video-text-to-text/about.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md index 940a38f82..81f0a79e8 100644 --- a/packages/tasks/src/tasks/video-text-to-text/about.md +++ b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -53,7 +53,7 @@ def sample_frames(url, num_frames): path = f"./{path_id}.mp4" with open(path, "wb") as f: - f.write(response.content) + f.write(response.content) video = cv2.VideoCapture(path) total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) From cc86687276b9a8f66f595a8bec625e9bfce19a3e Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Wed, 4 Sep 2024 13:53:48 +0300 Subject: [PATCH 06/15] Update packages/tasks/src/tasks/video-text-to-text/about.md Co-authored-by: Pedro Cuenca --- packages/tasks/src/tasks/video-text-to-text/about.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md index 81f0a79e8..ee314710d 100644 --- a/packages/tasks/src/tasks/video-text-to-text/about.md +++ b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -61,10 +61,10 @@ def sample_frames(url, num_frames): frames = [] for i in range(total_frames): ret, frame = video.read() - pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) if not ret: continue if i % interval == 0: + pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) frames.append(pil_img) video.release() return frames From 599230102ca6608505c454c7f635c00c1afd74de Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Wed, 4 Sep 2024 13:53:54 +0300 Subject: [PATCH 07/15] Update packages/tasks/src/tasks/video-text-to-text/data.ts Co-authored-by: Pedro Cuenca --- packages/tasks/src/tasks/video-text-to-text/data.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/data.ts b/packages/tasks/src/tasks/video-text-to-text/data.ts index a0dccfa11..63f0024cd 100644 --- 
a/packages/tasks/src/tasks/video-text-to-text/data.ts +++ b/packages/tasks/src/tasks/video-text-to-text/data.ts @@ -51,7 +51,7 @@ const taskData: TaskDataCustom = { }, ], summary: - "Video-text-to-text models take in an video and text prompt and output text. These models are also called video-language models.", + "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.", widgetModels: [""], youtubeId: "", }; From 9a0291c22d882ac7405edd667c02c315ef5a23b4 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Wed, 4 Sep 2024 14:07:47 +0300 Subject: [PATCH 08/15] Add opencv --- packages/tasks/src/tasks/video-text-to-text/about.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md index ee314710d..d5e1b9ad0 100644 --- a/packages/tasks/src/tasks/video-text-to-text/about.md +++ b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -26,7 +26,7 @@ Video language models can recognize images through descriptions. When given deta ## Inference You can use the Transformers library to interact with video-language models. -Below we load [a video language model](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf), write a simple utility to sample videos, use chat template to format the text prompt, process the video and the text prompt and infer. +Below we load [a video language model](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf), write a simple utility to sample videos, use chat template to format the text prompt, process the video and the text prompt and infer. To run below snippet, please install [OpenCV](https://pypi.org/project/opencv-python/) by running `pip install opencv-python`. 
```python import uuid @@ -95,4 +95,4 @@ print(processor.decode(output[0][2:], skip_special_tokens=True)) ``` ## Useful Resources -- [Transformers task guide on video-text-to-text](https://huggingface.co/docs/transformers/tasks/video_text_to_text) \ No newline at end of file +- [Transformers task guide on video-text-to-text](https://huggingface.co/docs/transformers/tasks/video_text_to_text) From 48c68c147d54b4c2fe12b9574097cffe5691fb61 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Thu, 5 Sep 2024 13:30:31 +0300 Subject: [PATCH 09/15] fix type --- packages/tasks/src/tasks/video-text-to-text/data.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/data.ts b/packages/tasks/src/tasks/video-text-to-text/data.ts index 63f0024cd..266af9689 100644 --- a/packages/tasks/src/tasks/video-text-to-text/data.ts +++ b/packages/tasks/src/tasks/video-text-to-text/data.ts @@ -16,7 +16,7 @@ const taskData: TaskDataCustom = { inputs: [ { filename: "video-text-to-text-input.gif", - type: "video", + type: "img", }, { label: "Text Prompt", From 5f497262a74892657e263bc4c8898f2f32139e0b Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Fri, 6 Sep 2024 12:08:07 +0300 Subject: [PATCH 10/15] Update packages/tasks/src/tasks/video-text-to-text/data.ts Co-authored-by: Pedro Cuenca --- packages/tasks/src/tasks/video-text-to-text/data.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/data.ts b/packages/tasks/src/tasks/video-text-to-text/data.ts index 266af9689..6f6dc7fad 100644 --- a/packages/tasks/src/tasks/video-text-to-text/data.ts +++ b/packages/tasks/src/tasks/video-text-to-text/data.ts @@ -6,11 +6,10 @@ const taskData: TaskDataCustom = { description: "Multiple-choice questions and answers about videos.", id: "lmms-lab/Video-MME", }, - { + { description: "A dataset of instructions and question-answer pairs about videos.", id: "lmms-lab/VideoChatGPT", - }, - + }, ], demo: { inputs: [ From 62b90d6a2df1aa8f6e48d3a0ebd30faf5826413e Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Fri, 6 Sep 2024 12:08:14 +0300 Subject: [PATCH 11/15] Update packages/tasks/src/tasks/video-text-to-text/data.ts Co-authored-by: Pedro Cuenca --- packages/tasks/src/tasks/video-text-to-text/data.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/data.ts b/packages/tasks/src/tasks/video-text-to-text/data.ts index 6f6dc7fad..9e529415d 100644 --- a/packages/tasks/src/tasks/video-text-to-text/data.ts +++ b/packages/tasks/src/tasks/video-text-to-text/data.ts @@ -38,7 +38,7 @@ const taskData: TaskDataCustom = { description: "A robust video-text-to-text model that can take in image and video inputs.", id: "llava-hf/llava-onevision-qwen2-72b-ov-hf", }, - { + { description: "Large and powerful video-text-to-text model that can take in image and video inputs.", id: "llava-hf/LLaVA-NeXT-Video-34B-hf", }, From 666811d58b48062a8b564eaeef02541767821baf Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Fri, 6 Sep 2024 12:17:19 +0300 Subject: [PATCH 12/15] lint --- .../tasks/src/tasks/video-text-to-text/about.md | 14 +++++++------- .../tasks/src/tasks/video-text-to-text/data.ts | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md index d5e1b9ad0..804760ae3 100644 --- a/packages/tasks/src/tasks/video-text-to-text/about.md +++ 
b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -4,11 +4,10 @@ Most of the video language models can take in videos, multiple videos, images an Video language models come in three types: -- **Base:** Pre-trained models that can be fine-tuned. -- **Instruction:** Base models fine-tuned on video-instruction pairs and answers. +- **Base:** Pre-trained models that can be fine-tuned. +- **Instruction:** Base models fine-tuned on video-instruction pairs and answers. - **Chatty/Conversational:** Base models fine-tuned on video conversation datasets. - ## Use Cases ### Video Question Answering @@ -25,7 +24,7 @@ Video language models can recognize images through descriptions. When given deta ## Inference -You can use the Transformers library to interact with video-language models. +You can use the Transformers library to interact with video-language models. Below we load [a video language model](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf), write a simple utility to sample videos, use chat template to format the text prompt, process the video and the text prompt and infer. To run below snippet, please install [OpenCV](https://pypi.org/project/opencv-python/) by running `pip install opencv-python`. ```python @@ -39,9 +38,9 @@ device = "cuda" if torch.cuda.is_available() else "cpu" model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf" model = LlavaNextVideoForConditionalGeneration.from_pretrained( - model_id, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, + model_id, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, ).to(device) processor = LlavaNextVideoProcessor.from_pretrained(model_id) @@ -95,4 +94,5 @@ print(processor.decode(output[0][2:], skip_special_tokens=True)) ``` ## Useful Resources + - [Transformers task guide on video-text-to-text](https://huggingface.co/docs/transformers/tasks/video_text_to_text) diff --git a/packages/tasks/src/tasks/video-text-to-text/data.ts b/packages/tasks/src/tasks/video-text-to-text/data.ts index 9e529415d..d81038e69 100644 --- a/packages/tasks/src/tasks/video-text-to-text/data.ts +++ b/packages/tasks/src/tasks/video-text-to-text/data.ts @@ -9,7 +9,7 @@ const taskData: TaskDataCustom = { { description: "A dataset of instructions and question-answer pairs about videos.", id: "lmms-lab/VideoChatGPT", - }, + }, ], demo: { inputs: [ From adaedfd82fcbd5263275b3d8d4d0ec9eddb3c83e Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Fri, 6 Sep 2024 13:15:23 +0300 Subject: [PATCH 13/15] Update packages/tasks/src/tasks/video-text-to-text/about.md Co-authored-by: Pedro Cuenca --- packages/tasks/src/tasks/video-text-to-text/about.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md index 804760ae3..d954699d4 100644 --- a/packages/tasks/src/tasks/video-text-to-text/about.md +++ b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -1,4 +1,4 @@ -Most of the video language models can take in videos, multiple videos, images and multiple images. Some of these models can also take interleaved inputs, which can have images and videos inside the text, where you can refer to the input image and input video within the text prompt. +Most of the video language models can take in videos, multiple videos, images and multiple images. Some of these models can also take interleaved inputs, which can have images and videos inside the text, where you can refer to the input images and input videos within the text prompt. 
## Different Types of Video Language Models From 463279562eb606baccaa7e719012da01cedfba70 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Fri, 6 Sep 2024 13:15:29 +0300 Subject: [PATCH 14/15] Update packages/tasks/src/tasks/video-text-to-text/about.md Co-authored-by: Pedro Cuenca --- packages/tasks/src/tasks/video-text-to-text/about.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md index d954699d4..cfb9fd51f 100644 --- a/packages/tasks/src/tasks/video-text-to-text/about.md +++ b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -20,7 +20,7 @@ Video language models can be used to have a dialogue about a video. ### Video Recognition with Instructions -Video language models can recognize images through descriptions. When given detailed descriptions of specific entities, it can classify the entities in a video. +Video language models can recognize images through descriptions. When given detailed descriptions of specific entities, they can classify the entities in a video. ## Inference From be95adc2455f4b7e7c24b144f93a1c167d0ff639 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Fri, 6 Sep 2024 13:15:34 +0300 Subject: [PATCH 15/15] Update packages/tasks/src/tasks/video-text-to-text/about.md Co-authored-by: Pedro Cuenca --- packages/tasks/src/tasks/video-text-to-text/about.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/tasks/video-text-to-text/about.md b/packages/tasks/src/tasks/video-text-to-text/about.md index cfb9fd51f..2d9af88d3 100644 --- a/packages/tasks/src/tasks/video-text-to-text/about.md +++ b/packages/tasks/src/tasks/video-text-to-text/about.md @@ -25,7 +25,7 @@ Video language models can recognize images through descriptions. When given deta ## Inference You can use the Transformers library to interact with video-language models. -Below we load [a video language model](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf), write a simple utility to sample videos, use chat template to format the text prompt, process the video and the text prompt and infer. To run below snippet, please install [OpenCV](https://pypi.org/project/opencv-python/) by running `pip install opencv-python`. +Below we load [a video language model](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf), write a simple utility to sample videos, use chat template to format the text prompt, process the video and the text prompt and infer. To run the snippet below, please install [OpenCV](https://pypi.org/project/opencv-python/) by running `pip install opencv-python`. ```python import uuid