diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index cf28eb3dc..3e4cc94c9 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -352,7 +352,7 @@ jobs:
       volume_size: 100
       disk_size: 100
       image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
-      test_command: "uv pip install -e .[test] && pytest ./fastvideo/dataset/ -vs && pytest ./fastvideo/workflow/ -vs"
+      test_command: "uv pip install -e .[test] && pytest ./fastvideo/dataset/ -vs && pytest ./fastvideo/workflow/ -vs && pytest ./fastvideo/entrypoints/ -vs"
       timeout_minutes: 30
     secrets:
       RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
diff --git a/assets/full.svg b/assets/full.svg
new file mode 100644
index 000000000..0b8b058b5
--- /dev/null
+++ b/assets/full.svg
@@ -0,0 +1,18 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/assets/icon-simple.svg b/assets/icon-simple.svg
new file mode 100644
index 000000000..414643899
--- /dev/null
+++ b/assets/icon-simple.svg
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/examples/inference/gradio/local/README.md b/examples/inference/gradio/local/README.md
new file mode 100644
index 000000000..f744dbf46
--- /dev/null
+++ b/examples/inference/gradio/local/README.md
@@ -0,0 +1,56 @@
+# FastVideo Gradio Local Demo
+
+This is a Gradio-based web interface for generating videos using the FastVideo framework. The demo allows users to create videos from text prompts with various customization options.
+
+## Overview
+
+The demo uses the FastVideo framework to generate videos based on text prompts. It provides a simple web interface built with Gradio that allows users to:
+
+- Enter text prompts to generate videos
+- Customize video parameters (dimensions, number of frames, etc.)
+- Use negative prompts to guide the generation process
+- Set or randomize seeds for reproducibility
+
+---
+
+## Usage
+
+Run the demo with:
+
+```bash
+python examples/inference/gradio/local/gradio_local_demo.py
+```
+
+This will start a web server at `http://0.0.0.0:7860` where you can access the interface.
+
+---
+
+## Model Initialization
+
+This demo initializes a `VideoGenerator` with the minimum required arguments for inference. Users can seamlessly adjust inference options between generations, including prompts, resolution, and video length, *without ever needing to reload the model*.
+
+## Video Generation
+
+The core functionality is in the `generate_video` function, which:
+1. Processes user inputs
+2. Uses the FastVideo `VideoGenerator` created at startup to run inference (`generator.generate_video()`), as sketched below
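+
+A minimal sketch of this flow (the prompt and parameter values here are illustrative; the APIs are the ones the demo itself uses):
+
+```python
+from copy import deepcopy
+
+from fastvideo.configs.sample.base import SamplingParam
+from fastvideo.entrypoints.video_generator import VideoGenerator
+
+model_path = "FastVideo/FastWan2.1-T2V-1.3B-Diffusers"
+
+# Load once at startup; every generation reuses the same generator.
+generator = VideoGenerator.from_pretrained(model_path)
+defaults = SamplingParam.from_pretrained(model_path)
+
+# Adjust options per request without reloading the model.
+params = deepcopy(defaults)
+params.num_frames = 61
+params.seed = 1024
+generator.generate_video(prompt="A fluffy panda cooking in a cozy kitchen",
+                         sampling_param=params, save_video=True,
+                         return_frames=False)
+```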
+
+## Gradio Interface
+
+The interface is built with several components:
+- A text input for the prompt
+- A video display for the result
+- Inference options in a collapsible accordion:
+  - Height and width sliders
+  - Number of frames slider
+  - Guidance scale slider
+  - Negative prompt options
+  - Seed controls
+
+### Inference Options
+
+- **Height/Width**: Control the resolution of the generated video
+- **Number of Frames**: Set how many frames to generate
+- **Guidance Scale**: Control how closely the generation follows the prompt
+- **Negative Prompt**: Specify what you don't want to see in the video
+- **Seed**: Control randomness for reproducible results
\ No newline at end of file
diff --git a/examples/inference/gradio/local/gradio_local_demo.py b/examples/inference/gradio/local/gradio_local_demo.py
new file mode 100644
index 000000000..a59d1c123
--- /dev/null
+++ b/examples/inference/gradio/local/gradio_local_demo.py
@@ -0,0 +1,656 @@
+import argparse
+import os
+import re
+import time
+from copy import deepcopy
+
+import gradio as gr
+import torch
+
+from fastvideo.configs.sample.base import SamplingParam
+from fastvideo.entrypoints.video_generator import VideoGenerator
+
+MODEL_PATH_MAPPING = {
+    "FastWan2.1-T2V-1.3B": "FastVideo/FastWan2.1-T2V-1.3B-Diffusers",
+    # "FastWan2.2-TI2V-5B-FullAttn": "FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers",
+}
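+
+
+# NOTE: create_timing_display assumes the DiT denoising stage is the sixth
+# entry (index 5) in the stage execution order collected from the pipeline's
+# logging info (enabled via FASTVIDEO_STAGE_LOGGING below); update the index
+# if the pipeline stages change.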
+def create_timing_display(inference_time, total_time, stage_execution_times, num_frames):
+    dit_denoising_time = f"{stage_execution_times[5]:.2f}s" if len(stage_execution_times) > 5 else "N/A"
+
+    timing_html = f"""
+    <div class="timing-breakdown">
+        <div class="timing-title">⏱️ Timing Breakdown</div>
+        <div class="timing-grid">
+            <div class="timing-card">
+                <div class="timing-icon">🚀</div>
+                <div class="timing-label">DiT Denoising</div>
+                <div class="timing-value">{dit_denoising_time}</div>
+            </div>
+            <div class="timing-card">
+                <div class="timing-icon">🧠</div>
+                <div class="timing-label">E2E (w. vae/text encoder)</div>
+                <div class="timing-value">{inference_time:.2f}s</div>
+            </div>
+            <div class="timing-card">
+                <div class="timing-icon">🎬</div>
+                <div class="timing-label">Video Encoding</div>
+                <div class="timing-value">N/A</div>
+            </div>
+            <div class="timing-card">
+                <div class="timing-icon">🌐</div>
+                <div class="timing-label">Network Transfer</div>
+                <div class="timing-value">N/A</div>
+            </div>
+            <div class="timing-card">
+                <div class="timing-icon">📊</div>
+                <div class="timing-label">Total Processing</div>
+                <div class="timing-value">{total_time:.2f}s</div>
+            </div>
+        </div>
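+        <!-- The opening <div class="timing-breakdown"> is intentionally left
+             unclosed here; the return statement below appends the closing
+             </div> after the optional generation-speed row. -->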
""" + + if inference_time > 0: + fps = num_frames / inference_time + timing_html += f""" +
+ Generation Speed: + {fps:.1f} frames/second +
""" + + return timing_html + "
" +def setup_model_environment(model_path: str) -> None: + if "fullattn" in model_path.lower(): + os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "FLASH_ATTN" + else: + os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "VIDEO_SPARSE_ATTN" + os.environ["FASTVIDEO_STAGE_LOGGING"] = "1" + +def load_example_prompts(): + def contains_chinese(text): + return any('\u4e00' <= char <= '\u9fff' for char in text) + + def load_from_file(filepath): + prompts, labels = [], [] + try: + with open(filepath, "r", encoding='utf-8') as f: + for line in f: + line = line.strip() + if line and not contains_chinese(line): + label = line[:100] + "..." if len(line) > 100 else line + labels.append(label) + prompts.append(line) + except Exception as e: + print(f"Warning: Could not read {filepath}: {e}") + return prompts, labels + + examples, example_labels = load_from_file("examples/inference/gradio/local/prompts_final.txt") + + if not examples: + examples = ["A crowded rooftop bar buzzes with energy, the city skyline twinkling like a field of stars in the background."] + example_labels = ["Crowded rooftop bar at night"] + + return examples, example_labels + + +def create_gradio_interface(default_params: dict[str, SamplingParam], generators: dict[str, VideoGenerator]): + def generate_video( + prompt, negative_prompt, use_negative_prompt, seed, guidance_scale, + num_frames, height, width, randomize_seed, model_selection, progress + ): + model_path = MODEL_PATH_MAPPING.get(model_selection, "FastVideo/FastWan2.1-T2V-1.3B-Diffusers") + setup_model_environment(model_path) + try: + if progress: + progress(0.1, desc="Loading model for local inference...") + + generator = generators[model_path] + params = deepcopy(default_params[model_path]) + total_start_time = time.time() + if progress: + progress(0.2, desc="Configuring parameters...") + + params.prompt = prompt + params.seed = int(seed) + params.guidance_scale = guidance_scale + params.num_frames = int(num_frames) + params.height = int(height) + params.width = int(width) + + if randomize_seed: + params.seed = torch.randint(0, 1000000, (1, )).item() + + if use_negative_prompt and negative_prompt: + params.negative_prompt = negative_prompt + else: + params.negative_prompt = default_params[model_path].negative_prompt + + if progress: + progress(0.4, desc="Generating video locally...") + + output_dir = "outputs/" + os.makedirs(output_dir, exist_ok=True) + start_time = time.time() + result = generator.generate_video(prompt=prompt, sampling_param=params, save_video=True, return_frames=False) + inference_time = time.time() - start_time + logging_info = result.get("logging_info", None) + if logging_info: + stage_names = logging_info.get_execution_order() + stage_execution_times = [ + logging_info.get_stage_info(stage_name).get("execution_time", 0.0) + for stage_name in stage_names + ] + else: + stage_names = [] + stage_execution_times = [] + total_time = time.time() - total_start_time + timing_details=create_timing_display(inference_time=inference_time, total_time=total_time, stage_execution_times=stage_execution_times, num_frames=params.num_frames) + safe_prompt = params.prompt[:100].replace(' ', '_').replace('/', '_').replace('\\', '_') + video_filename = f"{params.prompt[:100]}.mp4" + output_path = os.path.join(output_dir, video_filename) + + if progress: + progress(1.0, desc="Generation complete!") + + return output_path, params.seed, timing_details + + except Exception as e: + print(f"An error occurred during local generation: {e}") + return None, f"Generation failed: {str(e)}", "" + 
+ examples, example_labels = load_example_prompts() + + theme = gr.themes.Base().set( + button_primary_background_fill="#2563eb", + button_primary_background_fill_hover="#1d4ed8", + button_primary_text_color="white", + slider_color="#2563eb", + checkbox_background_color_selected="#2563eb", + ) + + def get_default_values(model_name): + model_path = MODEL_PATH_MAPPING.get(model_name) + if model_path and model_path in default_params: + params = default_params[model_path] + return { + 'height': params.height, + 'width': params.width, + 'num_frames': params.num_frames, + 'guidance_scale': params.guidance_scale, + 'seed': params.seed, + } + + return { + 'height': 448, + 'width': 832, + 'num_frames': 61, + 'guidance_scale': 3.0, + 'seed': 1024, + } + + initial_values = get_default_values("FastWan2.1-T2V-1.3B") + + with gr.Blocks(title="FastWan", theme=theme) as demo: + gr.Image("assets/full.svg", show_label=False, container=False, height=80) + + gr.HTML(""" +
+        <div class="header-tagline">
+            <div>
+                Make Video Generation Go Blurrrrrrr
+            </div>
+            <div>
+                Code | Blog | Docs
+            </div>
+        </div>
+ """) + + with gr.Accordion("🎥 What Is FastVideo?", open=False): + gr.HTML(""" +
+        <div class="about-fastvideo">
+            <p>
+                FastVideo is an inference and post-training framework for diffusion models. It features an end-to-end unified pipeline for accelerating diffusion models, from data preprocessing through model training, finetuning, distillation, and inference. FastVideo is designed to be modular and extensible, allowing users to easily add new optimizations and techniques. Whether it is training-free or post-training optimizations, FastVideo has you covered.
+            </p>
+        </div>
+ """) + + with gr.Row(): + model_selection = gr.Dropdown( + choices=list(MODEL_PATH_MAPPING.keys()), + value="FastWan2.1-T2V-1.3B", + label="Select Model", + interactive=True + ) + + with gr.Row(): + example_dropdown = gr.Dropdown( + choices=example_labels, + label="Example Prompts", + value=None, + interactive=True, + allow_custom_value=False + ) + + with gr.Row(): + with gr.Column(scale=6): + prompt = gr.Text( + label="Prompt", + show_label=False, + max_lines=3, + placeholder="Describe your scene...", + container=False, + lines=3, + autofocus=True, + ) + with gr.Column(scale=1, min_width=120, elem_classes="center-button"): + run_button = gr.Button("Run", variant="primary", size="lg") + + with gr.Row(): + with gr.Column(): + error_output = gr.Text(label="Error", visible=False) + timing_display = gr.Markdown(label="Timing Breakdown", visible=False) + + with gr.Row(equal_height=True, elem_classes="main-content-row"): + with gr.Column(scale=1, elem_classes="advanced-options-column"): + with gr.Group(): + gr.HTML("
<b>Advanced Options</b>
") + with gr.Row(): + height = gr.Number( + label="Height", + value=initial_values['height'], + interactive=False, + container=True + ) + width = gr.Number( + label="Width", + value=initial_values['width'], + interactive=False, + container=True + ) + + with gr.Row(): + num_frames = gr.Number( + label="Number of Frames", + value=initial_values['num_frames'], + interactive=False, + container=True + ) + guidance_scale = gr.Slider( + label="Guidance Scale", + minimum=1, + maximum=12, + value=initial_values['guidance_scale'], + ) + + with gr.Row(): + use_negative_prompt = gr.Checkbox( + label="Use negative prompt", value=False) + negative_prompt = gr.Text( + label="Negative prompt", + max_lines=3, + lines=3, + placeholder="Enter a negative prompt", + visible=False, + ) + + seed = gr.Slider( + label="Seed", + minimum=0, + maximum=1000000, + step=1, + value=initial_values['seed'], + ) + randomize_seed = gr.Checkbox(label="Randomize seed", value=False) + seed_output = gr.Number(label="Used Seed") + + with gr.Column(scale=1, elem_classes="video-column"): + result = gr.Video( + label="Generated Video", + show_label=True, + height=466, + width=600, + container=True, + elem_classes="video-component" + ) + + gr.HTML(""" + + """) + + def on_example_select(example_label): + if example_label and example_label in example_labels: + index = example_labels.index(example_label) + return examples[index] + return "" + + example_dropdown.change( + fn=on_example_select, + inputs=example_dropdown, + outputs=prompt, + ) + + gr.HTML(""" +
+            <div class="demo-note">
+                Note that this demo is meant to showcase FastWan's quality; under a large number of requests, generation speed may be affected.
+            </div>
+ """) + + use_negative_prompt.change( + fn=lambda x: gr.update(visible=x), + inputs=use_negative_prompt, + outputs=negative_prompt, + ) + + def on_model_selection_change(selected_model): + if not selected_model: + selected_model = "FastWan2.1-T2V-1.3B" + + model_path = MODEL_PATH_MAPPING.get(selected_model) + + if model_path and model_path in default_params: + params = default_params[model_path] + return ( + gr.update(value=params.height), + gr.update(value=params.width), + gr.update(value=params.num_frames), + gr.update(value=params.guidance_scale), + gr.update(value=params.seed), + ) + + return ( + gr.update(value=448), + gr.update(value=832), + gr.update(value=61), + gr.update(value=3.0), + gr.update(value=1024), + ) + + model_selection.change( + fn=on_model_selection_change, + inputs=model_selection, + outputs=[height, width, num_frames, guidance_scale, seed], + ) + + def handle_generation(*args, progress=None, request: gr.Request = None): + model_selection, prompt, negative_prompt, use_negative_prompt, seed, guidance_scale, num_frames, height, width, randomize_seed = args + + result_path, seed_or_error, timing_details = generate_video( + prompt, negative_prompt, use_negative_prompt, seed, guidance_scale, + num_frames, height, width, randomize_seed, model_selection, progress + ) + if result_path and os.path.exists(result_path): + return ( + result_path, + seed_or_error, + gr.update(visible=False), + gr.update(visible=True, value=timing_details), + ) + else: + return ( + None, + seed_or_error, + gr.update(visible=True, value=seed_or_error), + gr.update(visible=False), + ) + + run_button.click( + fn=handle_generation, + inputs=[ + model_selection, + prompt, + negative_prompt, + use_negative_prompt, + seed, + guidance_scale, + num_frames, + height, + width, + randomize_seed, + ], + outputs=[result, seed_output, error_output, timing_display], + concurrency_limit=20, + ) + + return demo + + +def main(): + parser = argparse.ArgumentParser(description="FastVideo Gradio Local Demo") + parser.add_argument("--t2v_model_paths", type=str, + default="FastVideo/FastWan2.1-T2V-1.3B-Diffusers", + help="Comma separated list of paths to the T2V model(s)") + parser.add_argument("--host", type=str, default="0.0.0.0", + help="Host to bind to") + parser.add_argument("--port", type=int, default=7860, + help="Port to bind to") + args = parser.parse_args() + generators = {} + default_params = {} + model_paths = args.t2v_model_paths.split(",") + for model_path in model_paths: + print(f"Loading model: {model_path}") + setup_model_environment(model_path) + generators[model_path] = VideoGenerator.from_pretrained(model_path) + default_params[model_path] = SamplingParam.from_pretrained(model_path) + demo = create_gradio_interface(default_params, generators) + print(f"Starting Gradio frontend at http://{args.host}:{args.port}") + print(f"T2V Models: {args.t2v_model_paths}") + + from fastapi import FastAPI, Request, HTTPException + from fastapi.responses import HTMLResponse, FileResponse + import uvicorn + + app = FastAPI() + + @app.get("/logo.png") + def get_logo(): + return FileResponse( + "assets/full.svg", + media_type="image/svg+xml", + headers={ + "Cache-Control": "public, max-age=3600", + "Access-Control-Allow-Origin": "*" + } + ) + + @app.get("/favicon.ico") + def get_favicon(): + favicon_path = "assets/icon-simple.svg" + + if os.path.exists(favicon_path): + return FileResponse( + favicon_path, + media_type="image/svg+xml", + headers={ + "Cache-Control": "public, max-age=3600", + 
"Access-Control-Allow-Origin": "*" + } + ) + else: + raise HTTPException(status_code=404, detail="Favicon not found") + + @app.get("/", response_class=HTMLResponse) + def index(request: Request): + base_url = str(request.base_url).rstrip('/') + return f""" + + + + + + + FastWan + + + + + + + + + + + + + + + + + + + + + + + + + + + + """ + + app = gr.mount_gradio_app( + app, + demo, + path="/gradio", + allowed_paths=[os.path.abspath("outputs"), os.path.abspath("fastvideo-logos")] + ) + + uvicorn.run(app, host=args.host, port=args.port) + + +if __name__ == "__main__": + + main() \ No newline at end of file diff --git a/examples/inference/gradio/local/prompts_final.txt b/examples/inference/gradio/local/prompts_final.txt new file mode 100644 index 000000000..8334cfbe1 --- /dev/null +++ b/examples/inference/gradio/local/prompts_final.txt @@ -0,0 +1,11 @@ +A dynamic shot of a sleek black motorcycle accelerating down an empty highway at sunset. The bike's engine roars as it gains speed, smoke trailing from the tires. The rider, wearing a black leather jacket and helmet, leans forward with determination, gripping the handlebars tightly. The camera follows the motorcycle from a distance, capturing the dust kicked up behind it, then zooms in to show the intense focus on the rider's face. The background showcases the endless road stretching into the horizon with vibrant orange and pink hues of the setting sun. Medium shot transitioning to close-up. +A Jedi Master Yoda, recognizable by his green skin, large ears, and wise wrinkles, is performing on a small stage, strumming a guitar with great concentration. Yoda wears a casual robe and sits on a stool, his eyes closed as he plays, fully immersed in the music. The stage is dimly lit with spotlights highlighting Yoda, creating a mystical atmosphere. The background shows a live audience watching intently. Medium close-up shot focusing on Yoda's expressive face and hands moving gracefully over the guitar strings. +A cute, fluffy panda bear is preparing a meal in a cozy, modern kitchen. The panda is standing at a wooden countertop, wearing a white chef’s hat and apron. It skillfully stirs a pot on the stove with one hand while holding a spatula in the other. The kitchen is well-lit, with appliances and cabinets in pastel colors, creating a warm and inviting atmosphere. The panda moves gracefully, with a focused and determined expression, as steam rises from the pot. Medium shot focusing on the panda’s actions at the stove. +In a futuristic Tokyo rooftop during a heavy rainstorm, a robotic DJ stands behind a turntable, spinning vinyl records in a cyberpunk night setting. The robot has metallic, sleek body parts with glowing blue LED lights, and it moves gracefully with the beat. Raindrops create a shimmering effect as they hit the ground and the DJ. The surrounding environment features neon signs, towering skyscrapers, and a dark, misty atmosphere. The camera starts with a wide shot of the city skyline before zooming in on the DJ performing. Sci-fi, fantasy. +A realistic animated scene featuring a polar bear playing a guitar. The polar bear is standing upright, wearing a cozy fur vest and fingerless gloves. It holds the guitar with both hands, strumming the strings with one hand while plucking them with the other, showcasing natural, fluid motions. The polar bear's expressive face shows concentration and joy as it plays. The background is a snowy Arctic landscape with icebergs and a clear blue sky. 
The scene captures the bear from a mid-shot angle, focusing on its interaction with the guitar. +The scene opens to a breathtaking view of a tranquil ocean horizon at dusk, displaying a vibrant tapestry of oranges, pinks, and purples as the sun sets. In the foreground, tall, swaying palm trees frame the scene, their silhouettes stark against the colorful sky. The ocean itself shimmers with reflections of the sunset, creating a peaceful, almost ethereal atmosphere. A small boat can be seen in the distance, centered on the horizon, adding a sense of scale and solitude to the scene. The waves gently lap the shore, creating faint patterns on the sandy beach, which stretches across the foreground. Above, the sky is dotted with scattered clouds that catch the last light of the day, enhancing the drama and beauty of the scene. The overall mood is serene and contemplative, capturing a perfect moment of nature’s grandeur. +A large, modern semi-truck accelerating down an empty highway, gaining speed with each second. The truck's powerful engine roars as it moves forward, smoke billowing from the tires. The camera starts from a wide shot, capturing the truck in the distance, then smoothly zooms in to follow the vehicle as it speeds up. The truck's headlights illuminate the road ahead, casting a bright glow. The truck driver can be seen through the windshield, focused and determined. The background shows the vast openness of the highway stretching into the horizon under a clear blue sky. Medium to close-up shots of the truck as it accelerates. +Soft blue light pulses from the blade’s rune-etched hilt, illuminating nearby moss-covered roots and ferns. The surrounding trees are tall and gnarled, their branches curling like claws overhead. Fog swirls gently at ground level, parting slightly as a figure in a cloak approaches from the distance. Medium shot slowly zooming toward the sword, emphasizing its mystical aura. +The video opens with a tranquil scene in the heart of a dense forest, emphasizing two large, textured tree trunks in the foreground framing the view. Sunlight filters through the canopy above, casting intricate patterns of light and shadow on the trees and the ground. Between the tree trunks, a clear view of a calm, muddy river unfolds, its surface shimmering under the gentle sunlight. The riverbank is decorated with a variety of small bushes and vibrant foliage, subtly transitioning into the deep greens of tall, leafy plants. In the background, the dense forest looms, filled with dark, towering trees, their branches intertwining to form an intricate canopy. The scene is bathed in the soft glow of the sun, creating a serene and picturesque setting. Occasional sunbeams pierce through the foliage, adding a magical aura to the landscape. The vibrant reds and oranges of the smaller plants add contrast, bringing warmth to the earthy tones of the scenery. Overall, this harmonious blend of natural elements creates a peaceful and idyllic forest setting. +A lone figure stands on a large, moss-covered rock, surrounded by the soft rush of a nearby stream. The figure is wearing white sneakers and shorts, with a plaid shirt that hangs loosely in the breeze. The lighting creates dramatic shadows, enhancing the textures of the rock and the subtle movement of the water below. In the background, a waterfall cascades into the stream, completing this tranquil and serene nature scene. +In an industrial setting, a person leans casually against a railing, exuding a sense of confidence and composure. 
They are wearing a striking outfit, consisting of a vibrant, patterned jacket over a simple white crop top, creating a bold contrast. The atmosphere is infused with warm, ambient lighting that casts soft shadows on the concrete walls and metallic surfaces. Intricate wiring and pipes form an intricate backdrop, enhancing the urban aesthetic. Their relaxed posture and direct, engaging gaze suggest a sense of ease in this industrial environment. This scene encapsulates a blend of modern fashion and gritty, urban architecture, creating a visually compelling narrative. diff --git a/examples/inference/gradio/gradio_frontend.py b/examples/inference/gradio/serving/gradio_frontend.py similarity index 100% rename from examples/inference/gradio/gradio_frontend.py rename to examples/inference/gradio/serving/gradio_frontend.py diff --git a/examples/inference/gradio/ray_serve_backend.py b/examples/inference/gradio/serving/ray_serve_backend.py similarity index 100% rename from examples/inference/gradio/ray_serve_backend.py rename to examples/inference/gradio/serving/ray_serve_backend.py diff --git a/examples/inference/gradio/start.sh b/examples/inference/gradio/serving/start.sh similarity index 100% rename from examples/inference/gradio/start.sh rename to examples/inference/gradio/serving/start.sh diff --git a/examples/inference/gradio/start_ray_serve_app.py b/examples/inference/gradio/serving/start_ray_serve_app.py similarity index 100% rename from examples/inference/gradio/start_ray_serve_app.py rename to examples/inference/gradio/serving/start_ray_serve_app.py diff --git a/fastvideo/entrypoints/video_generator.py b/fastvideo/entrypoints/video_generator.py index 2606ecb56..5d232c77c 100644 --- a/fastvideo/entrypoints/video_generator.py +++ b/fastvideo/entrypoints/video_generator.py @@ -8,6 +8,7 @@ import math import os +import re import time from copy import deepcopy from typing import Any @@ -110,7 +111,7 @@ def generate_video( prompt: The prompt to use for generation (optional if prompt_txt is provided) negative_prompt: The negative prompt to use (overrides the one in fastvideo_args) output_path: Path to save the video (overrides the one in fastvideo_args) - output_video_name: Name of the video file to save. Default is the first 100 characters of the prompt. 
+ prompt_path: Path to prompt file save_video: Whether to save the video to disk return_frames: Whether to return the raw frames num_inference_steps: Number of denoising steps (overrides fastvideo_args) @@ -127,8 +128,13 @@ def generate_video( Either the output dictionary, list of frames, or list of results for batch processing """ # Handle batch processing from text file - if self.fastvideo_args.prompt_txt is not None: - prompt_txt_path = self.fastvideo_args.prompt_txt + if sampling_param is None: + sampling_param = SamplingParam.from_pretrained( + self.fastvideo_args.model_path) + sampling_param.update(kwargs) + + if self.fastvideo_args.prompt_txt is not None or sampling_param.prompt_path is not None: + prompt_txt_path = sampling_param.prompt_path or self.fastvideo_args.prompt_txt if not os.path.exists(prompt_txt_path): raise FileNotFoundError( f"Prompt text file not found: {prompt_txt_path}") @@ -142,22 +148,19 @@ def generate_video( logger.info("Found %d prompts in %s", len(prompts), prompt_txt_path) - if sampling_param is not None: - original_output_video_name = sampling_param.output_video_name - else: - original_output_video_name = None - results = [] for i, batch_prompt in enumerate(prompts): logger.info("Processing prompt %d/%d: %s...", i + 1, len(prompts), batch_prompt[:100]) - try: # Generate video for this prompt using the same logic below - if sampling_param is not None and original_output_video_name is not None: - sampling_param.output_video_name = original_output_video_name + f"_{i}" + output_path = self._prepare_output_path( + sampling_param.output_path, batch_prompt) + kwargs["output_path"] = output_path result = self._generate_single_video( - batch_prompt, sampling_param, **kwargs) + prompt=batch_prompt, + sampling_param=sampling_param, + **kwargs) # Add prompt info to result if isinstance(result, dict): @@ -181,8 +184,73 @@ def generate_video( # Single prompt generation (original behavior) if prompt is None: raise ValueError("Either prompt or prompt_txt must be provided") + output_path = self._prepare_output_path(sampling_param.output_path, + prompt) + kwargs["output_path"] = output_path + return self._generate_single_video(prompt=prompt, + sampling_param=sampling_param, + **kwargs) + + def _prepare_output_path( + self, + output_path: str, + prompt: str, + ) -> str: + """Build a unique, sanitized .mp4 output file path. + + - If `output_path` ends with .mp4 (case-insensitive), treat it as a file path. + - Otherwise, treat `output_path` as a directory and derive the filename + from the prompt. + - Invalid filename characters are removed; if the name changes, a + warning is logged. + - If the target path already exists, a numeric suffix is appended. + """ - return self._generate_single_video(prompt, sampling_param, **kwargs) + def _sanitize_filename_component(name: str) -> str: + # Remove characters invalid on common filesystems, strip spaces/dots + sanitized = re.sub(r'[\\/:*?"<>|]', '', name) + sanitized = sanitized.strip().strip('.') + sanitized = re.sub(r'\s+', ' ', sanitized) + return sanitized or "video" + + base_path, extension = os.path.splitext(output_path) + extension_lower = extension.lower() + + if extension_lower == ".mp4": + output_dir = os.path.dirname(output_path) + base_name = os.path.basename( + base_path) # filename without extension + sanitized_base = _sanitize_filename_component(base_name) + if sanitized_base != base_name: + logger.warning( + "The video name '%s' contained invalid characters. 
It has been renamed to '%s.mp4'", + os.path.basename(output_path), + sanitized_base, + ) + video_name = f"{sanitized_base}.mp4" + else: + # Treat as directory; inform if an unexpected extension was provided. + if extension: + logger.info( + "Output path '%s' has non-mp4 extension '%s'; treating it as a directory and using a .mp4 filename derived from the prompt", + output_path, + extension, + ) + output_dir = output_path + prompt_component = _sanitize_filename_component(prompt[:100]) + video_name = f"{prompt_component}.mp4" + + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + new_output_path = os.path.join(output_dir, video_name) + counter = 1 + while os.path.exists(new_output_path): + name_part, ext_part = os.path.splitext(video_name) + new_video_name = f"{name_part}_{counter}{ext_part}" + new_output_path = os.path.join(output_dir, new_video_name) + counter += 1 + return new_output_path def _generate_single_video( self, @@ -200,15 +268,9 @@ def _generate_single_video( raise TypeError( f"`prompt` must be a string, but got {type(prompt)}") prompt = prompt.strip() - if sampling_param is None: - sampling_param = SamplingParam.from_pretrained( - fastvideo_args.model_path) - else: - sampling_param = deepcopy(sampling_param) - - kwargs["prompt"] = prompt - sampling_param.update(kwargs) - + sampling_param = deepcopy(sampling_param) + output_path = kwargs["output_path"] + sampling_param.prompt = prompt # Process negative prompt if sampling_param.negative_prompt is not None: sampling_param.negative_prompt = sampling_param.negative_prompt.strip( @@ -277,7 +339,7 @@ def _generate_single_video( height: {target_height} width: {target_width} video_length: {sampling_param.num_frames} - prompt: {prompt} + prompt: {sampling_param.prompt} image_path: {sampling_param.image_path} neg_prompt: {sampling_param.negative_prompt} seed: {sampling_param.seed} @@ -288,7 +350,7 @@ def _generate_single_video( flow_shift: {fastvideo_args.pipeline_config.flow_shift} embedded_guidance_scale: {fastvideo_args.pipeline_config.embedded_cfg_scale} save_video: {sampling_param.save_video} - output_path: {sampling_param.output_path} + output_path: {output_path} """ # type: ignore[attr-defined] logger.info(debug_str) @@ -300,10 +362,6 @@ def _generate_single_video( VSA_sparsity=fastvideo_args.VSA_sparsity, ) - # Use prompt[:100] for video name - if batch.output_video_name is None: - batch.output_video_name = prompt[:100] - # Run inference start_time = time.perf_counter() output_batch = self.executor.execute_forward(batch, fastvideo_args) @@ -323,15 +381,8 @@ def _generate_single_video( # Save video if requested if batch.save_video: - output_path = batch.output_path - if output_path: - os.makedirs(output_path, exist_ok=True) - video_path = os.path.join(output_path, - f"{batch.output_video_name}.mp4") - imageio.mimsave(video_path, frames, fps=batch.fps, format="mp4") - logger.info("Saved video to %s", video_path) - else: - logger.warning("No output path provided, video not saved") + imageio.mimsave(output_path, frames, fps=batch.fps, format="mp4") + logger.info("Saved video to %s", output_path) if batch.return_frames: return frames diff --git a/fastvideo/platforms/cuda.py b/fastvideo/platforms/cuda.py index b6c4cc32d..39b593026 100644 --- a/fastvideo/platforms/cuda.py +++ b/fastvideo/platforms/cuda.py @@ -181,7 +181,11 @@ def get_attn_backend_cls(cls, selected_backend: AttentionBackendEnum | None, "Failed to import Video Sparse Attention backend: %s", str(e)) raise ImportError( - "Video Sparse Attention backend is not 
installed. ") from e + "The Video Sparse Attention backend is not installed. " + "To install it, please follow the instructions at: " + "https://hao-ai-lab.github.io/FastVideo/video_sparse_attention/installation.html " + ) from e + elif selected_backend == AttentionBackendEnum.VMOBA_ATTN: try: from csrc.attn.vmoba_attn.vmoba import ( # noqa: F401 diff --git a/fastvideo/tests/entrypoints/test_video_generator.py b/fastvideo/tests/entrypoints/test_video_generator.py new file mode 100644 index 000000000..d53b17b3e --- /dev/null +++ b/fastvideo/tests/entrypoints/test_video_generator.py @@ -0,0 +1,78 @@ +import os + +from fastvideo.entrypoints.video_generator import VideoGenerator + + +def _new_video_generator() -> VideoGenerator: + # Bypass __init__ since we only test a pure helper method. + return VideoGenerator.__new__(VideoGenerator) + + +def test_prepare_output_path_file_sanitization(tmp_path): + vg = _new_video_generator() + target_dir = tmp_path / "dir" + raw_path = target_dir / "inv:al*id?.mp4" + + result = vg._prepare_output_path(str(raw_path), prompt="ignored") + + assert os.path.dirname(result) == str(target_dir) + assert os.path.basename(result) == "invalid.mp4" + assert os.path.isdir(target_dir) + + +def test_prepare_output_path_directory_prompt_derived(tmp_path): + vg = _new_video_generator() + out_dir = tmp_path / "outputs" + prompt = "Hello:/\\*?<>| world" + + result = vg._prepare_output_path(str(out_dir), prompt=prompt) + + assert os.path.dirname(result) == str(out_dir) + # spaces are preserved (collapsed) by sanitizer; here it becomes "Hello world.mp4" + assert os.path.basename(result) == "Hello world.mp4" + assert os.path.isdir(out_dir) + + +def test_prepare_output_path_non_mp4_treated_as_dir(tmp_path): + vg = _new_video_generator() + weird_dir = tmp_path / "foo.gif" + prompt = "My Video" + + result = vg._prepare_output_path(str(weird_dir), prompt=prompt) + + assert os.path.dirname(result) == str(weird_dir) + assert os.path.basename(result) == "My Video.mp4" + assert os.path.isdir(weird_dir) + + +def test_prepare_output_path_uniqueness_suffix(tmp_path): + vg = _new_video_generator() + out_dir = tmp_path / "outputs" + prompt = "Sample Name" + + first = vg._prepare_output_path(str(out_dir), prompt=prompt) + # simulate existing file + os.makedirs(os.path.dirname(first), exist_ok=True) + with open(first, "wb") as f: + f.write(b"") + + second = vg._prepare_output_path(str(out_dir), prompt=prompt) + assert os.path.basename(second) == "Sample Name_1.mp4" + + # simulate second existing file as well + with open(second, "wb") as f: + f.write(b"") + third = vg._prepare_output_path(str(out_dir), prompt=prompt) + assert os.path.basename(third) == "Sample Name_2.mp4" + + +def test_prepare_output_path_empty_prompt_fallback(tmp_path): + vg = _new_video_generator() + out_dir = tmp_path / "outputs" + bad_prompt = ":/\\*?<>| .." 
# sanitizes to empty, should fallback to "video" + + result = vg._prepare_output_path(str(out_dir), prompt=bad_prompt) + + assert os.path.dirname(result) == str(out_dir) + assert os.path.basename(result) == "video.mp4" + diff --git a/fastvideo/tests/modal/pr_test.py b/fastvideo/tests/modal/pr_test.py index 05d3ae5c6..937ab95a3 100644 --- a/fastvideo/tests/modal/pr_test.py +++ b/fastvideo/tests/modal/pr_test.py @@ -124,4 +124,4 @@ def run_self_forcing_tests(): @app.function(gpu="L40S:1", image=image, timeout=900) def run_unit_test(): - run_test("pytest ./fastvideo/tests/dataset/ ./fastvideo/tests/workflow/ -vs") + run_test("pytest ./fastvideo/tests/dataset/ ./fastvideo/tests/workflow/ ./fastvideo/tests/entrypoints/ -vs") diff --git a/fastvideo/tests/ssim/L40S_reference_videos/FastHunyuan-diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 b/fastvideo/tests/ssim/L40S_reference_videos/FastHunyuan-diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 similarity index 100% rename from fastvideo/tests/ssim/L40S_reference_videos/FastHunyuan-diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 rename to fastvideo/tests/ssim/L40S_reference_videos/FastHunyuan-diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 diff --git a/fastvideo/tests/ssim/L40S_reference_videos/FastHunyuan-diffusers/TORCH_SDPA/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 b/fastvideo/tests/ssim/L40S_reference_videos/FastHunyuan-diffusers/TORCH_SDPA/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 similarity index 100% rename from fastvideo/tests/ssim/L40S_reference_videos/FastHunyuan-diffusers/TORCH_SDPA/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 rename to fastvideo/tests/ssim/L40S_reference_videos/FastHunyuan-diffusers/TORCH_SDPA/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 diff --git a/fastvideo/tests/ssim/L40S_reference_videos/SFWan2.1-T2V-1.3B-Diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 b/fastvideo/tests/ssim/L40S_reference_videos/SFWan2.1-T2V-1.3B-Diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 similarity index 100% rename from fastvideo/tests/ssim/L40S_reference_videos/SFWan2.1-T2V-1.3B-Diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 rename to fastvideo/tests/ssim/L40S_reference_videos/SFWan2.1-T2V-1.3B-Diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 diff --git a/fastvideo/tests/ssim/L40S_reference_videos/Wan2.1-T2V-1.3B-Diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 b/fastvideo/tests/ssim/L40S_reference_videos/Wan2.1-T2V-1.3B-Diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 similarity index 100% rename from 
fastvideo/tests/ssim/L40S_reference_videos/Wan2.1-T2V-1.3B-Diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 rename to fastvideo/tests/ssim/L40S_reference_videos/Wan2.1-T2V-1.3B-Diffusers/FLASH_ATTN/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 diff --git a/fastvideo/tests/ssim/L40S_reference_videos/Wan2.1-T2V-1.3B-Diffusers/TORCH_SDPA/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 b/fastvideo/tests/ssim/L40S_reference_videos/Wan2.1-T2V-1.3B-Diffusers/TORCH_SDPA/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 similarity index 100% rename from fastvideo/tests/ssim/L40S_reference_videos/Wan2.1-T2V-1.3B-Diffusers/TORCH_SDPA/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 rename to fastvideo/tests/ssim/L40S_reference_videos/Wan2.1-T2V-1.3B-Diffusers/TORCH_SDPA/Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of.mp4 diff --git a/fastvideo/tests/ssim/test_causal_similarity.py b/fastvideo/tests/ssim/test_causal_similarity.py index 3154873cc..48217a01d 100644 --- a/fastvideo/tests/ssim/test_causal_similarity.py +++ b/fastvideo/tests/ssim/test_causal_similarity.py @@ -19,6 +19,9 @@ device_reference_folder = "A40" + device_reference_folder_suffix elif "L40S" in device_name: device_reference_folder = "L40S" + device_reference_folder_suffix +else: + # device_reference_folder = "L40S" + device_reference_folder_suffix + raise ValueError(f"Unsupported device for ssim tests: {device_name}") # Base parameters from the shell script @@ -69,7 +72,7 @@ def test_causal_similarity(prompt, ATTENTION_BACKEND, model_id): base_output_dir = os.path.join(script_dir, 'generated_videos', model_id) output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND) - output_video_name = f"{prompt[:100]}.mp4" + output_video_name = f"{prompt[:100].strip()}.mp4" os.makedirs(output_dir, exist_ok=True) @@ -119,7 +122,7 @@ def test_causal_similarity(prompt, ATTENTION_BACKEND, model_id): reference_video_name = None for filename in os.listdir(reference_folder): - if filename.endswith('.mp4') and prompt[:100] in filename: + if filename.endswith('.mp4') and prompt[:100].strip() in filename: reference_video_name = filename break diff --git a/fastvideo/tests/ssim/test_inference_similarity.py b/fastvideo/tests/ssim/test_inference_similarity.py index 608586897..0e5d1c683 100644 --- a/fastvideo/tests/ssim/test_inference_similarity.py +++ b/fastvideo/tests/ssim/test_inference_similarity.py @@ -19,6 +19,9 @@ device_reference_folder = "A40" + device_reference_folder_suffix elif "L40S" in device_name: device_reference_folder = "L40S" + device_reference_folder_suffix +else: + # device_reference_folder = "L40S" + device_reference_folder_suffix + raise ValueError(f"Unsupported device for ssim tests: {device_name}") # Base parameters from the shell script HUNYUAN_PARAMS = { @@ -115,7 +118,7 @@ def test_i2v_inference_similarity(prompt, ATTENTION_BACKEND, model_id): base_output_dir = os.path.join(script_dir, 'generated_videos', model_id) output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND) - output_video_name = f"{prompt[:100]}.mp4" + output_video_name = f"{prompt[:100].strip()}.mp4" os.makedirs(output_dir, exist_ok=True) @@ -170,7 +173,7 @@ def test_i2v_inference_similarity(prompt, 
ATTENTION_BACKEND, model_id): reference_video_name = None for filename in os.listdir(reference_folder): - if filename.endswith('.mp4') and prompt[:100] in filename: + if filename.endswith('.mp4') and prompt[:100].strip() in filename: reference_video_name = filename break @@ -216,7 +219,7 @@ def test_inference_similarity(prompt, ATTENTION_BACKEND, model_id): base_output_dir = os.path.join(script_dir, 'generated_videos', model_id) output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND) - output_video_name = f"{prompt[:100]}.mp4" + output_video_name = f"{prompt[:100].strip()}.mp4" os.makedirs(output_dir, exist_ok=True) @@ -270,7 +273,7 @@ def test_inference_similarity(prompt, ATTENTION_BACKEND, model_id): reference_video_name = None for filename in os.listdir(reference_folder): - if filename.endswith('.mp4') and prompt[:100] in filename: + if filename.endswith('.mp4') and prompt[:100].strip() in filename: reference_video_name = filename break diff --git a/pyproject.toml b/pyproject.toml index 345a17a61..60b69fca4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ dependencies = [ "pytest", "PyYAML==6.0.1", "protobuf>=5.28.3", - "gradio>=5.22.0", + "gradio==5.32.0", "moviepy>=2.0.0", "flask", "flask_restful",