From 4374a1e8ecd0a882057c9310927ee689c89e4a61 Mon Sep 17 00:00:00 2001
From: Wauplin <11801849+Wauplin@users.noreply.github.com>
Date: Sat, 6 Dec 2025 03:19:55 +0000
Subject: [PATCH] Update inference types (automated commit)

---
 .../en/package_reference/inference_types.md  | 24 +++++++
 .../ko/package_reference/inference_types.md  | 24 +++++++
 .../inference/_generated/types/__init__.py   | 12 ++++
 .../_generated/types/image_text_to_image.py  | 67 +++++++++++++++++++
 .../_generated/types/image_text_to_video.py  | 65 ++++++++++++++++++
 .../types/zero_shot_object_detection.py      |  1 +
 6 files changed, 193 insertions(+)
 create mode 100644 src/huggingface_hub/inference/_generated/types/image_text_to_image.py
 create mode 100644 src/huggingface_hub/inference/_generated/types/image_text_to_video.py

diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md
index 535994221d..ac9d9b10ca 100644
--- a/docs/source/en/package_reference/inference_types.md
+++ b/docs/source/en/package_reference/inference_types.md
@@ -173,6 +173,30 @@ This part of the lib is still under development and will be improved in future r



+## image_text_to_image
+
+[[autodoc]] huggingface_hub.ImageTextToImageInput
+
+[[autodoc]] huggingface_hub.ImageTextToImageOutput
+
+[[autodoc]] huggingface_hub.ImageTextToImageParameters
+
+[[autodoc]] huggingface_hub.ImageTextToImageTargetSize
+
+
+
+## image_text_to_video
+
+[[autodoc]] huggingface_hub.ImageTextToVideoInput
+
+[[autodoc]] huggingface_hub.ImageTextToVideoOutput
+
+[[autodoc]] huggingface_hub.ImageTextToVideoParameters
+
+[[autodoc]] huggingface_hub.ImageTextToVideoTargetSize
+
+
+
 ## image_to_image

 [[autodoc]] huggingface_hub.ImageToImageInput
diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md
index 84dda55956..c579e3e725 100644
--- a/docs/source/ko/package_reference/inference_types.md
+++ b/docs/source/ko/package_reference/inference_types.md
@@ -172,6 +172,30 @@ rendered properly in your Markdown viewer.



+## image_text_to_image[[huggingface_hub.ImageTextToImageInput]]
+
+[[autodoc]] huggingface_hub.ImageTextToImageInput
+
+[[autodoc]] huggingface_hub.ImageTextToImageOutput
+
+[[autodoc]] huggingface_hub.ImageTextToImageParameters
+
+[[autodoc]] huggingface_hub.ImageTextToImageTargetSize
+
+
+
+## image_text_to_video[[huggingface_hub.ImageTextToVideoInput]]
+
+[[autodoc]] huggingface_hub.ImageTextToVideoInput
+
+[[autodoc]] huggingface_hub.ImageTextToVideoOutput
+
+[[autodoc]] huggingface_hub.ImageTextToVideoParameters
+
+[[autodoc]] huggingface_hub.ImageTextToVideoTargetSize
+
+
+
 ## image_to_image[[huggingface_hub.ImageToImageInput]]

 [[autodoc]] huggingface_hub.ImageToImageInput
diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py
index bfffc0ae3b..9f95dca555 100644
--- a/src/huggingface_hub/inference/_generated/types/__init__.py
+++ b/src/huggingface_hub/inference/_generated/types/__init__.py
@@ -77,6 +77,18 @@
     ImageSegmentationParameters,
     ImageSegmentationSubtask,
 )
+from .image_text_to_image import (
+    ImageTextToImageInput,
+    ImageTextToImageOutput,
+    ImageTextToImageParameters,
+    ImageTextToImageTargetSize,
+)
+from .image_text_to_video import (
+    ImageTextToVideoInput,
+    ImageTextToVideoOutput,
+    ImageTextToVideoParameters,
+    ImageTextToVideoTargetSize,
+)
 from .image_to_image import ImageToImageInput, ImageToImageOutput, ImageToImageParameters, ImageToImageTargetSize
 from .image_to_text import (
     ImageToTextEarlyStoppingEnum,
diff --git a/src/huggingface_hub/inference/_generated/types/image_text_to_image.py b/src/huggingface_hub/inference/_generated/types/image_text_to_image.py
new file mode 100644
index 0000000000..1ddd15335a
--- /dev/null
+++ b/src/huggingface_hub/inference/_generated/types/image_text_to_image.py
@@ -0,0 +1,67 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+# - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+# - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ImageTextToImageTargetSize(BaseInferenceType):
+    """The size in pixels of the output image. This parameter is only supported by some
+    providers and for specific models. It will be ignored when unsupported.
+    """
+
+    height: int
+    width: int
+
+
+@dataclass_with_extra
+class ImageTextToImageParameters(BaseInferenceType):
+    """Additional inference parameters for Image Text To Image"""
+
+    guidance_scale: Optional[float] = None
+    """For diffusion models. A higher guidance scale value encourages the model to generate
+    images closely linked to the text prompt at the expense of lower image quality.
+    """
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in image generation."""
+    num_inference_steps: Optional[int] = None
+    """For diffusion models. The number of denoising steps. More denoising steps usually lead to
+    a higher quality image at the expense of slower inference.
+    """
+    prompt: Optional[str] = None
+    """The text prompt to guide the image generation. Either this or inputs (image) must be
+    provided.
+    """
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+    target_size: Optional[ImageTextToImageTargetSize] = None
+    """The size in pixels of the output image. This parameter is only supported by some
+    providers and for specific models. It will be ignored when unsupported.
+    """
+
+
+@dataclass_with_extra
+class ImageTextToImageInput(BaseInferenceType):
+    """Inputs for Image Text To Image inference. Either inputs (image) or prompt (in parameters)
+    must be provided, or both.
+    """
+
+    inputs: Optional[str] = None
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload. Either this or prompt must be
+    provided.
+    """
+    parameters: Optional[ImageTextToImageParameters] = None
+    """Additional inference parameters for Image Text To Image"""
+
+
+@dataclass_with_extra
+class ImageTextToImageOutput(BaseInferenceType):
+    """Outputs of inference for the Image Text To Image task"""
+
+    image: Any
+    """The generated image returned as raw bytes in the payload."""
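
Note: the new module above only defines request/response dataclasses; this patch does not add a high-level `InferenceClient` method for the task. As a minimal usage sketch (imports taken from the `__init__.py` change above; the docs entries suggest the same names are also exposed as `huggingface_hub.ImageTextToImage*`; the image path, prompt, and parameter values are placeholders, not part of the patch), a payload could be assembled like this:

import base64

from huggingface_hub.inference._generated.types import (
    ImageTextToImageInput,
    ImageTextToImageParameters,
    ImageTextToImageTargetSize,
)

# Encode the input image as base64, as described by the `inputs` docstring.
with open("cat.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = ImageTextToImageInput(
    inputs=image_b64,
    parameters=ImageTextToImageParameters(
        prompt="Turn the cat into a watercolor painting",
        guidance_scale=7.5,
        num_inference_steps=30,
        seed=42,
        target_size=ImageTextToImageTargetSize(height=1024, width=1024),
    ),
)
print(payload.parameters.prompt)

All parameter fields default to None, so only the options a caller actually wants to send need to be set.
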
diff --git a/src/huggingface_hub/inference/_generated/types/image_text_to_video.py b/src/huggingface_hub/inference/_generated/types/image_text_to_video.py
new file mode 100644
index 0000000000..58b3a4f24e
--- /dev/null
+++ b/src/huggingface_hub/inference/_generated/types/image_text_to_video.py
@@ -0,0 +1,65 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+# - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+# - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ImageTextToVideoTargetSize(BaseInferenceType):
+    """The size in pixels of the output video frames."""
+
+    height: int
+    width: int
+
+
+@dataclass_with_extra
+class ImageTextToVideoParameters(BaseInferenceType):
+    """Additional inference parameters for Image Text To Video"""
+
+    guidance_scale: Optional[float] = None
+    """For diffusion models. A higher guidance scale value encourages the model to generate
+    videos closely linked to the text prompt at the expense of lower image quality.
+    """
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in video generation."""
+    num_frames: Optional[float] = None
+    """The num_frames parameter determines how many video frames are generated."""
+    num_inference_steps: Optional[int] = None
+    """The number of denoising steps. More denoising steps usually lead to a higher quality
+    video at the expense of slower inference.
+    """
+    prompt: Optional[str] = None
+    """The text prompt to guide the video generation. Either this or inputs (image) must be
+    provided.
+    """
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+    target_size: Optional[ImageTextToVideoTargetSize] = None
+    """The size in pixels of the output video frames."""
+
+
+@dataclass_with_extra
+class ImageTextToVideoInput(BaseInferenceType):
+    """Inputs for Image Text To Video inference. Either inputs (image) or prompt (in parameters)
+    must be provided, or both.
+    """
+
+    inputs: Optional[str] = None
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload. Either this or prompt must be
+    provided.
+    """
+    parameters: Optional[ImageTextToVideoParameters] = None
+    """Additional inference parameters for Image Text To Video"""
+
+
+@dataclass_with_extra
+class ImageTextToVideoOutput(BaseInferenceType):
+    """Outputs of inference for the Image Text To Video task"""
+
+    video: Any
+    """The generated video returned as raw bytes in the payload."""
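
The video task mirrors the image task, with `num_frames` on the parameters and a `video` output field instead of `image`. A minimal deserialization sketch (assuming the `parse_obj_as_instance` helper that these generated types inherit from `BaseInferenceType`; the dict values are placeholders, not part of the patch):

from huggingface_hub.inference._generated.types import ImageTextToVideoInput

raw = {
    "inputs": "<base64-encoded image>",
    "parameters": {
        "prompt": "The camera slowly pans across a snowy forest",
        "num_frames": 48,
        "num_inference_steps": 25,
        "seed": 0,
    },
}

# Parse the raw dict into the generated dataclasses; fields not present in the
# dict stay at their None defaults, and extra keys are kept thanks to
# `dataclass_with_extra`.
request = ImageTextToVideoInput.parse_obj_as_instance(raw)
print(request.parameters.num_frames)
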
diff --git a/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py b/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py
index e981463b25..d9512c77fd 100644
--- a/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py
+++ b/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py
@@ -3,6 +3,7 @@
 # See:
 # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+
 from .base import BaseInferenceType, dataclass_with_extra