huggingface · HuggingFaceInfra · Dec 6, 2025
diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md
@@ -173,6 +173,30 @@ This part of the lib is still under development and will be improved in future r
 
 
 
+## image_text_to_image
+
+[[autodoc]] huggingface_hub.ImageTextToImageInput
+
+[[autodoc]] huggingface_hub.ImageTextToImageOutput
+
+[[autodoc]] huggingface_hub.ImageTextToImageParameters
+
+[[autodoc]] huggingface_hub.ImageTextToImageTargetSize
+
+
+
+## image_text_to_video
+
+[[autodoc]] huggingface_hub.ImageTextToVideoInput
+
+[[autodoc]] huggingface_hub.ImageTextToVideoOutput
+
+[[autodoc]] huggingface_hub.ImageTextToVideoParameters
+
+[[autodoc]] huggingface_hub.ImageTextToVideoTargetSize
+
+
+
 ## image_to_image
 
 [[autodoc]] huggingface_hub.ImageToImageInput

diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md
@@ -172,6 +172,30 @@ rendered properly in your Markdown viewer.
 
 
 
+## image_text_to_image[[huggingface_hub.ImageTextToImageInput]]
+
+[[autodoc]] huggingface_hub.ImageTextToImageInput
+
+[[autodoc]] huggingface_hub.ImageTextToImageOutput
+
+[[autodoc]] huggingface_hub.ImageTextToImageParameters
+
+[[autodoc]] huggingface_hub.ImageTextToImageTargetSize
+
+
+
+## image_text_to_video[[huggingface_hub.ImageTextToVideoInput]]
+
+[[autodoc]] huggingface_hub.ImageTextToVideoInput
+
+[[autodoc]] huggingface_hub.ImageTextToVideoOutput
+
+[[autodoc]] huggingface_hub.ImageTextToVideoParameters
+
+[[autodoc]] huggingface_hub.ImageTextToVideoTargetSize
+
+
+
 ## image_to_image[[huggingface_hub.ImageToImageInput]]
 
 [[autodoc]] huggingface_hub.ImageToImageInput

diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py
@@ -77,6 +77,18 @@
     ImageSegmentationParameters,
     ImageSegmentationSubtask,
 )
+from .image_text_to_image import (
+    ImageTextToImageInput,
+    ImageTextToImageOutput,
+    ImageTextToImageParameters,
+    ImageTextToImageTargetSize,
+)
+from .image_text_to_video import (
+    ImageTextToVideoInput,
+    ImageTextToVideoOutput,
+    ImageTextToVideoParameters,
+    ImageTextToVideoTargetSize,
+)
 from .image_to_image import ImageToImageInput, ImageToImageOutput, ImageToImageParameters, ImageToImageTargetSize
 from .image_to_text import (
     ImageToTextEarlyStoppingEnum,

diff --git a/src/huggingface_hub/inference/_generated/types/image_text_to_image.py b/src/huggingface_hub/inference/_generated/types/image_text_to_image.py
@@ -0,0 +1,67 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ImageTextToImageTargetSize(BaseInferenceType):
+    """The size in pixels of the output image. This parameter is only supported by some
+    providers and for specific models. It will be ignored when unsupported.
+    """
+
+    height: int
+    width: int
+
+
+@dataclass_with_extra
+class ImageTextToImageParameters(BaseInferenceType):
+    """Additional inference parameters for Image Text To Image"""
+
+    guidance_scale: Optional[float] = None
+    """For diffusion models. A higher guidance scale value encourages the model to generate
+    images closely linked to the text prompt at the expense of lower image quality.
+    """
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in image generation."""
+    num_inference_steps: Optional[int] = None
+    """For diffusion models. The number of denoising steps. More denoising steps usually lead to
+    a higher quality image at the expense of slower inference.
+    """
+    prompt: Optional[str] = None
+    """The text prompt to guide the image generation. Either this or inputs (image) must be
+    provided.
+    """
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+    target_size: Optional[ImageTextToImageTargetSize] = None
+    """The size in pixels of the output image. This parameter is only supported by some
+    providers and for specific models. It will be ignored when unsupported.
+    """
+
+
+@dataclass_with_extra
+class ImageTextToImageInput(BaseInferenceType):
+    """Inputs for Image Text To Image inference. Either inputs (image) or prompt (in parameters)
+    must be provided, or both.
+    """
+
+    inputs: Optional[str] = None
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload. Either this or prompt must be
+    provided.
+    """
+    parameters: Optional[ImageTextToImageParameters] = None
+    """Additional inference parameters for Image Text To Image"""
+
+
+@dataclass_with_extra
+class ImageTextToImageOutput(BaseInferenceType):
+    """Outputs of inference for the Image Text To Image task"""
+
+    image: Any
+    """The generated image returned as raw bytes in the payload."""
diff --git a/src/huggingface_hub/inference/_generated/types/image_text_to_video.py b/src/huggingface_hub/inference/_generated/types/image_text_to_video.py
@@ -0,0 +1,65 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Any, Optional
+
+from .base import BaseInferenceType, dataclass_with_extra
+
+
+@dataclass_with_extra
+class ImageTextToVideoTargetSize(BaseInferenceType):
+    """The size in pixels of the output video frames."""
+
+    height: int
+    width: int
+
+
+@dataclass_with_extra
+class ImageTextToVideoParameters(BaseInferenceType):
+    """Additional inference parameters for Image Text To Video"""
+
+    guidance_scale: Optional[float] = None
+    """For diffusion models. A higher guidance scale value encourages the model to generate
+    videos closely linked to the text prompt at the expense of lower video quality.
+    """
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in video generation."""
+    num_frames: Optional[float] = None
+    """The num_frames parameter determines how many video frames are generated."""
+    num_inference_steps: Optional[int] = None
+    """The number of denoising steps. More denoising steps usually lead to a higher quality
+    video at the expense of slower inference.
+    """
+    prompt: Optional[str] = None
+    """The text prompt to guide the video generation. Either this or inputs (image) must be
+    provided.
+    """
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+    target_size: Optional[ImageTextToVideoTargetSize] = None
+    """The size in pixels of the output video frames."""
+
+
+@dataclass_with_extra
+class ImageTextToVideoInput(BaseInferenceType):
+    """Inputs for Image Text To Video inference. Either inputs (image) or prompt (in parameters)
+    must be provided, or both.
+    """
+
+    inputs: Optional[str] = None
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload. Either this or prompt must be
+    provided.
+    """
+    parameters: Optional[ImageTextToVideoParameters] = None
+    """Additional inference parameters for Image Text To Video"""
+
+
+@dataclass_with_extra
+class ImageTextToVideoOutput(BaseInferenceType):
+    """Outputs of inference for the Image Text To Video task"""
+
+    video: Any
+    """The generated video returned as raw bytes in the payload."""
diff --git a/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py b/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py
@@ -3,6 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+
 from .base import BaseInferenceType, dataclass_with_extra
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,6 +3,7 @@ @@
     # See:
     #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
     #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
     from .base import BaseInferenceType, dataclass_with_extra
@@ Expand Down @@