add feature Hunyuan_Video_Model
VikramxD committed Jan 17, 2025
1 parent 085907f commit 1f1201f
Showing 3 changed files with 261 additions and 21 deletions.
1 change: 1 addition & 0 deletions configs/__init__.py
@@ -0,0 +1 @@

74 changes: 74 additions & 0 deletions configs/hunyuan_config.py
@@ -0,0 +1,74 @@
"""
Configuration for Hunyuan Video Generation.
This module contains the configuration settings for the Hunyuan video generation model.
Settings can be configured via environment variables with the VIDEO_GEN_ prefix.
Example:
To use with default settings:
>>> from configs.hunyuan_config import HunyuanConfig
>>> config = HunyuanConfig()
To override via environment variables:
$ export VIDEO_GEN_MODEL_ID="custom/model"
$ export VIDEO_GEN_DEVICE_MAP="cuda"
To override in code:
>>> config = HunyuanConfig(
... model_id="custom/model",
... device_map="cuda"
... )
"""

from pydantic_settings import BaseSettings


class HunyuanConfig(BaseSettings):
"""Configuration settings for Hunyuan Video Generation.
This class uses Pydantic's BaseSettings to manage configuration with environment
variable support. All settings can be overridden via environment variables
with the prefix VIDEO_GEN_.
Attributes:
model_id (str): The Huggingface model ID for the Hunyuan video model.
Default: "hunyuanvideo-community/HunyuanVideo"
device_map (str): Strategy for mapping model layers to devices.
Options: "auto", "balanced", "sequential", or specific device like "cuda:0".
Default: "balanced"
load_in_8bit (bool): Whether to load the model in 8-bit quantization.
Reduces memory usage at the cost of slight quality degradation.
Default: True
torch_dtype (str): PyTorch data type for model weights.
Options: "float16", "float32", "bfloat16".
Default: "float16"
output_dir (str): Directory where generated videos will be saved.
Default: "outputs"
default_fps (int): Default frames per second for generated videos.
Default: 15
Example:
>>> config = HunyuanConfig(
... model_id="custom/model",
... device_map="cuda",
... output_dir="custom_outputs"
... )
>>> print(config.model_id)
'custom/model'
"""

model_id: str = "hunyuanvideo-community/HunyuanVideo"
device_map: str = "balanced"
load_in_8bit: bool = True
torch_dtype: str = "float16"
output_dir: str = "outputs"
default_fps: int = 15

class Config:
"""Pydantic configuration class.
Attributes:
env_prefix: Prefix for environment variables.
Example: VIDEO_GEN_MODEL_ID will set the model_id attribute.
"""
env_prefix = "VIDEO_GEN_"
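A quick sanity check of the precedence rules above (a minimal sketch, assuming pydantic-settings' standard resolution order of init arguments over environment variables over field defaults):

import os

from configs.hunyuan_config import HunyuanConfig

# Environment variables with the VIDEO_GEN_ prefix override the field defaults;
# values are coerced to the declared field types (default_fps becomes an int).
os.environ["VIDEO_GEN_DEVICE_MAP"] = "cuda:0"
os.environ["VIDEO_GEN_DEFAULT_FPS"] = "24"

config = HunyuanConfig()
print(config.device_map, config.default_fps)  # cuda:0 24

# Keyword arguments passed at construction take precedence over the environment.
config = HunyuanConfig(device_map="balanced")
print(config.device_map)  # balanced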
207 changes: 186 additions & 21 deletions scripts/hunyuan_video_inference.py
@@ -3,38 +3,203 @@
This script provides functionality for generating videos using the Hunyuan AI
model. It handles model initialization, video generation configuration, and
output saving.

The script uses Pydantic settings for configuration management; all settings
can be overridden via environment variables with the VIDEO_GEN_ prefix or
programmatically.

Example:
    Basic usage with default settings:

    >>> from scripts.hunyuan_video_inference import HunyuanVideoInference
    >>> generator = HunyuanVideoInference()
    >>> video_path = generator.generate_video(
    ...     prompt="A cat walks on the grass, realistic style."
    ... )
    >>> print(f"Video saved to: {video_path}")

    Usage with custom configuration:

    >>> from configs.hunyuan_config import HunyuanConfig
    >>> config = HunyuanConfig(device_map="cuda", output_dir="custom_outputs")
    >>> generator = HunyuanVideoInference(config)
    >>> video_path = generator.generate_video(
    ...     prompt="A dog playing in the snow",
    ...     num_frames=30,
    ...     fps=30
    ... )

Note:
    This script requires the Hunyuan model to be available either locally or
    downloadable from the Hugging Face Hub. The model requires significant
    GPU memory, especially when not using 8-bit quantization.
"""

import os
from typing import Optional
from datetime import datetime

import torch
from loguru import logger
from diffusers import (
    BitsAndBytesConfig as DiffusersBitsAndBytesConfig,
    HunyuanVideoTransformer3DModel,
    HunyuanVideoPipeline,
)
from diffusers.utils import export_to_video

from configs.hunyuan_config import HunyuanConfig


class HunyuanVideoInference:
"""Hunyuan Video Generation inference class.
This class provides an interface for generating videos using the Hunyuan model.
It handles model initialization, pipeline setup, and video generation with
configurable parameters.
The class uses the HunyuanConfig for configuration management, which can be
provided during initialization or created with default values.
Attributes:
config (HunyuanConfig): Configuration instance for the inference pipeline.
pipeline (HunyuanVideoPipeline): The loaded Hunyuan video generation pipeline.
Example:
>>> generator = HunyuanVideoInference()
>>> video_path = generator.generate_video(
... prompt="A cat walks on the grass",
... num_frames=30,
... fps=30
... )
>>> print(f"Generated video: {video_path}")
"""

def __init__(self, config: Optional[HunyuanConfig] = None):
"""Initialize the Hunyuan Video inference pipeline.
Args:
config (Optional[HunyuanConfig]): Configuration for the inference pipeline.
If None, default configuration will be used.
"""
self.config = config or HunyuanConfig()
self.setup_pipeline()

    def setup_pipeline(self):
        """Set up the Hunyuan Video pipeline with the specified configuration.

        This method initializes the model and creates the pipeline with the
        configuration specified during class initialization. It handles:

        1. Setting up the quantization configuration
        2. Loading the transformer model
        3. Creating the pipeline with the loaded model

        The method sets the pipeline attribute of the class, which is then
        used for video generation.

        Note:
            This method is called automatically during initialization and
            typically does not need to be called directly.
        """
        # Only build a quantization config when 8-bit loading is requested.
        quant_config = (
            DiffusersBitsAndBytesConfig(load_in_8bit=True)
            if self.config.load_in_8bit
            else None
        )

        # Resolve the dtype string from the config (e.g. "float16" -> torch.float16).
        torch_dtype = getattr(torch, self.config.torch_dtype)

        transformer = HunyuanVideoTransformer3DModel.from_pretrained(
            self.config.model_id,
            subfolder="transformer",
            quantization_config=quant_config,
            torch_dtype=torch_dtype,
        )

        self.pipeline = HunyuanVideoPipeline.from_pretrained(
            self.config.model_id,
            transformer=transformer,
            torch_dtype=torch_dtype,
            device_map=self.config.device_map,
        )

def generate_video(
self,
prompt: str,
output_path: Optional[str] = None,
num_frames: int = 61,
num_inference_steps: int = 30,
fps: Optional[int] = None,
) -> str:
"""Generate a video based on the given prompt.
This method handles the entire video generation process, including:
1. Creating the output directory if needed
2. Generating the video frames using the Hunyuan pipeline
3. Exporting the frames to a video file
Args:
prompt (str): Text prompt describing the video to generate.
output_path (Optional[str]): Path to save the video. If None,
a timestamped filename in the configured output directory
will be used.
num_frames (int): Number of frames to generate. More frames
result in longer videos but increase generation time.
Default: 61
num_inference_steps (int): Number of denoising steps. Higher
values may improve quality but increase generation time.
Default: 30
fps (Optional[int]): Frames per second for the output video.
If None, uses the configured default_fps.
Default: None
Returns:
str: Path to the generated video file.
Example:
>>> generator = HunyuanVideoInference()
>>> path = generator.generate_video(
... prompt="A cat walks on the grass",
... num_frames=30,
... fps=30
... )
>>> print(f"Video saved to: {path}")
"""
# Create output directory if it doesn't exist
if output_path is None:
os.makedirs(self.config.output_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = os.path.join(
self.config.output_dir,
f"video_{timestamp}.mp4"
)

# Generate video
video = self.pipeline(
prompt=prompt,
num_frames=num_frames,
num_inference_steps=num_inference_steps
).frames[0]

# Export video
export_to_video(
video,
output_path,
fps=fps or self.config.default_fps
)

logger.info(f"Video generated and saved to: {output_path}")
return output_path

def main():
"""Example usage of the HunyuanVideoInference class.
This function demonstrates how to use the HunyuanVideoInference class
with default settings to generate a simple video.
"""
config = HunyuanConfig()
generator = HunyuanVideoInference(config)

prompt = "A cat walks on the grass, realistic style."
video_path = generator.generate_video(prompt)
print(f"Generated video saved at: {video_path}")


if __name__ == "__main__":
main()
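For reference, a minimal end-to-end sketch of the class-based API added here (the prompt and output directory are illustrative; clip length is roughly num_frames / fps, so the default 61 frames at 15 fps yield about four seconds of video):

from configs.hunyuan_config import HunyuanConfig
from scripts.hunyuan_video_inference import HunyuanVideoInference

# Illustrative settings: write clips to demo_outputs/ at the default 15 fps.
config = HunyuanConfig(output_dir="demo_outputs")
generator = HunyuanVideoInference(config)

# 61 frames at 15 fps is roughly a 4-second clip.
video_path = generator.generate_video(
    prompt="A red panda climbing a tree, cinematic lighting",
    num_frames=61,
    num_inference_steps=30,
)
print(f"Saved to: {video_path}")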
