From 58cfd71b5e734edf9508f6da40c6c7f866246cf3 Mon Sep 17 00:00:00 2001 From: Zhang Peiyuan Date: Sun, 15 Dec 2024 17:03:29 -0800 Subject: [PATCH] Cleanup Co-authored-by: rlsu9 --- README.md | 194 ++++------------- demo/gradio_web_demo.py | 21 +- docs/data_preprocess.md | 68 ++++++ docs/distill_hunyuan.md | 5 - docs/distillation.md | 24 +++ docs/finetuning.md | 34 +++ fastvideo/models/hunyuan/inference.py | 11 +- fastvideo/models/mochi_hf/pipeline_mochi.py | 1 + fastvideo/sample/sample_t2v_hunyuan.py | 14 +- fastvideo/sample/sample_t2v_hunyuan_no_sp.py | 208 ------------------- fastvideo/train.py | 2 +- scripts/distill/distill_hunyuan.sh | 29 ++- scripts/distill/distill_mochi.sh | 2 +- scripts/finetune/finetune_mochi.sh | 2 +- scripts/finetune/finetune_mochi_lora.sh | 37 ++++ scripts/inference/inference_hunyuan.sh | 19 ++ scripts/inference/inference_hunyuan_no_sp.sh | 14 -- scripts/inference/inference_hunyuan_sp.sh | 17 -- scripts/inference/inference_mochi_no_sp.sh | 2 +- scripts/inference/inference_mochi_sp.sh | 11 +- scripts/preprocess/preprocess_mochi_data.sh | 2 +- 21 files changed, 279 insertions(+), 438 deletions(-) create mode 100644 docs/data_preprocess.md delete mode 100644 docs/distill_hunyuan.md create mode 100644 docs/distillation.md create mode 100644 docs/finetuning.md delete mode 100644 fastvideo/sample/sample_t2v_hunyuan_no_sp.py create mode 100644 scripts/finetune/finetune_mochi_lora.sh create mode 100644 scripts/inference/inference_hunyuan.sh delete mode 100644 scripts/inference/inference_hunyuan_no_sp.sh delete mode 100644 scripts/inference/inference_hunyuan_sp.sh diff --git a/README.md b/README.md index d92b638..563961b 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,9 @@ -# FastVideo -
- +
-FastVideo is a scalable framework for post-training video diffusion models, addressing the growing challenges of fine-tuning, distillation, and inference as model sizes and sequence lengths increase. As a first step, it provides an efficient script for distilling and fine-tuning the 10B Mochi model, with plans to expand features and support for more models.
-
-### Features
-
-- FastMochi, a distilled Mochi model that can generate videos with merely 8 sampling steps.
-- Finetuning with FSDP (both master weight and ema weight), sequence parallelism, and selective gradient checkpointing.
-- LoRA coupled with pecomputed the latents and text embedding for minumum memory consumption.
-- Finetuning with both image and videos.
-
-## Change Log
-
-
-- ```2024/12/17```: `FastVideo` v0.1 is released.
-
-
-## Fast and High-Quality Text-to-video Generation
-
+FastVideo is an open framework for distilling, training, and inferencing large video diffusion models.
+
@@ -33,163 +16,66 @@ FastVideo is a scalable framework for post-training video diffusion models, addr
+
-## Table of Contents
-
-Jump to a specific section:
-
-- [πŸ”§ Installation](#-installation)
-- [πŸš€ Inference](#-inference)
-- [🧱 Data Preprocess](#-data-preprocess)
-- [🎯 Distill](#-distill)
-- [⚑ Finetune](#-finetune)
-
-
-## πŸ”§ Installation
-
-- Python >= 3.10.0
-- Cuda >= 12.1
-
-```
-git clone https://github.com/hao-ai-lab/FastVideo.git
-cd FastVideo
-
-./env_setup.sh fastvideo
-# or you can install the working environment step by step following env_setup.sh
-```
-
-
+### What is this?

-## πŸš€ Inference
-
-Use [scripts/huggingface/download_hf.py](scripts/huggingface/download_hf.py) to download the hugging-face style model to a local directory. Use it like this:
-```bash
-python scripts/huggingface/download_hf.py --repo_id=FastVideo/FastMochi --local_dir=data/FastMochi --repo_type=model
-```
+As state-of-the-art video diffusion models grow in size and sequence length, they become prohibitively expensive to use. For instance, sampling a 5-second 720P video with Hunyuan takes 13 minutes on 4 A100 GPUs. FastVideo aims to make large video diffusion models fast to infer and efficient to train, and thus more **accessible**.

+We introduce FastMochi and FastHunyuan, distilled versions of the Mochi and Hunyuan video diffusion models. FastMochi achieves high-quality sampling with just 8 inference steps. FastHunyuan maintains sampling quality with only 4 inference steps.

-### πŸ”› Quick Start with Gradio UI
-```
-python demo/gradio_web_demo.py --model_path data/FastMochi
-```
-### πŸ”› CLI Inference with Sequence Parallelism
+### What can I do with FastVideo?
+Beyond the distilled weights, FastVideo provides a pipeline for training, distilling, and inferencing video diffusion models. Key capabilities include:

-We also provide CLI inference script featured with sequence parallelism in [scripts/inference](scripts/inference).
-
-```
-# bash scripts/inference/inference_mochi_sp.sh
-
-num_gpus=4
-
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/sample/sample_t2v_mochi.py \
-    --model_path data/FastMochi \
-    --prompt_path "assets/prompt.txt" \
-    --num_frames 93 \
-    --height 480 \
-    --width 848 \
-    --num_inference_steps 8 \
-    --guidance_scale 4.5 \
-    --output_path outputs_video/mochi_sp/ \
-    --shift 8 \
-    --seed 12345 \
-    --scheduler_type "pcm_linear_quadratic"
+- **Scalable**: FastVideo supports FSDP, sequence parallelism, and selective gradient checkpointing. Our code seamlessly scales to 64 GPUs in our tests.
+- **Memory Efficient**: FastVideo supports LoRA finetuning coupled with precomputed latents and text embeddings for minimal memory usage.
+- **Variable Sequence Length**: You can finetune with both images and videos.

-```
-
-## 🧱 Data Preprocess
-
-To reduce the memory cost and time consumption caused by VAE and T5 during distillation and finetuning, we offload the VAE and T5 preprocess media part to the Data Preprocess section.
-For data preprocessing, we need to prepare a source folder for the media we wish to use and a JSON file for the source information of these media.
-
-### Sample for Data Preprocess
-
-We provide a small sample dataset for you to start with, download the source media with command:
-```
-python scripts/huggingface/download_hf.py --repo_id=FastVideo/Image-Vid-Finetune-Src --local_dir=data/Image-Vid-Finetune-Src --repo_type=dataset
-```
-To preprocess dataset for finetune/distill run:
-
-```
-bash scripts/preprocess/preprocess_mochi_data.sh # for mochi
-bash scripts/preprocess/preprocess_hunyuan_data.sh # for hunyuan
-```
-
-The preprocessed dataset will be stored in `Image-Vid-Finetune-Mochi` or `Image-Vid-Finetune-HunYuan` correspondingly.
-
-### Create Custom Dataset
+## Change Log

-If you wish to create your own dataset for finetuning or distillation, please pay attention to the following format:
+- ```2024/12/16```: `FastVideo` v0.1 is released.

-Use a txt file to contain the source folder for media and the JSON file for meta information:
-```
-path_to_media_source_foder,path_to_json_file
-```
-The content of the JSON file is a list with each item corresponding to a media source.
+## πŸ”§ Installation
+The code is tested on Python 3.10.0 and CUDA 12.1.

-For image media, the JSON item needs to follow this format:
 ```
-{
-    "path": "0.jpg",
-    "cap": ["captions"]
-}
-```
-For video media, the JSON item needs to follow this format:
-```
-{
-    "path": "1.mp4",
-    "resolution": {
-      "width": 848,
-      "height": 480
-    },
-    "fps": 30.0,
-    "duration": 6.033333333333333,
-    "cap": [
-      "caption"
-    ]
-  }
-```
-Adjust the `DATA_MERGE_PATH` and `OUTPUT_DIR` in `scripts/preprocess/preprocess_****_data.sh` accordingly and run:
-```
-bash scripts/preprocess/preprocess_****_data.sh
+./env_setup.sh fastvideo
+conda activate fastvideo
 ```
-The preprocessed data will be put into the `OUTPUT_DIR` and the `videos2caption.json` can be used in finetune and distill scripts.
-
-## 🎯 Distill
-We provide a dataset example here. First download testing data. Use [scripts/huggingface/download_hf.py](scripts/huggingface/download_hf.py) to download the data to a local directory. Use it like this:
+## πŸš€ Inference
+We recommend using a GPU with 80GB of memory. To run inference, use the following commands:
+### FastHunyuan
 ```bash
-python scripts/huggingface/download_hf.py --repo_id=FastVideo/Mochi-425-Data --local_dir=data/Mochi-425-Data --repo_type=dataset
-python scripts/huggingface/download_hf.py --repo_id=FastVideo/validation_embeddings --local_dir=data/validation_embeddings --repo_type=dataset
+# Download the model weights
+python scripts/huggingface/download_hf.py --repo_id=FastVideo/FastHunyuan --local_dir=data/FastHunyuan --repo_type=model
+# Change the GPU count inside the script
+sh scripts/inference/inference_hunyuan.sh
 ```
-Then the distillation can be launched by:
+### FastMochi
+You can run FastMochi either from the CLI or through the Gradio web demo:
+```bash
+# Download the model weights
+python scripts/huggingface/download_hf.py --repo_id=FastVideo/FastMochi-diffusers --local_dir=data/FastMochi-diffusers --repo_type=model
+# CLI inference
+bash scripts/inference/inference_mochi_sp.sh
+# Gradio web demo
+python demo/gradio_web_demo.py --model_path data/FastMochi-diffusers --guidance_scale 1.5 --num_frames 163
 ```
-bash scripts/distill/distill_mochi.sh # for mochi
-bash scripts/distill/distill_hunyuan.sh # for hunyuan
-```
-
-
-## ⚑ Finetune
-
-### πŸ’°Hardware requirement
+## Distillation
+Please refer to the [distillation guide](docs/distillation.md).

-- 72G VRAM is required for finetuning 10B mochi model.
+## Finetuning
+Please refer to the [finetuning guide](docs/finetuning.md).
-To launch finetuning, you will need to prepare data in the according to formats described in section [Data Preprocess](#-data-preprocess). +## Development Plan -If you are doing image-video mixture finetuning, make sure `--group_frame` is in your script. - -Then run the finetune with: -``` -bash scripts/finetune/finetune_mochi.sh # for mochi -bash scripts/finetune/finetune_hunyuan.sh # for hunyuan -``` ## Acknowledgement -We learned from and reused code from the following projects: [PCM](https://github.com/G-U-N/Phased-Consistency-Model), [diffusers](https://github.com/huggingface/diffusers), and [OpenSoraPlan](https://github.com/PKU-YuanGroup/Open-Sora-Plan). +We learned and reused code from the following projects: [PCM](https://github.com/G-U-N/Phased-Consistency-Model), [diffusers](https://github.com/huggingface/diffusers), and [OpenSoraPlan](https://github.com/PKU-YuanGroup/Open-Sora-Plan). diff --git a/demo/gradio_web_demo.py b/demo/gradio_web_demo.py index 32d1db9..fc91133 100644 --- a/demo/gradio_web_demo.py +++ b/demo/gradio_web_demo.py @@ -19,14 +19,14 @@ def init_args(): parser.add_argument("--num_inference_steps", type=int, default=8) parser.add_argument("--guidance_scale", type=float, default=4.5) parser.add_argument("--model_path", type=str, default="data/mochi") - parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--seed", type=int, default=12345) parser.add_argument("--transformer_path", type=str, default=None) - parser.add_argument("--scheduler_type", type=str, default="euler") + parser.add_argument("--scheduler_type", type=str, default="pcm_linear_quadratic") parser.add_argument("--lora_checkpoint_dir", type=str, default=None) parser.add_argument("--shift", type=float, default=8.0) - parser.add_argument("--num_euler_timesteps", type=int, default=100) - parser.add_argument("--linear_threshold", type=float, default=0.025) - parser.add_argument("--linear_range", type=float, default=0.5) + parser.add_argument("--num_euler_timesteps", type=int, default=50) + parser.add_argument("--linear_threshold", type=float, default=0.1) + parser.add_argument("--linear_range", type=float, default=0.75) parser.add_argument("--cpu_offload", action="store_true") return parser.parse_args() @@ -36,11 +36,12 @@ def load_model(args): if args.scheduler_type == "euler": scheduler = FlowMatchEulerDiscreteScheduler() else: + linear_quadratic = True if "linear_quadratic" in args.scheduler_type else False scheduler = PCMFMScheduler( 1000, args.shift, args.num_euler_timesteps, - False, + linear_quadratic, args.linear_threshold, args.linear_range, ) @@ -58,10 +59,9 @@ def load_model(args): pipe.enable_vae_tiling() #pipe.to(device) #if args.cpu_offload: - pipe.enable_model_cpu_offload() + pipe.enable_sequential_cpu_offload() return pipe - def generate_video( prompt, negative_prompt, @@ -77,8 +77,6 @@ def generate_video( if randomize_seed: seed = torch.randint(0, 1000000, (1,)).item() - pipe = load_model(args) - print("load model successfully") generator = torch.Generator(device="cuda").manual_seed(seed) if not use_negative_prompt: @@ -108,7 +106,8 @@ def generate_video( ] args = init_args() - +pipe = load_model(args) +print("load model successfully") with gr.Blocks() as demo: gr.Markdown("# Fastvideo Mochi Video Generation Demo") diff --git a/docs/data_preprocess.md b/docs/data_preprocess.md new file mode 100644 index 0000000..710f7de --- /dev/null +++ b/docs/data_preprocess.md @@ -0,0 +1,68 @@ + + + +## 🧱 Data Preprocess + +To save GPU memory, we precompute text embeddings and VAE 
latents to eliminate the need to load the text encoder and VAE during training.
+
+
+We provide a sample dataset to help you get started. Download the source media using the following command:
+```bash
+python scripts/huggingface/download_hf.py --repo_id=FastVideo/Image-Vid-Finetune-Src --local_dir=data/Image-Vid-Finetune-Src --repo_type=dataset
+```
+To preprocess the dataset for fine-tuning or distillation, run:
+```
+bash scripts/preprocess/preprocess_mochi_data.sh # for mochi
+bash scripts/preprocess/preprocess_hunyuan_data.sh # for hunyuan
+```
+
+The preprocessed dataset will be stored in `Image-Vid-Finetune-Mochi` or `Image-Vid-Finetune-HunYuan` correspondingly.
+
+### Process your own dataset
+
+If you wish to create your own dataset for finetuning or distillation, please structure your video dataset in the following format:
+
+```
+path_to_dataset_folder/
+β”œβ”€β”€ media/
+β”‚   β”œβ”€β”€ 0.jpg
+β”‚   β”œβ”€β”€ 1.mp4
+β”‚   β”œβ”€β”€ 2.jpg
+β”œβ”€β”€ video2caption.json
+└── merge.txt
+```
+
+Format the JSON file as a list, where each item represents a media source:
+
+For image media,
+```
+{
+    "path": "0.jpg",
+    "cap": ["captions"]
+}
+```
+For video media,
+```
+{
+    "path": "1.mp4",
+    "resolution": {
+      "width": 848,
+      "height": 480
+    },
+    "fps": 30.0,
+    "duration": 6.033333333333333,
+    "cap": [
+      "caption"
+    ]
+}
+```
+
+Use a txt file (merge.txt) to record the media source folder and its JSON metadata file:
+
+```
+path_to_media_source_folder,path_to_json_file
+```
+
+Adjust the `DATA_MERGE_PATH` and `OUTPUT_DIR` in `scripts/preprocess/preprocess_****_data.sh` accordingly and run:
+```
+bash scripts/preprocess/preprocess_****_data.sh
+```
+The preprocessed data will be put into the `OUTPUT_DIR` and the `videos2caption.json` can be used in the finetune and distill scripts.
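+
+For reference, below is a minimal Python sketch (not part of the FastVideo codebase) of how such a `video2caption.json` and `merge.txt` could be generated for your own data. It assumes captions are supplied as a simple filename-to-caption mapping and uses OpenCV only to probe video metadata; adapt it to however your captions are stored.
+
+```python
+# Illustrative helper: build video2caption.json and merge.txt for the layout above.
+import json
+from pathlib import Path
+
+import cv2  # pip install opencv-python; used only to read width/height/fps/duration
+
+VIDEO_EXTS = {".mp4", ".mov", ".avi", ".mkv"}
+
+
+def build_metadata(dataset_dir: str, captions: dict) -> None:
+    root = Path(dataset_dir)
+    media_dir = root / "media"
+    items = []
+    for f in sorted(media_dir.iterdir()):
+        entry = {"path": f.name, "cap": [captions.get(f.name, "")]}
+        if f.suffix.lower() in VIDEO_EXTS:
+            cap = cv2.VideoCapture(str(f))
+            fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
+            entry["resolution"] = {
+                "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
+                "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+            }
+            entry["fps"] = fps
+            entry["duration"] = cap.get(cv2.CAP_PROP_FRAME_COUNT) / fps
+            cap.release()
+        items.append(entry)
+    json_path = root / "video2caption.json"
+    json_path.write_text(json.dumps(items, indent=2))
+    # merge.txt holds "<media folder>,<json file>" and is what DATA_MERGE_PATH points to.
+    (root / "merge.txt").write_text(f"{media_dir},{json_path}\n")
+
+
+if __name__ == "__main__":
+    # Hypothetical captions; replace with your own caption source.
+    build_metadata("path_to_dataset_folder", {"0.jpg": "an example image", "1.mp4": "an example clip"})
+```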
diff --git a/docs/distill_hunyuan.md b/docs/distill_hunyuan.md
deleted file mode 100644
index 3376063..0000000
--- a/docs/distill_hunyuan.md
+++ /dev/null
@@ -1,5 +0,0 @@
-## How to Distill Hunyuan
-
-python scripts/huggingface/download_hf --repo_id FastVideo/hunyuan --local_dir data/hunyuan --repo_type model
-python scripts/huggingface/download_hf --repo_id FastVideo/Hunyuan-Distill-Data --local_dir data/Hunyuan-Distill-Data --repo_type=dataset
-
diff --git a/docs/distillation.md b/docs/distillation.md
new file mode 100644
index 0000000..e106e03
--- /dev/null
+++ b/docs/distillation.md
@@ -0,0 +1,24 @@
+## 🎯 Distill
+
+
+Our distillation recipe is based on [Phased Consistency Model](https://github.com/G-U-N/Phased-Consistency-Model). We did not find significant improvement using multi-phase distillation, so we keep the one-phase setup similar to the original latent consistency model's recipe.
+
+We use the [MixKit](https://huggingface.co/datasets/LanguageBind/Open-Sora-Plan-v1.1.0/tree/main/all_mixkit) dataset for distillation. To avoid running the text encoder and VAE during training, we preprocess all data to generate text embeddings and VAE latents.
+
+Preprocessing instructions can be found in [data_preprocess.md](data_preprocess.md). For convenience, we also provide preprocessed data that can be downloaded directly using the following command:
+
+```bash
+python scripts/huggingface/download_hf.py --repo_id=FastVideo/HD-Mixkit-Finetune-Hunyuan --local_dir=data/HD-Mixkit-Finetune-Hunyuan --repo_type=dataset
+```
+Next, download the original model weights with:
+
+```bash
+python scripts/huggingface/download_hf.py --repo_id=FastVideo/hunyuan --local_dir=data/hunyuan --repo_type=model
+```
+To launch the distillation process, use the following commands:
+
+```
+bash scripts/distill/distill_mochi.sh # for mochi
+bash scripts/distill/distill_hunyuan.sh # for hunyuan
+```
+We also provide an optional script for distillation with adversarial loss, located at `fastvideo/distill_adv.py`. Although we tried adversarial loss, we did not observe significant improvements.
diff --git a/docs/finetuning.md b/docs/finetuning.md
new file mode 100644
index 0000000..82252de
--- /dev/null
+++ b/docs/finetuning.md
@@ -0,0 +1,34 @@
+
+## ⚑ Finetune
+
+We support full fine-tuning for both the Mochi and Hunyuan models. Additionally, we provide image-video mixture finetuning.
+
+
+Ensure your data is prepared and preprocessed in the format specified in [data_preprocess.md](data_preprocess.md).
+Download the original model weights with:
+```bash
+python scripts/huggingface/download_hf.py --repo_id=genmo/mochi-1-preview --local_dir=data/mochi --repo_type=model
+python scripts/huggingface/download_hf.py --repo_id=FastVideo/hunyuan --local_dir=data/hunyuan --repo_type=model
+```
+
+Then run finetuning with:
+```
+bash scripts/finetune/finetune_mochi.sh # for mochi
+bash scripts/finetune/finetune_hunyuan.sh # for hunyuan
+```
+For image-video mixture finetuning, make sure to enable the `--group_frame` option in your script.
+
+
+### LoRA Finetune
+
+Currently, we only provide LoRA finetuning for the Mochi model. The command for LoRA finetuning is:
+```
+bash scripts/finetune/finetune_mochi_lora.sh
+```
+
+### πŸ’°Hardware requirement
+
+- 72GB VRAM is required for finetuning the 10B Mochi model.
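+
+After LoRA finetuning, the adapter saved under `--output_dir` can be loaded on top of the base weights for sampling. The snippet below is a rough sketch only: it assumes a recent `diffusers` release that ships `MochiPipeline` with LoRA loading, and a LoRA checkpoint saved in a diffusers-compatible format. See also the `--lora_checkpoint_dir` option in `demo/gradio_web_demo.py`.
+
+```python
+# Rough sketch, not a tested recipe. Assumptions: diffusers provides MochiPipeline
+# with load_lora_weights, and the checkpoint directory is diffusers-compatible.
+import torch
+from diffusers import MochiPipeline
+from diffusers.utils import export_to_video
+
+pipe = MochiPipeline.from_pretrained("data/mochi", torch_dtype=torch.bfloat16)
+pipe.load_lora_weights("data/outputs/HSH-Taylor-Finetune-Lora")  # --output_dir of finetune_mochi_lora.sh
+pipe.enable_vae_tiling()
+pipe.enable_model_cpu_offload()  # trade speed for lower peak GPU memory
+
+frames = pipe(
+    "A close-up shot of a steaming cup of coffee on a wooden table.",
+    num_frames=91,             # matches --num_frames in the LoRA script
+    num_inference_steps=64,
+).frames[0]
+export_to_video(frames, "lora_sample.mp4", fps=30)
+```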
+ diff --git a/fastvideo/models/hunyuan/inference.py b/fastvideo/models/hunyuan/inference.py index 780cabd..ba7f697 100644 --- a/fastvideo/models/hunyuan/inference.py +++ b/fastvideo/models/hunyuan/inference.py @@ -20,7 +20,7 @@ from fastvideo.models.hunyuan.utils.data_utils import align_to from fastvideo.models.hunyuan.diffusion.schedulers import FlowMatchDiscreteScheduler from fastvideo.models.hunyuan.diffusion.pipelines import HunyuanVideoPipeline - +from safetensors.torch import load_file as safetensors_load_file from fastvideo.utils.parallel_states import ( initialize_sequence_parallel_state, nccl_info, @@ -238,7 +238,14 @@ def load_state_dict(args, model, pretrained_model_path): if not model_path.exists(): raise ValueError(f"model_path not exists: {model_path}") logger.info(f"Loading torch model {model_path}...") - state_dict = torch.load(model_path, map_location=lambda storage, loc: storage) + if model_path.suffix == ".safetensors": + # Use safetensors library for .safetensors files + state_dict = safetensors_load_file(model_path) + elif model_path.suffix == ".pt": + # Use torch for .pt files + state_dict = torch.load(model_path, map_location=lambda storage, loc: storage) + else: + raise ValueError(f"Unsupported file format: {model_path}") if bare_model == "unknown" and ("ema" in state_dict or "module" in state_dict): bare_model = False diff --git a/fastvideo/models/mochi_hf/pipeline_mochi.py b/fastvideo/models/mochi_hf/pipeline_mochi.py index 01c84e8..294cfcb 100644 --- a/fastvideo/models/mochi_hf/pipeline_mochi.py +++ b/fastvideo/models/mochi_hf/pipeline_mochi.py @@ -729,6 +729,7 @@ def __call__( self._num_timesteps = len(timesteps) # 6. Denoising loop + self._progress_bar_config = {"disable": nccl_info.rank_within_group != 0} with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): if self.interrupt: diff --git a/fastvideo/sample/sample_t2v_hunyuan.py b/fastvideo/sample/sample_t2v_hunyuan.py index fe3375b..2afd7d9 100644 --- a/fastvideo/sample/sample_t2v_hunyuan.py +++ b/fastvideo/sample/sample_t2v_hunyuan.py @@ -58,7 +58,11 @@ def main(args): # Start sampling samples = [] - for prompt in args.prompts: + + with open(args.prompt) as f: + prompts = f.readlines() + + for prompt in prompts: outputs = hunyuan_video_sampler.predict( prompt=prompt, height=args.height, @@ -73,10 +77,7 @@ def main(args): batch_size=args.batch_size, embedded_guidance_scale=args.embedded_cfg_scale, ) - samples.append(outputs["samples"][0]) - - for prompt, video in zip(args.prompts, samples): - videos = rearrange(video.unsqueeze(0), "b c t h w -> t b c h w") + videos = rearrange(outputs["samples"], "b c t h w -> t b c h w") outputs = [] for x in videos: x = torchvision.utils.make_grid(x, nrow=6) @@ -86,11 +87,12 @@ def main(args): imageio.mimsave(args.output_path + f"{prompt[:100]}.mp4", outputs, fps=args.fps) + if __name__ == "__main__": parser = argparse.ArgumentParser() # Basic parameters - parser.add_argument("--prompts", nargs="+", default=[]) + parser.add_argument("--prompt", type=str, help="prompt file for inference") parser.add_argument("--num_frames", type=int, default=16) parser.add_argument("--height", type=int, default=256) parser.add_argument("--width", type=int, default=256) diff --git a/fastvideo/sample/sample_t2v_hunyuan_no_sp.py b/fastvideo/sample/sample_t2v_hunyuan_no_sp.py deleted file mode 100644 index c3b6bd1..0000000 --- a/fastvideo/sample/sample_t2v_hunyuan_no_sp.py +++ /dev/null @@ -1,208 +0,0 @@ -import os -import imageio -import time -from 
einops import rearrange - -import torch -import torchvision -import numpy as np -from pathlib import Path -from loguru import logger -from datetime import datetime -import argparse -from diffusers.utils import export_to_video - -from fastvideo.models.hunyuan.utils.file_utils import save_videos_grid -from fastvideo.models.hunyuan.inference import HunyuanVideoSampler - - -def main(args): - print(args) - models_root_path = Path(args.model_path) - if not models_root_path.exists(): - raise ValueError(f"`models_root` not exists: {models_root_path}") - - # Create save folder to save the samples - save_path = args.output_path - os.makedirs(os.path.dirname(save_path), exist_ok=True) - - # Load models - hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained( - models_root_path, args=args - ) - - # Get the updated args - args = hunyuan_video_sampler.args - - # Start sampling - samples = [] - for prompt in args.prompts: - outputs = hunyuan_video_sampler.predict( - prompt=prompt, - height=args.height, - width=args.width, - video_length=args.num_frames, - seed=args.seed, - negative_prompt=args.neg_prompt, - infer_steps=args.num_inference_steps, - guidance_scale=args.guidance_scale, - num_videos_per_prompt=args.num_videos, - flow_shift=args.flow_shift, - batch_size=args.batch_size, - embedded_guidance_scale=args.embedded_cfg_scale, - ) - samples.append(outputs["samples"][0]) - - for prompt, video in zip(args.prompts, samples): - videos = rearrange(video.unsqueeze(0), "b c t h w -> t b c h w") - outputs = [] - for x in videos: - x = torchvision.utils.make_grid(x, nrow=6) - x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) - outputs.append((x * 255).numpy().astype(np.uint8)) - os.makedirs(os.path.dirname(args.output_path), exist_ok=True) - imageio.mimsave(args.output_path + f"{prompt[:100]}.mp4", outputs, fps=args.fps) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Basic parameters - parser.add_argument("--prompts", nargs="+", default=[]) - parser.add_argument("--num_frames", type=int, default=16) - parser.add_argument("--height", type=int, default=256) - parser.add_argument("--width", type=int, default=256) - parser.add_argument("--num_inference_steps", type=int, default=50) - parser.add_argument("--model_path", type=str, default="data/hunyuan") - parser.add_argument("--output_path", type=str, default="./outputs/video") - parser.add_argument("--fps", type=int, default=24) - - # Additional parameters - parser.add_argument( - "--denoise-type", - type=str, - default="flow", - help="Denoise type for noised inputs.", - ) - parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.") - parser.add_argument( - "--neg_prompt", type=str, default=None, help="Negative prompt for sampling." - ) - parser.add_argument( - "--guidance_scale", - type=float, - default=1.0, - help="Classifier free guidance scale.", - ) - parser.add_argument( - "--embedded_cfg_scale", - type=float, - default=6.0, - help="Embedded classifier free guidance scale.", - ) - parser.add_argument( - "--flow_shift", type=int, default=7, help="Flow shift parameter." - ) - parser.add_argument( - "--batch_size", type=int, default=1, help="Batch size for inference." - ) - parser.add_argument( - "--num_videos", - type=int, - default=1, - help="Number of videos to generate per prompt.", - ) - parser.add_argument( - "--load-key", - type=str, - default="module", - help="Key to load the model states. 
'module' for the main model, 'ema' for the EMA model.", - ) - parser.add_argument( - "--use-cpu-offload", - action="store_true", - help="Use CPU offload for the model load.", - ) - parser.add_argument( - "--dit-weight", - type=str, - default="data/hunyuan/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt", - ) - parser.add_argument( - "--reproduce", - action="store_true", - help="Enable reproducibility by setting random seeds and deterministic algorithms.", - ) - parser.add_argument( - "--disable-autocast", - action="store_true", - help="Disable autocast for denoising loop and vae decoding in pipeline sampling.", - ) - - # Flow Matching - parser.add_argument( - "--flow-reverse", - action="store_true", - help="If reverse, learning/sampling from t=1 -> t=0.", - ) - parser.add_argument( - "--flow-solver", type=str, default="euler", help="Solver for flow matching." - ) - parser.add_argument( - "--use-linear-quadratic-schedule", - action="store_true", - help="Use linear quadratic schedule for flow matching. Following MovieGen (https://ai.meta.com/static-resource/movie-gen-research-paper)", - ) - parser.add_argument( - "--linear-schedule-end", - type=int, - default=25, - help="End step for linear quadratic schedule for flow matching.", - ) - - # Model parameters - parser.add_argument("--model", type=str, default="HYVideo-T/2-cfgdistill") - parser.add_argument("--latent-channels", type=int, default=16) - parser.add_argument( - "--precision", type=str, default="bf16", choices=["fp32", "fp16", "bf16"] - ) - parser.add_argument( - "--rope-theta", type=int, default=256, help="Theta used in RoPE." - ) - - parser.add_argument("--vae", type=str, default="884-16c-hy") - parser.add_argument( - "--vae-precision", type=str, default="fp16", choices=["fp32", "fp16", "bf16"] - ) - parser.add_argument("--vae-tiling", action="store_true", default=True) - - parser.add_argument("--text-encoder", type=str, default="llm") - parser.add_argument( - "--text-encoder-precision", - type=str, - default="fp16", - choices=["fp32", "fp16", "bf16"], - ) - parser.add_argument("--text-states-dim", type=int, default=4096) - parser.add_argument("--text-len", type=int, default=256) - parser.add_argument("--tokenizer", type=str, default="llm") - parser.add_argument("--prompt-template", type=str, default="dit-llm-encode") - parser.add_argument( - "--prompt-template-video", type=str, default="dit-llm-encode-video" - ) - parser.add_argument("--hidden-state-skip-layer", type=int, default=2) - parser.add_argument("--apply-final-norm", action="store_true") - - parser.add_argument("--text-encoder-2", type=str, default="clipL") - parser.add_argument( - "--text-encoder-precision-2", - type=str, - default="fp16", - choices=["fp32", "fp16", "bf16"], - ) - parser.add_argument("--text-states-dim-2", type=int, default=768) - parser.add_argument("--tokenizer-2", type=str, default="clipL") - parser.add_argument("--text-len-2", type=int, default=77) - - args = parser.parse_args() - main(args) diff --git a/fastvideo/train.py b/fastvideo/train.py index 637d7f4..d815404 100644 --- a/fastvideo/train.py +++ b/fastvideo/train.py @@ -278,7 +278,7 @@ def main(args): transformer.config.lora_rank = args.lora_rank transformer.config.lora_alpha = args.lora_alpha transformer.config.lora_target_modules = ["to_k", "to_q", "to_v", "to_out.0"] - transformer._no_split_modules = no_split_modules + transformer._no_split_modules = [no_split_module.__name__ for no_split_module in no_split_modules] fsdp_kwargs["auto_wrap_policy"] = 
fsdp_kwargs["auto_wrap_policy"](transformer) transformer = FSDP( diff --git a/scripts/distill/distill_hunyuan.sh b/scripts/distill/distill_hunyuan.sh index 0478be7..e09ad72 100644 --- a/scripts/distill/distill_hunyuan.sh +++ b/scripts/distill/distill_hunyuan.sh @@ -1,23 +1,30 @@ export WANDB_BASE_URL="https://api.wandb.ai" export WANDB_MODE=online -torchrun --nnodes 1 --nproc_per_node 4\ - fastvideo/distill_adv.py\ +DATA_DIR=./data +IP=10.4.139.86 + +torchrun --nnodes 4 --nproc_per_node 8\ + --node_rank=0 \ + --rdzv_id=456 \ + --rdzv_backend=c10d \ + --rdzv_endpoint=$IP:29500 \ + fastvideo/distill.py\ --seed 42\ - --pretrained_model_name_or_path data/FastHunyuan\ - --dit_model_name_or_path data/hunyuan/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt\ + --pretrained_model_name_or_path $DATA_DIR/hunyuan\ + --dit_model_name_or_path $DATA_DIR/hunyuan/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt\ --model_type "hunyuan" \ - --cache_dir "data/.cache"\ - --data_json_path "data/Hunyuan-Distill-Data/videos2caption.json"\ - --validation_prompt_dir "data/Hunyuan-Distill-Data/validation"\ + --cache_dir "$DATA_DIR/.cache"\ + --data_json_path "$DATA_DIR/Hunyuan-30K-Distill-Data/videos2caption.json"\ + --validation_prompt_dir "$DATA_DIR/Hunyuan-Distill-Data/validation"\ --gradient_checkpointing\ --train_batch_size=1\ - --num_latent_t 1\ + --num_latent_t 24\ --sp_size 1\ --train_sp_batch_size 1\ --dataloader_num_workers 4\ --gradient_accumulation_steps=1\ - --max_train_steps=640\ + --max_train_steps=2000\ --learning_rate=1e-6\ --mixed_precision="bf16"\ --checkpointing_steps=64\ @@ -28,8 +35,8 @@ torchrun --nnodes 1 --nproc_per_node 4\ --ema_start_step 0\ --cfg 0.0\ --log_validation\ - --output_dir="outputs/debug"\ - --tracker_project_name PCM \ + --output_dir="$DATA_DIR/outputs/hy_phase1_shift17_bs_32"\ + --tracker_project_name Hunyuan_Distill \ --num_frames 93 \ --shift 17 \ --validation_guidance_scale "1.0" \ diff --git a/scripts/distill/distill_mochi.sh b/scripts/distill/distill_mochi.sh index 1721941..4edcb66 100644 --- a/scripts/distill/distill_mochi.sh +++ b/scripts/distill/distill_mochi.sh @@ -4,7 +4,7 @@ export WANDB_MODE=online torchrun --nnodes 1 --nproc_per_node 4 \ fastvideo/distill.py \ --seed 42 \ - --pretrained_model_name_or_path data/FastMochi \ + --pretrained_model_name_or_path data/FastMochi-diffusers \ --model_type "mochi" \ --cache_dir data/.cache \ --data_json_path data/Merge-30k-Data/video2caption.json \ diff --git a/scripts/finetune/finetune_mochi.sh b/scripts/finetune/finetune_mochi.sh index c1ab042..e9f73db 100644 --- a/scripts/finetune/finetune_mochi.sh +++ b/scripts/finetune/finetune_mochi.sh @@ -4,7 +4,7 @@ export WANDB_MODE=online torchrun --nnodes 1 --nproc_per_node 1 \ fastvideo/train.py \ --seed 42 \ - --pretrained_model_name_or_path data/FastMochi \ + --pretrained_model_name_or_path data/FastMochi-diffusers \ --cache_dir data/.cache \ --data_json_path data/Image-Vid-Finetune-Mochi/videos2caption.json \ --validation_prompt_dir data/Image-Vid-Finetune-Mochi/validation \ diff --git a/scripts/finetune/finetune_mochi_lora.sh b/scripts/finetune/finetune_mochi_lora.sh new file mode 100644 index 0000000..ab82b38 --- /dev/null +++ b/scripts/finetune/finetune_mochi_lora.sh @@ -0,0 +1,37 @@ +export WANDB_BASE_URL="https://api.wandb.ai" +export WANDB_MODE=online + +CUDA_VISIBLE_DEVICES=5 torchrun --nnodes 1 --nproc_per_node 1 \ + fastvideo/train.py \ + --seed 42 \ + --pretrained_model_name_or_path data/mochi \ + --cache_dir data/.cache \ + 
--data_json_path data/Image-Vid-Finetune-Mochi/videos2caption.json \ + --validation_prompt_dir data/Image-Vid-Finetune-Mochi/validation \ + --gradient_checkpointing \ + --train_batch_size=1 \ + --num_latent_t 14 \ + --sp_size 1 \ + --train_sp_batch_size 1 \ + --dataloader_num_workers 1 \ + --gradient_accumulation_steps=1 \ + --max_train_steps=2000 \ + --learning_rate=5e-6 \ + --mixed_precision=bf16 \ + --checkpointing_steps=200 \ + --validation_steps 100 \ + --validation_sampling_steps 64 \ + --checkpoints_total_limit 3 \ + --allow_tf32 \ + --ema_start_step 0 \ + --cfg 0.0 \ + --ema_decay 0.999 \ + --log_validation \ + --output_dir=data/outputs/HSH-Taylor-Finetune-Lora \ + --tracker_project_name HSH-Taylor-Finetune-Lora \ + --num_frames 91 \ + --group_frame \ + --lora_rank 128 \ + --lora_alpha 256 \ + --master_weight_type "bf16" \ + --use_lora \ No newline at end of file diff --git a/scripts/inference/inference_hunyuan.sh b/scripts/inference/inference_hunyuan.sh new file mode 100644 index 0000000..5431fcd --- /dev/null +++ b/scripts/inference/inference_hunyuan.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +num_gpus=[Your GPU Count] + +torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \ + fastvideo/sample/sample_t2v_hunyuan.py \ + --height 720 \ + --width 1280 \ + --num_frames 125 \ + --num_inference_steps 6 \ + --guidance_scale 1 \ + --embedded_cfg_scale 6 \ + --flow_shift 17 \ + --flow-reverse \ + --prompt ./assets/prompt.txt \ + --seed 12345 \ + --output_path outputs_video/hunyuan/ \ + --model_path data/FastHunyuan \ + --dit-weight data/FastHunyuan/hunyuan-video-t2v-720p/transformers/diffusion_pytorch_model.safetensors \ No newline at end of file diff --git a/scripts/inference/inference_hunyuan_no_sp.sh b/scripts/inference/inference_hunyuan_no_sp.sh deleted file mode 100644 index e785204..0000000 --- a/scripts/inference/inference_hunyuan_no_sp.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -python fastvideo/sample/sample_t2v_hunyuan_no_sp.py \ - --height 480 \ - --width 848 \ - --num_frames 93 \ - --num_inference_steps 4 \ - --guidance_scale 1 \ - --embedded_cfg_scale 6 \ - --flow_shift 17 \ - --flow-reverse \ - --prompt_path "data/prompt.txt" \ - --seed 12345 \ - --output_path outputs_video/hunyuan_no_sp/ diff --git a/scripts/inference/inference_hunyuan_sp.sh b/scripts/inference/inference_hunyuan_sp.sh deleted file mode 100644 index b016483..0000000 --- a/scripts/inference/inference_hunyuan_sp.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -num_gpus=4 - -torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \ - fastvideo/sample/sample_t2v_hunyuan.py \ - --height 480 \ - --width 848 \ - --num_frames 93 \ - --num_inference_steps 4 \ - --guidance_scale 1 \ - --embedded_cfg_scale 6 \ - --flow_shift 17 \ - --flow-reverse \ - --prompt_path "data/prompt.txt" \ - --seed 12345 \ - --output_path outputs_video/hunyuan_sp/ diff --git a/scripts/inference/inference_mochi_no_sp.sh b/scripts/inference/inference_mochi_no_sp.sh index a866b73..9036b63 100644 --- a/scripts/inference/inference_mochi_no_sp.sh +++ b/scripts/inference/inference_mochi_no_sp.sh @@ -1,7 +1,7 @@ #!/bin/bash python fastvideo/sample/sample_t2v_mochi_no_sp.py \ - --model_path data/FastMochi \ + --model_path data/FastMochi-diffusers \ --prompt_path "assets/prompt.txt" \ --num_frames 93 \ --height 480 \ diff --git a/scripts/inference/inference_mochi_sp.sh b/scripts/inference/inference_mochi_sp.sh index f23c3b6..120ef33 100644 --- a/scripts/inference/inference_mochi_sp.sh +++ 
b/scripts/inference/inference_mochi_sp.sh @@ -4,15 +4,16 @@ num_gpus=4 torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \ fastvideo/sample/sample_t2v_mochi.py \ - --model_path data/FastMochi \ + --model_path data/FastMochi-diffusers \ --prompt_path "assets/prompt.txt" \ - --num_frames 93 \ + --num_frames 163 \ --height 480 \ --width 848 \ --num_inference_steps 8 \ - --guidance_scale 4.5 \ + --guidance_scale 1.5 \ --output_path outputs_video/mochi_sp/ \ - --shift 8 \ --seed 12345 \ - --scheduler_type "pcm_linear_quadratic" + --scheduler_type "pcm_linear_quadratic" \ + --linear_threshold 0.1 \ + --linear_range 0.75 diff --git a/scripts/preprocess/preprocess_mochi_data.sh b/scripts/preprocess/preprocess_mochi_data.sh index 4631068..289f569 100644 --- a/scripts/preprocess/preprocess_mochi_data.sh +++ b/scripts/preprocess/preprocess_mochi_data.sh @@ -1,6 +1,6 @@ # export WANDB_MODE="offline" GPU_NUM=1 # 2,4,8 -MODEL_PATH="data/FastMochi" +MODEL_PATH="data/FastMochi-diffusers" MODEL_TYPE="mochi" DATA_MERGE_PATH="data/Image-Vid-Finetune-Src/merge.txt" OUTPUT_DIR="data/Image-Vid-Finetune-Mochi"