feat: data preprocessing code of hallo (fudan-generative-vision#103)

* feat: data preprocessing code of hallo * add data preprocessing * add utils functions of data preprocessing * add image processor and audio processor of data preprocessing * fix: train config and data processing param adjustment * add model weight postprocess after stage1 * make data processing param easier to understand
rivivian · Jun 27, 2024 · 53bc81a · 53bc81a
1 parent 0152cd9
commit 53bc81a
Show file tree

Hide file tree

Showing 8 changed files with 860 additions and 17 deletions.
diff --git a/configs/train/stage2.yaml b/configs/train/stage2.yaml
@@ -98,7 +98,7 @@ start_ratio: 0.05
 noise_offset: 0.05
 snr_gamma: 5.0
 enable_zero_snr: True
-stage1_ckpt_dir: "./pretrained_models/hallo/stage1"
+stage1_ckpt_dir: "./exp_output/stage1/"
 
 single_inference_times: 10
 inference_steps: 40
@@ -107,7 +107,7 @@ cfg_scale: 3.5
 seed: 42
 resume_from_checkpoint: "latest"
 checkpointing_steps: 500
-exp_name: "stage2_test"
+exp_name: "stage2"
 output_dir: "./exp_output"
 
 ref_img_path:

diff --git a/hallo/datasets/audio_processor.py b/hallo/datasets/audio_processor.py
@@ -73,7 +73,7 @@ def __init__(
         self.wav2vec_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec_model_path, local_files_only=True)
 
 
-    def preprocess(self, wav_file: str, clip_length: int):
+    def preprocess(self, wav_file: str, clip_length: int=-1):
         """
         Preprocess a WAV audio file by separating the vocals from the background and resampling it to a 16 kHz sample rate.
         The separated vocal track is then converted into wav2vec2 for further processing or analysis.
@@ -109,7 +109,8 @@ def preprocess(self, wav_file: str, clip_length: int):
         audio_length = seq_len
 
         audio_feature = torch.from_numpy(audio_feature).float().to(device=self.device)
-        if seq_len % clip_length != 0:
+
+        if clip_length>0 and seq_len % clip_length != 0:
             audio_feature = torch.nn.functional.pad(audio_feature, (0, (clip_length - seq_len % clip_length) * (self.sample_rate // self.fps)), 'constant', 0.0)
             seq_len += clip_length - seq_len % clip_length
         audio_feature = audio_feature.unsqueeze(0)

diff --git a/hallo/datasets/image_processor.py b/hallo/datasets/image_processor.py
@@ -1,3 +1,4 @@
+# pylint: disable=W0718
 """
 This module is responsible for processing images, particularly for face-related tasks.
 It uses various libraries such as OpenCV, NumPy, and InsightFace to perform tasks like
@@ -8,13 +9,15 @@
 from typing import List
 
 import cv2
+import mediapipe as mp
 import numpy as np
 import torch
 from insightface.app import FaceAnalysis
 from PIL import Image
 from torchvision import transforms
 
-from ..utils.util import get_mask
+from ..utils.util import (blur_mask, get_landmark_overframes, get_mask,
+                          get_union_face_mask, get_union_lip_mask)
 
 MEAN = 0.5
 STD = 0.5
@@ -207,3 +210,137 @@ def __enter__(self):
 
     def __exit__(self, _exc_type, _exc_val, _exc_tb):
         self.close()
+
+
+class ImageProcessorForDataProcessing():
+    """
+    Image-side helper for the data preprocessing pipeline.
+
+    Exactly one backend is created, selected by ``step``:
+
+    * ``step == 2``: an InsightFace ``FaceAnalysis`` model, used to extract a
+      face embedding from the first frame in which a face is found.
+    * any other value: a MediaPipe ``FaceLandmarker``, used to build the
+      face, lip and pose masks from the landmarks of the frames.
+
+    Attributes:
+        face_analysis: ``FaceAnalysis`` instance, or ``None`` if not created.
+        landmarker: ``FaceLandmarker`` instance, or ``None`` if not created.
+
+    The class is a context manager; ``close()`` is called on exit.
+    """
+    def __init__(self, face_analysis_model_path, landmark_model_path, step) -> None:
+        """
+        Create the backend required for the given preprocessing step.
+
+        Args:
+            face_analysis_model_path (str): Root directory passed to ``FaceAnalysis``.
+            landmark_model_path (str): Model asset path for the face landmarker.
+            step (int): ``2`` creates the face analysis backend; any other
+                value creates the landmarker backend.
+        """
+        if step == 2:
+            self.face_analysis = FaceAnalysis(
+                name="",
+                root=face_analysis_model_path,
+                providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
+            )
+            self.face_analysis.prepare(ctx_id=0, det_size=(640, 640))
+            self.landmarker = None
+        else:
+            BaseOptions = mp.tasks.BaseOptions
+            FaceLandmarker = mp.tasks.vision.FaceLandmarker
+            FaceLandmarkerOptions = mp.tasks.vision.FaceLandmarkerOptions
+            VisionRunningMode = mp.tasks.vision.RunningMode
+            # Create a face landmarker instance in single-image mode:
+            options = FaceLandmarkerOptions(
+                base_options=BaseOptions(model_asset_path=landmark_model_path),
+                running_mode=VisionRunningMode.IMAGE,
+            )
+            self.landmarker = FaceLandmarker.create_from_options(options)
+            self.face_analysis = None
+
+    def preprocess(self, source_image_path: str):
+        """
+        Extract the face embedding and/or the masks for a directory of frames.
+
+        Parameters:
+            source_image_path (str): Directory containing the frame images.
+
+        Returns:
+            tuple: ``(face_mask, face_emb, sep_pose_mask, sep_face_mask,
+            sep_lip_mask)``. ``face_emb`` is only set by the face analysis
+            backend and the four masks only by the landmarker backend;
+            entries that were not computed are ``None``.
+        """
+        face_mask, face_emb, sep_pose_mask, sep_face_mask, sep_lip_mask = None, None, None, None, None
+        # 1. face embedding, taken from the first frame that yields one
+        if self.face_analysis:
+            for frame in sorted(os.listdir(source_image_path)):
+                try:
+                    with Image.open(os.path.join(source_image_path, frame)) as source_image:
+                        ref_image_pil = source_image.convert("RGB")
+                    # 1.1 detect faces on the BGR version of the frame
+                    faces = self.face_analysis.get(cv2.cvtColor(
+                        np.array(ref_image_pil), cv2.COLOR_RGB2BGR))
+                    if not faces:
+                        continue
+                    # 1.2 use max size face
+                    face = sorted(faces, key=lambda x: (
+                        x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]))[-1]
+                    face_emb = face["embedding"]
+                    if face_emb is not None:
+                        break
+                except Exception:
+                    # best effort: a frame that cannot be processed is skipped
+                    continue
+
+        if self.landmarker:
+            # 2. get landmarks
+            landmarks, height, width = get_landmark_overframes(
+                self.landmarker, source_image_path)
+            assert len(landmarks) == len(os.listdir(source_image_path))
+
+            # 3. render face and lip mask
+            face_mask = get_union_face_mask(landmarks, height, width)
+            lip_mask = get_union_lip_mask(landmarks, height, width)
+
+            # 4. blur
+            blur_face_mask = blur_mask(face_mask, (64, 64), (51, 51))
+            blur_lip_mask = blur_mask(lip_mask, (64, 64), (31, 31))
+
+            # 5. separate masks
+            sep_face_mask = cv2.subtract(blur_face_mask, blur_lip_mask)
+            sep_pose_mask = 255.0 - blur_face_mask
+            sep_lip_mask = blur_lip_mask
+
+        return face_mask, face_emb, sep_pose_mask, sep_face_mask, sep_lip_mask
+
+    def close(self):
+        """
+        Release the resources held by whichever backend was created.
+
+        Only one of ``face_analysis`` / ``landmarker`` exists per instance, so
+        each is checked for ``None`` before being released.
+        """
+        if self.face_analysis is not None:
+            for model in self.face_analysis.models.values():
+                if hasattr(model, "Dispose"):
+                    model.Dispose()
+        if self.landmarker is not None and hasattr(self.landmarker, "close"):
+            self.landmarker.close()
+
+    def _augmentation(self, images, transform, state=None):
+        """Transform one image or a list of images, restoring RNG ``state`` first if given."""
+        if state is not None:
+            torch.set_rng_state(state)
+        if isinstance(images, list):
+            transformed_images = [transform(img) for img in images]
+            return torch.stack(transformed_images, dim=0)  # (f, c, h, w)
+        return transform(images)  # (c, h, w)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, _exc_type, _exc_val, _exc_tb):
+        self.close()