forked from SkyFlap/Digital-Life-DL-B
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
71 changed files
with
7,608 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import numpy as np | ||
import torch | ||
from sklearn.cluster import KMeans | ||
|
||
def get_cluster_model(ckpt_path):
    """Rebuild one ``KMeans`` object per speaker from a saved checkpoint.

    Args:
        ckpt_path: Path of a checkpoint readable by ``torch.load``. It must
            map each speaker key to a dict holding the entries
            ``"n_features_in_"``, ``"_n_threads"`` and ``"cluster_centers_"``.

    Returns:
        dict mapping each speaker key to a ``KMeans`` instance whose
        attributes of the same names were restored from the checkpoint.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints that
    # come from a trusted source.
    checkpoint = torch.load(ckpt_path)
    kmeans_dict = {}
    for spk, ckpt in checkpoint.items():
        centers = ckpt["cluster_centers_"]
        # The first KMeans argument is n_clusters, so it has to be the number
        # of stored centers rather than the feature dimension.
        km = KMeans(n_clusters=len(centers))
        # Write straight into __dict__ to restore the fitted state without
        # running fit() again.
        km.__dict__.update(
            {
                "n_features_in_": ckpt["n_features_in_"],
                "_n_threads": ckpt["_n_threads"],
                "cluster_centers_": centers,
            }
        )
        kmeans_dict[spk] = km
    return kmeans_dict
|
||
def get_cluster_result(model, x, speaker):
    """Return the cluster label predicted for every row of ``x``.

    Args:
        model: Mapping from speaker key to an object exposing ``predict``.
        x: np.array [t, 256]
        speaker: Key selecting the entry of ``model`` to use.

    Returns:
        Whatever ``model[speaker].predict(x)`` returns (the cluster class
        result).
    """
    return model[speaker].predict(x)
|
||
def get_cluster_center_result(model, x, speaker):
    """Return, for every row of ``x``, the center of its predicted cluster.

    Args:
        model: Mapping from speaker key to an object exposing ``predict``
            and ``cluster_centers_``.
        x: np.array [t, 256]
        speaker: Key selecting the entry of ``model`` to use.

    Returns:
        ``cluster_centers_`` indexed by the labels predicted for ``x``.
    """
    speaker_model = model[speaker]
    labels = speaker_model.predict(x)
    return speaker_model.cluster_centers_[labels]
|
||
def get_center(model, x, speaker):
    """Look up cluster centers by index for one speaker.

    Args:
        model: Mapping from speaker key to an object exposing
            ``cluster_centers_``.
        x: Index (or index array) into ``cluster_centers_``.
        speaker: Key selecting the entry of ``model`` to use.

    Returns:
        ``model[speaker].cluster_centers_[x]``.
    """
    return model[speaker].cluster_centers_[x]
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import os | ||
from glob import glob | ||
from pathlib import Path | ||
import torch | ||
import logging | ||
import argparse | ||
import torch | ||
import numpy as np | ||
from sklearn.cluster import KMeans, MiniBatchKMeans | ||
import tqdm | ||
logging.basicConfig(level=logging.INFO) | ||
logger = logging.getLogger(__name__) | ||
import time | ||
import random | ||
|
||
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False):
    """Fit k-means on every ``*.soft.pt`` feature file directly in ``in_dir``.

    Args:
        in_dir: ``pathlib.Path`` of the directory holding the feature files.
        n_clusters: Number of clusters to fit.
        use_minibatch: When true, fit ``MiniBatchKMeans`` (batch size 4096,
            at most 80 iterations); otherwise fit a plain ``KMeans``.
        verbose: Forwarded to the scikit-learn estimator.

    Returns:
        dict with the keys ``"n_features_in_"``, ``"_n_threads"`` and
        ``"cluster_centers_"`` copied from the fitted estimator.

    Raises:
        ValueError: If ``in_dir`` contains no ``*.soft.pt`` file.
    """
    logger.info(f"Loading features from {in_dir}")
    features = []
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        # Drop dim 0 and transpose so rows can be concatenated below.
        # NOTE(review): presumably (1, dim, frames) -> (frames, dim); confirm
        # against the feature extractor.
        features.append(torch.load(path).squeeze(0).numpy().T)

    # Number of feature files actually loaded (reported in the summary line).
    nums = len(features)
    if nums == 0:
        raise ValueError(f"No *.soft.pt feature files found in {in_dir}")

    features = np.concatenate(features, axis=0)
    print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")

    t = time.time()
    if use_minibatch:
        kmeans = MiniBatchKMeans(
            n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80
        ).fit(features)
    else:
        kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    print(time.time() - t, "s")

    # Keep only the estimator attributes that are stored in the checkpoint.
    x = {
        "n_features_in_": kmeans.n_features_in_,
        "_n_threads": kmeans._n_threads,
        "cluster_centers_": kmeans.cluster_centers_,
    }
    print("end")
    return x
|
||
|
||
if __name__ == "__main__":
    # Train one k-means model per speaker directory and store all of them in
    # a single checkpoint named after the cluster count.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=Path, default="./dataset/44k",
                        help='path of training data directory')
    parser.add_argument('--output', type=Path, default="./module/So-VITS/44k",
                        help='path of model output directory')
    # Previously hard-coded; the default keeps the original value.
    parser.add_argument('--n_clusters', type=int, default=10000,
                        help='number of k-means clusters per speaker')

    args = parser.parse_args()

    checkpoint_dir = args.output
    dataset = args.dataset
    n_clusters = args.n_clusters

    ckpt = {}
    # Every sub-directory of the dataset directory is treated as one speaker.
    for in_dir in dataset.iterdir():
        if not in_dir.is_dir():
            continue
        spk = in_dir.name
        print(f"train kmeans for {spk}...")
        ckpt[spk] = train_cluster(in_dir, n_clusters, verbose=False)

    checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
    checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
    torch.save(
        ckpt,
        checkpoint_path,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
{ | ||
"train": { | ||
"log_interval": 200, | ||
"eval_interval": 800, | ||
"seed": 1234, | ||
"epochs": 10000, | ||
"learning_rate": 0.0001, | ||
"betas": [ | ||
0.8, | ||
0.99 | ||
], | ||
"eps": 1e-09, | ||
"batch_size": 6, | ||
"fp16_run": false, | ||
"lr_decay": 0.999875, | ||
"segment_size": 10240, | ||
"init_lr_ratio": 1, | ||
"warmup_epochs": 0, | ||
"c_mel": 45, | ||
"c_kl": 1.0, | ||
"use_sr": true, | ||
"max_speclen": 512, | ||
"port": "8001", | ||
"keep_ckpts": 10 | ||
}, | ||
"data": { | ||
"training_files": "filelists/train.txt", | ||
"validation_files": "filelists/val.txt", | ||
"max_wav_value": 32768.0, | ||
"sampling_rate": 44100, | ||
"filter_length": 2048, | ||
"hop_length": 512, | ||
"win_length": 2048, | ||
"n_mel_channels": 80, | ||
"mel_fmin": 0.0, | ||
"mel_fmax": 22050 | ||
}, | ||
"model": { | ||
"inter_channels": 192, | ||
"hidden_channels": 192, | ||
"filter_channels": 768, | ||
"n_heads": 2, | ||
"n_layers": 6, | ||
"kernel_size": 3, | ||
"p_dropout": 0.1, | ||
"resblock": "1", | ||
"resblock_kernel_sizes": [ | ||
3, | ||
7, | ||
11 | ||
], | ||
"resblock_dilation_sizes": [ | ||
[ | ||
1, | ||
3, | ||
5 | ||
], | ||
[ | ||
1, | ||
3, | ||
5 | ||
], | ||
[ | ||
1, | ||
3, | ||
5 | ||
] | ||
], | ||
"upsample_rates": [ | ||
8, | ||
8, | ||
2, | ||
2, | ||
2 | ||
], | ||
"upsample_initial_channel": 512, | ||
"upsample_kernel_sizes": [ | ||
16, | ||
16, | ||
4, | ||
4, | ||
4 | ||
], | ||
"n_layers_q": 3, | ||
"use_spectral_norm": false, | ||
"gin_channels": 256, | ||
"ssl_dim": 256, | ||
"n_speakers": 200 | ||
}, | ||
"spk": { | ||
"speaker0": 0 | ||
} | ||
} |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
数据集准备 | ||
|
||
raw | ||
├───speaker0 | ||
│ ├───xxx1-xxx1.wav | ||
│ ├───... | ||
│ └───Lxx-0xx8.wav | ||
└───speaker1 | ||
├───xx2-0xxx2.wav | ||
├───... | ||
└───xxx7-xxx007.wav | ||
|
||
此外还需要编辑config.json | ||
|
||
"n_speakers": 10 | ||
|
||
"spk":{ | ||
"speaker0": 0, | ||
"speaker1": 1 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
__author__ = """Adrian Bulat""" | ||
__email__ = '[email protected]' | ||
__version__ = '1.0.1' | ||
|
||
from .api import FaceAlignment, LandmarksType, NetworkSize |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
from __future__ import print_function | ||
import os | ||
import torch | ||
from torch.utils.model_zoo import load_url | ||
from enum import Enum | ||
import numpy as np | ||
import cv2 | ||
try: | ||
import urllib.request as request_file | ||
except BaseException: | ||
import urllib as request_file | ||
|
||
from .models import FAN, ResNetDepth | ||
from .utils import * | ||
|
||
|
||
class LandmarksType(Enum):
    """Enum class defining the type of landmarks to detect.

    ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and
    follow the visible contour of the face
    ``_2halfD`` - these points represent the projection of the 3D points
    into 2D
    ``_3D`` - detect the points ``(x,y,z)`` in a 3D space
    """

    _2D = 1
    _2halfD = 2
    _3D = 3
|
||
|
||
class NetworkSize(Enum):
    """Enum of network sizes; members convert to their value via ``int()``."""

    # Only LARGE is defined; the smaller sizes are disabled.
    # TINY = 1
    # SMALL = 2
    # MEDIUM = 3
    LARGE = 4

    def __new__(cls, value):
        """Create the member and store ``value`` as its enum value."""
        member = object.__new__(cls)
        member._value_ = value
        return member

    def __int__(self):
        """Return the member's value so ``int(member)`` works."""
        return self.value
|
||
ROOT = os.path.dirname(os.path.abspath(__file__)) | ||
|
||
class FaceAlignment:
    """Wrapper around a pluggable face detector that returns face boxes."""

    def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
                 device='cuda', flip_input=False, face_detector='sfd', verbose=False):
        """Store the settings and instantiate the selected face detector.

        Args:
            landmarks_type: Stored as ``self.landmarks_type``.
            network_size: Must be convertible with ``int()``; the converted
                value is not used further in this class.
            device: Device string; stored and passed to the detector.
            flip_input: Stored as ``self.flip_input``.
            face_detector: Name of the submodule of
                ``face_detection.detection`` that provides ``FaceDetector``.
            verbose: Stored and passed to the detector.
        """
        # Local import: importlib is only needed while constructing the detector.
        import importlib

        self.device = device
        self.flip_input = flip_input
        self.landmarks_type = landmarks_type
        self.verbose = verbose

        # Result unused; the call is retained so an invalid network_size
        # still fails here.
        int(network_size)

        if 'cuda' in device:
            torch.backends.cudnn.benchmark = True

        # Get the face detector
        face_detector_module = importlib.import_module(
            'face_detection.detection.' + face_detector)
        self.face_detector = face_detector_module.FaceDetector(device=device, verbose=verbose)

    def get_detections_for_batch(self, images):
        """Return one face box (or ``None``) for every image of a batch.

        Args:
            images: Array-like batch supporting ``images[..., ::-1]`` and
                ``.copy()``.

        Returns:
            list with one entry per detector result: ``None`` when nothing
            was detected, otherwise ``(x1, y1, x2, y2)`` as ints taken from
            the first detection.
        """
        # Reverse the last axis before detection.
        # NOTE(review): presumably a BGR -> RGB channel swap; confirm with callers.
        images = images[..., ::-1]
        detected_faces = self.face_detector.detect_from_batch(images.copy())
        results = []

        for d in detected_faces:
            if len(d) == 0:
                results.append(None)
                continue
            # Use only the first detection and clamp negative values to 0.
            d = np.clip(d[0], 0, None)

            # The last value is dropped.
            # NOTE(review): presumably the detection score; confirm.
            x1, y1, x2, y2 = map(int, d[:-1])
            results.append((x1, y1, x2, y2))

        return results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .core import FaceDetector |
Binary file not shown.
Binary file not shown.
Oops, something went wrong.