forked from fishaudio/Bert-VITS2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_text.py
65 lines (56 loc) · 2.23 KB
/
preprocess_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
from random import shuffle
import tqdm
from text.cleaner import clean_text
from collections import defaultdict
# Pipeline stages to execute. The script checks membership for each stage:
#   1 - run clean_text over the transcription list and write "<list>.cleaned"
#   2 - split the cleaned list into train/val lists and build the speaker-id map
#   3 - store the speaker-id map in the JSON config (requires stage 2)
stage = [1, 2, 3]

# Raw transcription list; stage 1 reads it and writes a sibling ".cleaned" file.
transcription_path = "filelists/genshin.list"
# Output lists written by stage 2.
train_path = "filelists/train.list"
val_path = "filelists/val.list"
# JSON config that stage 3 reads and rewrites in place.
config_path = "configs/config.json"

# Number of utterances per speaker that stage 2 moves to the validation list.
val_per_spk = 4
# Upper bound on the validation list size; the excess goes back to training.
max_val_total = 8
if 1 in stage:
    # Stage 1: run clean_text on every "utt|spk|language|text" line of the
    # transcription list and write the result, extended with the phone, tone
    # and word2ph sequences, to "<transcription_path>.cleaned".
    #
    # The input is opened first so that a missing transcription file raises
    # before the output path is truncated; both handles are closed on exit.
    with open(transcription_path, encoding='utf-8') as src, \
            open(transcription_path + '.cleaned', 'w', encoding='utf-8') as f:
        # readlines() keeps the total known so tqdm can show real progress.
        for line in tqdm.tqdm(src.readlines()):
            try:
                utt, spk, language, text = line.strip().split('|')
                norm_text, phones, tones, word2ph = clean_text(text, language)
                f.write('{}|{}|{}|{}|{}|{}|{}\n'.format(
                    utt, spk, language, norm_text,
                    ' '.join(phones),
                    ' '.join(str(i) for i in tones),
                    ' '.join(str(i) for i in word2ph)))
            except Exception as err:
                # Best effort: a line that cannot be parsed or cleaned is
                # reported and skipped. The raw line is printed rather than
                # `utt`, which is unbound (or left over from a previous line)
                # when the split itself is what failed. Catching Exception
                # instead of everything lets Ctrl-C / SystemExit stop the run.
                print("err!", line.strip(), repr(err))
if 2 in stage:
    # Stage 2: group the cleaned lines by speaker, assign each speaker an
    # integer id in order of first appearance, and split the lines into the
    # train and validation lists.
    spk_utt_map = defaultdict(list)
    spk_id_map = {}
    current_sid = 0

    with open(transcription_path + '.cleaned', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line in a
            # hand-edited file) instead of failing on them.
            if not line.strip():
                continue
            # Lines are shuffled and written back verbatim, so a final line
            # without a newline would otherwise be fused with its successor.
            if not line.endswith('\n'):
                line += '\n'
            # Every record must have exactly the seven fields
            # utt|spk|language|text|phones|tones|word2ph.
            fields = line.strip().split('|')
            if len(fields) != 7:
                raise ValueError(
                    'expected 7 |-separated fields, got {}: {!r}'.format(
                        len(fields), line))
            spk = fields[1]
            spk_utt_map[spk].append(line)
            if spk not in spk_id_map:
                spk_id_map[spk] = current_sid
                current_sid += 1

    train_list = []
    val_list = []
    # The first val_per_spk shuffled utterances of each speaker go to
    # validation, the rest to training.
    for utts in spk_utt_map.values():
        shuffle(utts)
        val_list += utts[:val_per_spk]
        train_list += utts[val_per_spk:]

    # Cap the validation list; anything beyond the cap is returned to training.
    if len(val_list) > max_val_total:
        train_list += val_list[max_val_total:]
        val_list = val_list[:max_val_total]

    with open(train_path, "w", encoding='utf-8') as f:
        f.writelines(train_list)
    with open(val_path, "w", encoding='utf-8') as f:
        f.writelines(val_list)
if 3 in stage:
    # Stage 3: store the speaker -> id mapping built by stage 2 under
    # config["data"]["spk2id"] and rewrite the JSON config in place.
    #
    # Raised explicitly (not via `assert`) so the check also runs under
    # `python -O`; the exception type is the same as before.
    if 2 not in stage:
        raise AssertionError(
            "stage 3 needs the speaker map built by stage 2; add 2 to `stage`")

    with open(config_path, encoding='utf-8') as f:
        config = json.load(f)
    config["data"]['spk2id'] = spk_id_map

    # Serialise before reopening the file for writing, so a serialisation
    # error cannot leave a truncated config behind.
    serialized = json.dumps(config, indent=2, ensure_ascii=False)
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(serialized)