delete pinyin merge process

phecda-xu · Jan 13, 2021 · 3ce90d6 · 3ce90d6
1 parent f6a5b58
commit 3ce90d6
Showing 1 changed file with 12 additions and 40 deletions.
diff --git a/data_utils/aishell_1_data_process.py b/data_utils/aishell_1_data_process.py
@@ -7,38 +7,18 @@
 import argparse
 from tqdm import tqdm
 from collections import Counter
-from pypinyin import lazy_pinyin, Style
+from pypinyin import pinyin
 
 parser = argparse.ArgumentParser(description='AIShell_1 processing')
 parser.add_argument('--BASE-PATH',
                     default='/home/xhongyang/Project/data/ASR/aishell_1/data_aishell',
                     type=str,
                     help='Batch size for training')
-parser.add_argument('--label-type',
-                    default='None',
+parser.add_argument('--OUT-PATH',
+                    default='../data',
                     type=str,
                     help='Batch size for training')
 args = parser.parse_args()
-style = Style.TONE3
-
-
-def pinyin_cover(char):
-    if 'zh' in char:
-        char = char.replace("zh", "z")
-    char = char.replace("z", "z-zh")
-    if 'ch' in char:
-        char = char.replace("ch", "c")
-    char = char.replace("c", "c-ch")
-    if 'sh' in char:
-        char = char.replace("sh", "s")
-    char = char.replace("s", "s-sh")
-    if 'l' in char:
-        char = char.replace("l", "n")
-    if 'ing' in char:
-        char = char.replace("ing", "in")
-    char = char.replace("in", "in-ing")
-    return char
-
 
 counter = Counter()
 transcript_path = os.path.join(args.BASE_PATH, 'transcript', 'aishell_transcript_v0.8.txt')
@@ -50,20 +30,16 @@ def pinyin_cover(char):
     audio_id, text = line.split(' ', 1)
     # remove withespace
     text = ''.join(text.split())
-    if args.label_type == 'pinyin':
-        pinyin_str = ' '.join([pinyin_cover(i) for i in lazy_pinyin(text, errors='ignore')])
-        transcript_dict[audio_id] = pinyin_str
-        counter.update(pinyin_str.split(' '))
-    elif args.label_type == 'fully_pinyin':
-        fully_pinyin_str = ' '.join([i for i in lazy_pinyin(text, errors='ignore', style=style)])
-        transcript_dict[audio_id] = fully_pinyin_str
-        counter.update(fully_pinyin_str.split(' '))
-    else:
-        transcript_dict[audio_id] = text
-        counter.update(text)
+    pins_ = pinyin(text)
+    pins = [i[0] for i in pins_]
+    transcript_dict[audio_id] = text
+    counter.update(pins)
+
+if not os.path.exists(args.OUT_PATH):
+    os.makedirs(args.OUT_PATH)
 
 count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=False)
-with codecs.open('data/am_tokens.txt', 'w', 'utf-8') as fout:
+with codecs.open('../data/am_tokens.txt', 'w', 'utf-8') as fout:
     fout.write('S' + '\n')
     fout.write('/S' + '\n')
     for char, count in count_sorted:
@@ -81,14 +57,10 @@ def pinyin_cover(char):
             # if no transcription for audio then skipped
             if audio_id not in transcript_dict:
                 continue
-            if args.label_type == 'pinyin':
-                transcript = transcript_dict[audio_id]
-            elif args.label_type == 'fully_pinyin':
-                transcript = transcript_dict[audio_id]
             else:
                 transcript = transcript_dict[audio_id]
             audio_list.append('{}\t{}'.format(audio_path, transcript))
-    manifest_path = 'data/am_{}_list.txt'.format(data_set)
+    manifest_path = '../data/am_{}_list.txt'.format(data_set)
     with codecs.open(manifest_path, 'w', 'utf-8') as fout:
         for line in audio_list:
             fout.write(line + '\n')