Skip to content

Commit

Permalink
delete pinyin merge process
Browse files Browse the repository at this point in the history
  • Loading branch information
phecda-xu committed Jan 13, 2021
1 parent f6a5b58 commit 3ce90d6
Showing 1 changed file with 12 additions and 40 deletions.
52 changes: 12 additions & 40 deletions data_utils/aishell_1_data_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,38 +7,18 @@
import argparse
from tqdm import tqdm
from collections import Counter
from pypinyin import lazy_pinyin, Style
from pypinyin import pinyin

parser = argparse.ArgumentParser(description='AIShell_1 processing')
parser.add_argument('--BASE-PATH',
default='/home/xhongyang/Project/data/ASR/aishell_1/data_aishell',
type=str,
help='Batch size for training')
parser.add_argument('--label-type',
default='None',
parser.add_argument('--OUT-PATH',
default='../data',
type=str,
help='Batch size for training')
args = parser.parse_args()
style = Style.TONE3


def pinyin_cover(char):
    """Rewrite a pinyin syllable so selected confusable sounds share a label.

    The rules are applied in order, each only when its trigger substring is
    present in ``char``:

    * ``zh`` -> every ``z``/``zh`` becomes ``z-zh``
    * ``ch`` -> every ``c``/``ch`` becomes ``c-ch``
    * ``sh`` -> every ``s``/``sh`` becomes ``s-sh``
    * ``l``  -> every ``l`` becomes ``n``
    * ``ing`` -> every ``in``/``ing`` becomes ``in-ing``

    Args:
        char: Pinyin string to rewrite (the visible caller passes the
            elements returned by ``lazy_pinyin``).

    Returns:
        The rewritten string; unchanged when no rule triggers.

    NOTE(review): a syllable that contains only the short form (e.g. ``zi``
    or ``xin``) is returned untouched, so it does not receive the same
    ``z-zh`` / ``in-ing`` label as its counterpart. Confirm whether that
    asymmetry is intended before relying on this as a full merge.
    """
    # Collapse the retroflex initial onto the plain one first, then expand
    # every occurrence of the plain initial into the combined label.
    for plain, retroflex in (("z", "zh"), ("c", "ch"), ("s", "sh")):
        if retroflex in char:
            char = char.replace(retroflex, plain)
            char = char.replace(plain, plain + "-" + retroflex)

    # l/n are folded one way only: every "l" is rewritten to "n".
    if "l" in char:
        char = char.replace("l", "n")

    # Same collapse-then-expand trick for the in/ing finals.
    if "ing" in char:
        char = char.replace("ing", "in")
        char = char.replace("in", "in-ing")

    return char


counter = Counter()
transcript_path = os.path.join(args.BASE_PATH, 'transcript', 'aishell_transcript_v0.8.txt')
Expand All @@ -50,20 +30,16 @@ def pinyin_cover(char):
audio_id, text = line.split(' ', 1)
# remove whitespace
text = ''.join(text.split())
if args.label_type == 'pinyin':
pinyin_str = ' '.join([pinyin_cover(i) for i in lazy_pinyin(text, errors='ignore')])
transcript_dict[audio_id] = pinyin_str
counter.update(pinyin_str.split(' '))
elif args.label_type == 'fully_pinyin':
fully_pinyin_str = ' '.join([i for i in lazy_pinyin(text, errors='ignore', style=style)])
transcript_dict[audio_id] = fully_pinyin_str
counter.update(fully_pinyin_str.split(' '))
else:
transcript_dict[audio_id] = text
counter.update(text)
pins_ = pinyin(text)
pins = [i[0] for i in pins_]
transcript_dict[audio_id] = text
counter.update(pins)

if not os.path.exists(args.OUT_PATH):
os.makedirs(args.OUT_PATH)

count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=False)
with codecs.open('data/am_tokens.txt', 'w', 'utf-8') as fout:
with codecs.open('../data/am_tokens.txt', 'w', 'utf-8') as fout:
fout.write('S' + '\n')
fout.write('/S' + '\n')
for char, count in count_sorted:
Expand All @@ -81,14 +57,10 @@ def pinyin_cover(char):
# if no transcription for audio then skipped
if audio_id not in transcript_dict:
continue
if args.label_type == 'pinyin':
transcript = transcript_dict[audio_id]
elif args.label_type == 'fully_pinyin':
transcript = transcript_dict[audio_id]
else:
transcript = transcript_dict[audio_id]
audio_list.append('{}\t{}'.format(audio_path, transcript))
manifest_path = 'data/am_{}_list.txt'.format(data_set)
manifest_path = '../data/am_{}_list.txt'.format(data_set)
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in audio_list:
fout.write(line + '\n')

0 comments on commit 3ce90d6

Please sign in to comment.