forked from espnet/espnet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrn2ctm.py
executable file
·78 lines (68 loc) · 2.68 KB
/
trn2ctm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/python
import argparse
import codecs
import math
import re
import sys
# True when running under a Python 2 interpreter; used to decide whether the
# standard streams expose a separate binary ``.buffer`` attribute.
is_python2 = sys.version_info.major == 2
def get_parser():
    """Build the command-line parser for the trn -> ctm converter.

    Both positionals are optional (``nargs="?"``) and default to ``None``.

    Returns:
        argparse.ArgumentParser: parser exposing ``trn`` and ``ctm``.
    """
    parser = argparse.ArgumentParser(description="convert trn to ctm")
    # Both positionals share the same shape; only the name and help text differ.
    for arg_name, help_text in (("trn", "input trn"), ("ctm", "output ctm")):
        parser.add_argument(
            arg_name, type=str, default=None, nargs="?", help=help_text
        )
    return parser
def main(args):
    """Parse the command-line arguments and run the conversion.

    Args:
        args: list of argument strings (without the program name).
    """
    options = get_parser().parse_args(args)
    convert(trn=options.trn, ctm=options.ctm)
def _format_hundredths(value):
    """Render an integer number of hundredths as a decimal string.

    ``1234`` -> ``"12.34"``, ``50`` -> ``"0.50"``, ``5`` -> ``"0.05"``.
    The fractional part is always zero-padded to two digits so that
    single-digit values are not mistaken for tenths.
    """
    sign = "-" if value < 0 else ""
    whole, frac = divmod(abs(value), 100)
    return "{}{}.{:02d}".format(sign, whole, frac)


def _read_trn_lines(trn):
    """Return all lines of the trn input, decoded as UTF-8.

    Reads from the file at path *trn*, or from standard input when *trn*
    is ``None``.
    """
    if trn is not None:
        with codecs.open(trn, "r", encoding="utf-8") as handle:
            return handle.readlines()
    raw_in = sys.stdin if is_python2 else sys.stdin.buffer
    return codecs.getreader("utf-8")(raw_in).readlines()


def _trn_line_to_ctm(line):
    """Convert one trn line into a list of ctm lines, one per word.

    The segment's time span is divided evenly between the words; the last
    word absorbs the rounding remainder so the segment end is preserved.
    """
    paren = line.rindex("(")
    text = line[:paren].strip().upper()
    # Drop the final character of the id part (the closing parenthesis).
    segment_id = line[paren + 1:].strip()[:-1]

    # Collapse any run of opening parentheses into a single one.
    text = re.sub(r"\({2,}", "(", text)

    fields = [part.strip() for part in re.split("[-_]", segment_id)]
    col1 = fields[0] + "_" + fields[1]
    col2 = fields[2]
    # NOTE(review): fields 6 and 7 are treated as start/end in hundredths of
    # the time unit (presumably centiseconds) -- confirm against the id format.
    start = int(fields[6])
    end = int(fields[7])

    # Filtering empty tokens also takes care of repeated spaces.
    words = [word for word in text.split(" ") if word]
    if not words:
        return []

    step = (end - start) // len(words)
    last = len(words) - 1
    ctm_lines = []
    for pos, word in enumerate(words):
        word_start = start + step * pos
        duration = end - word_start if pos == last else step
        ctm_lines.append(
            " ".join(
                [
                    col1,
                    col2,
                    _format_hundredths(word_start),
                    _format_hundredths(duration),
                    word,
                ]
            )
        )
    return ctm_lines


def convert(trn=None, ctm=None):
    """Convert a trn transcript into word-level ctm lines.

    Each input line is expected to look like ``<words> (<segment-id>)``.
    The segment id is split on ``-`` and ``_`` and must yield at least eight
    fields: fields 0 and 1 (joined by ``_``) form the first output column,
    field 2 the second, and fields 6 and 7 are the integer start and end
    times in hundredths. Words are upper-cased, and each word gets an equal
    share of the segment duration.

    Each output line is ``<col1> <col2> <start> <duration> <WORD>``.

    Args:
        trn: path of the UTF-8 trn file to read, or ``None`` for stdin.
        ctm: path of the UTF-8 ctm file to write, or ``None`` for stdout.

    Raises:
        ValueError: if a non-blank line has no ``(`` or a time field is not
            an integer.
        IndexError: if a segment id has fewer than eight fields.
    """
    ctm_lines = []
    for line in _read_trn_lines(trn):
        if not line.strip():
            # Blank lines carry no segment; skip them instead of crashing.
            continue
        ctm_lines.extend(_trn_line_to_ctm(line))

    payload = "".join(ctm_line + "\n" for ctm_line in ctm_lines)

    # Write through a local handle: rebinding sys.stdout would leak the file
    # and hijack every later print in the process.
    if ctm is not None:
        with codecs.open(ctm, "w", encoding="utf-8") as out:
            out.write(payload)
    else:
        # Flush pending text output first so it cannot end up after ours.
        sys.stdout.flush()
        raw_out = sys.stdout if is_python2 else sys.stdout.buffer
        codecs.getwriter("utf-8")(raw_out).write(payload)
        raw_out.flush()
if __name__ == "__main__":
    # Executed as a script: hand everything after the program name to main().
    cli_args = sys.argv[1:]
    main(cli_args)