-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_languages.py
58 lines (50 loc) · 1.7 KB
/
parse_languages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from html.parser import HTMLParser
def split_first(lang, char):
    """Return the part of *lang* that precedes the first *char*, stripped.

    If *char* does not occur in *lang*, the whole string is returned with
    its surrounding whitespace removed.
    """
    # The first piece of the split is the entire string when the separator
    # is absent, so a single expression covers both cases.
    head = lang.split(char, 1)[0]
    return head.strip()
def clean_language(lang):
    """Reduce a raw language label to its leading name.

    Everything from the first ``(`` onward is dropped, then everything from
    the first ``/`` onward; the remainder is returned stripped of
    surrounding whitespace.
    """
    # Order matters only for readability of the intent: parenthesised
    # remarks are cut first, then alternative names after a slash.
    for separator in ('(', '/'):
        lang = split_first(lang, separator)
    return lang
class TableParser(HTMLParser):
    """Collect the ``<td>`` cells of an HTML table row by row.

    Each time a ``</tr>`` is seen the cells gathered so far are handed to
    :meth:`process_row`, which appends cleaned two-cell rows to the
    module-level ``languages`` list.

    Attributes:
        in_cell: True while the parser is between ``<td>`` and its end.
        cells: Text of the cells completed so far in the current row.
    """

    def __init__(self):
        super().__init__()
        self.in_cell = False
        self.cells = []
        # Text chunks of the cell currently being read.  A cell's text can
        # arrive in several handle_data() calls (nested tags such as
        # <b>/<a> split it), so chunks are buffered and joined per cell.
        self._fragments = []

    def handle_starttag(self, tag, attrs):
        """Start buffering text when a ``<td>`` opens."""
        if tag == "td":
            # A new cell implicitly ends a previous one left unclosed.
            self._flush_cell()
            self.in_cell = True

    def handle_data(self, data):
        """Buffer text that appears inside a ``<td>``; ignore the rest."""
        if self.in_cell:
            self._fragments.append(data)

    def handle_endtag(self, tag):
        """Finish the current cell on ``</td>`` and the row on ``</tr>``."""
        if tag == "tr":
            # A row end also ends any cell left open, so that text between
            # rows is not mistaken for cell content.
            self._flush_cell()
            self.in_cell = False
            self.process_row()
        elif tag == "td":
            self._flush_cell()
            self.in_cell = False

    def _flush_cell(self):
        """Store the buffered chunks as one cell; skip cells with no text."""
        if self._fragments:
            self.cells.append("".join(self._fragments))
            self._fragments = []

    def process_row(self):
        """Record the finished row and reset the cell list.

        A row with exactly two cells is cleaned with ``clean_language`` and
        appended to the module-level ``languages`` list.  A row with a
        single cell, seen after at least one recorded row, is reported on
        stdout and dropped.  Any other row is silently discarded.
        """
        if len(self.cells) == 2:
            languages.append([clean_language(cell) for cell in self.cells])
        elif len(self.cells) == 1 and languages:
            print("Ignoring the second translation ({1}) for {0}".format(
                languages[-1][0], self.cells[0]))
        self.cells = []
def get_language_translation_dict(lang_dict=None):
    """Return the language translation mapping.

    Args:
        lang_dict: An already-built mapping.  When given (anything other
            than ``None``) it is returned unchanged and no file is read.

    Returns:
        ``lang_dict`` if supplied; otherwise a dict built from the rows
        that parsing ``data/language_to_language-name.html`` appends to
        the module-level ``languages`` list.

    Raises:
        OSError: If the HTML file cannot be opened (only when ``lang_dict``
            is ``None``).
    """
    # Identity test: only a genuinely missing argument triggers parsing.
    # An equality test could be fooled by an object with a custom __eq__.
    if lang_dict is None:
        # The path is relative to the current working directory.
        # NOTE(review): the file is decoded with the platform default
        # encoding -- confirm the file's encoding and pass it explicitly.
        with open("data/language_to_language-name.html", "r") as fr:
            data = fr.read()
        parser = TableParser()
        parser.feed(data)
        # Feeding the parser fills `languages` with two-item rows, which
        # dict() turns into first-cell -> second-cell entries.
        lang_dict = dict(languages)
    return lang_dict
# Accumulator for the rows found by TableParser.process_row(); it must exist
# before any parsing happens because process_row() appends to it by name.
languages = []
# Built eagerly: with no argument, get_language_translation_dict() opens and
# parses the HTML file, so importing this module performs that file read.
language_dict = get_language_translation_dict()
def translate_language(lang):
    """Look up *lang* in the module-level ``language_dict``.

    The lookup key is *lang* with surrounding whitespace removed.  When the
    key is present its mapped value is returned; otherwise a notice is
    printed to stdout and the stripped input itself is returned.
    """
    key = lang.strip()
    try:
        return language_dict[key]
    except KeyError:
        # The notice shows the caller's original, unstripped text.
        print("Language not found in list: " + lang)
        return key