# hebrew_utils.py
# Helpers for Hebrew text carrying nikud (vowel points).
import numpy as np
# Unicode codepoints for nikud:
# NOTE: Some of these are extended nikud which we will not use
# 1456 HEBREW POINT SHEVA
# 1457 HEBREW POINT HATAF SEGOL
# 1458 HEBREW POINT HATAF PATAH
# 1459 HEBREW POINT HATAF QAMATS
# 1460 HEBREW POINT HIRIQ
# 1461 HEBREW POINT TSERE
# 1462 HEBREW POINT SEGOL
# 1463 HEBREW POINT PATAH
# 1464 HEBREW POINT QAMATS
# 1465 HEBREW POINT HOLAM
# 1466 HEBREW POINT HOLAM HASER FOR VAV ***EXTENDED***
# 1467 HEBREW POINT QUBUTS
# 1468 HEBREW POINT DAGESH OR MAPIQ
# 1469 HEBREW POINT METEG ***EXTENDED***
# 1470 HEBREW PUNCTUATION MAQAF ***EXTENDED***
# 1471 HEBREW POINT RAFE ***EXTENDED***
# 1472 HEBREW PUNCTUATION PASEQ ***EXTENDED***
# 1473 HEBREW POINT SHIN DOT
# 1474 HEBREW POINT SIN DOT
# Inclusive codepoint range covered by the table above.
NIKUD_START_ORD = 1456
NIKUD_END_ORD = 1474
# Codepoints inside that range that are marked ***EXTENDED*** in the table above.
SPECIAL_ORDS = {1466, 1469, 1470, 1471, 1472}
# Extended nikud: includes symbols such as rafe which we strip, but do not add to texts
EXTENDED_NIKUD = {chr(i) for i in range(NIKUD_START_ORD, NIKUD_END_ORD + 1)}
# Nikud: ordinary nikud that we add to texts
NIKUD = {c for c in EXTENDED_NIKUD if ord(c) not in SPECIAL_ORDS}
N_VOWELS = len(NIKUD) - 3  # not including dagesh, shin dot, sin dot
# Maps a dense index (0, 1, 2, ...) to each ordinary nikud character, in
# ascending codepoint order. Built with a comprehension so that no scratch
# loop variables are left behind in the module namespace.
idx2chr = {
    idx: chr(code)
    for idx, code in enumerate(
        o for o in range(NIKUD_START_ORD, NIKUD_END_ORD + 1)
        if o not in SPECIAL_ORDS
    )
}
def strip_nikud(s):
    '''Remove every character in EXTENDED_NIKUD from `s`.

    s: either a str, or an object with a `.copy()` method and a `.str`
       accessor (the original code notes this is a pandas Series).
    Returns a new object of the same kind with the marks removed;
    the input itself is not modified.'''
    if isinstance(s, str):  # isinstance so that str subclasses are handled too
        # One pass over the text instead of one .replace() call per mark.
        drop_table = str.maketrans('', '', ''.join(EXTENDED_NIKUD))
        return s.translate(drop_table)
    out = s.copy()  # pd Series
    for mark in EXTENDED_NIKUD:
        # regex=False: each mark is a literal character, not a pattern.
        out = out.str.replace(mark, '', regex=False)
    return out
def text_contains_nikud(text):
    '''Return True iff at least one character of `text` is in EXTENDED_NIKUD.'''
    shared = EXTENDED_NIKUD.intersection(text)
    return bool(shared)
# The Hebrew letters, including the final forms.
ABG = set('אבגדהוזחטיכךלמםנןסעפףצץקרשת')


def text_contains_abg(text):
    '''Return True iff at least one character of `text` is a letter in ABG.'''
    shared = ABG.intersection(text)
    return bool(shared)
# CHARSET = NIKUD | ABG
# The letter yud.
YUD = 'י'
# The letter vav.
VAV = 'ו'
# Two-character string: yud followed by vav.
YV = ''.join((YUD, VAV))
### utilities for converting (haser, male) text pairs into input & target for nikud model: ###
# haser: includes nikud, but not extra yuds/vavs
def align_haser_male(haser, male):
    '''Input: pairs of texts in ktiv haser (with nikud) and ktiv male
    Output: list of pairs (c1, c2) of characters; c1 in haser, c2 in male

    Both texts are walked in parallel, emitting one pair per step:
      (c, c)  - the same character is next in both texts
      (c, '') - c is consumed from haser only (a character in NIKUD,
                or haser text left over after male is exhausted)
      ('', c) - c is consumed from male only (any other mismatch,
                or male text left over after haser is exhausted)
    Every character of both inputs appears in exactly one pair.'''
    i = 0
    j = 0
    n_haser = len(haser)
    n_male = len(male)
    output = []
    # `or`, not `and`: keep going until BOTH texts are consumed. With `and`
    # the two exhaustion branches below could never run, and whatever
    # remained in the longer text (e.g. nikud on the last letter) was lost.
    while i < n_haser or j < n_male:
        if i >= n_haser:
            # haser exhausted: the rest of male has no counterpart
            output.append(('', male[j]))
            j += 1
        elif j >= n_male:
            # male exhausted: the rest of haser has no counterpart
            output.append((haser[i], ''))
            i += 1
        elif haser[i] == male[j]:
            output.append((haser[i], male[j]))
            i += 1
            j += 1
        elif haser[i] in NIKUD:
            # nikud exists only in the haser text
            output.append((haser[i], ''))
            i += 1
        else:
            # mismatch on a non-nikud character: treat male[j] as an insertion
            output.append(('', male[j]))
            j += 1
    return output
def chunk_haser_male(haser, male):
    '''Split the text into chunks, based on align_haser_male(haser, male).

    Returns a list of (str, bool) tuples:
      str:  a character common to both texts, followed by any haser-only
            characters aligned right after it; or a single male-only character
      bool: True iff the chunk is a male-only character, i.e. a letter that
            should be deleted (extra yud/vav)'''
    result = []
    pending = ''  # chunk currently being assembled
    for h_char, m_char in align_haser_male(haser, male):
        if h_char != m_char and h_char != '':
            # haser-only character: attach it to the chunk in progress
            pending += h_char
            continue
        # Any other pair begins a new chunk, so close the one in progress.
        if pending != '':
            result.append((pending, False))
            pending = ''
        if h_char == m_char:
            pending += h_char
        else:
            # male-only character: stands alone and is flagged for deletion
            result.append((m_char, True))
    if pending != '':
        result.append((pending, False))
    return result
def chunk2target(chunk):
    '''Convert one (text, del_flag) chunk into a multilabel target list.

    The list holds one 0/1 entry for each codepoint from NIKUD_START_ORD to
    NIKUD_END_ORD that is not in SPECIAL_ORDS (ascending order; 1 iff that
    character occurs in text), followed by a final entry int(del_flag).'''
    text, del_flag = chunk
    target = []
    for code in range(NIKUD_START_ORD, NIKUD_END_ORD + 1):
        if code in SPECIAL_ORDS:
            continue
        target.append(1 if chr(code) in text else 0)
    target.append(int(del_flag))
    return target
def haser_male2target(haser, male):
    '''Input: pairs of texts in ktiv haser (with nikud) and ktiv male
    Output: multilabel targets for nikud model, as a numpy array with one
    row (the chunk2target list) per chunk from chunk_haser_male'''
    rows = []
    for chunk in chunk_haser_male(haser, male):
        rows.append(chunk2target(chunk))
    return np.vstack(rows)
if __name__ == '__main__':
    # Demo: print a sample (haser, male) pair, its chunks, and the target
    # array together with its shape.
    haser = 'הַכְּרֻבִים'
    male = 'הכרובים'
    for text in (haser, male):
        print(text)
    print(chunk_haser_male(haser, male))
    target = haser_male2target(haser, male)
    print(target)
    print(target.shape)