Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
roedoejet committed May 7, 2019
0 parents commit 702fe66
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Compiled python modules.
*.pyc

# Setuptools distribution folder.
/dist/

# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
1 change: 1 addition & 0 deletions g2p/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
VERSION = '0.0.1'
173 changes: 173 additions & 0 deletions g2p/cors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import csv

class Correspondence(object):
def __init__(self, language):
# Load workbook, either from correspondence spreadsheets, or user loaded
this_dir = os.path.dirname(os.path.abspath(__file__))
if not isinstance(language, type(None)):
if isinstance(language, list):
if all('from' in d for d in language) and all('to' in d for d in language):
for cor in language:
cor["match_pattern"] = self.rule_to_regex(cor)
if self.order_as_is:
# Add match pattern regular expression
self.cor_list = language
else:
self.cor_list = self.process_intermediate(language)
else:
raise exceptions.MalformedCorrespondence()
elif language.endswith('csv'):
self.cor_list = self.load_from_csv(language)
else:
if language.endswith('xlsx'):
self.cor_list = self.load_from_workbook(language)
else:
try_default = os.path.join(this_dir, "correspondence_spreadsheets", language + '.xlsx')
if os.path.exists(try_default):
self.cor_list = self.load_from_workbook(try_default)
else:
raise exceptions.CorrespondenceMissing(language)
else:
raise exceptions.CorrespondenceMissing(language)

def load_from_csv(self, language):
ws = []
with open(language, encoding='utf8') as f:
reader = csv.reader(f)
for line in reader:
ws.append(line)
# Create wordlist
cor_list = []

# Loop through rows in worksheet, create if statements for different columns and append Cors to cor_list.
for entry in ws:
newCor = {"from": "", "to": "", "before": "", "after": ""}

newCor['from'] = entry[0]
newCor['to'] = entry[1]
try:
newCor['before'] = entry[2]
except IndexError:
newCor['before'] = ''

try:
newCor['after'] = entry[3]
except IndexError:
newCor['after'] = ''

for k in newCor:
if isinstance(newCor[k], float) or isinstance(newCor[k], int):
newCor[k] = str(newCor[k])

cor_list.append(newCor)

# Add match pattern regular expression
for cor in cor_list:
cor["match_pattern"] = self.rule_to_regex(cor)

if self.order_as_is:
return cor_list
else:
return self.process_intermediate(cor_list)

def load_from_workbook(self, language):
wb = load_workbook(language)
ws = wb.active
# Create wordlist
cor_list = []

# Loop through rows in worksheet, create if statements for different columns and append Cors to cor_list.
for entry in ws:
newCor = {"from": "", "to": "", "before": "", "after": ""}
for col in entry:
if col.column == 'A':
value = col.value
if type(value) == float or int:
value = str(value)
newCor["from"] = value
if col.column == 'B':
value = col.value
if type(value) == float or int:
value = str(value)
newCor["to"] = value
if col.column == 'C':
if col.value is not None:
value = col.value
if type(value) == float or int:
value = str(value)
newCor["before"] = value
if col.column == 'D':
if col.value is not None:
value = col.value
if type(value) == float or int:
value = str(value)
newCor["after"] = value
cor_list.append(newCor)

# Add match pattern regular expression
for cor in cor_list:
cor["match_pattern"] = self.rule_to_regex(cor)

if self.order_as_is:
return cor_list
else:
return self.process_intermediate(cor_list)

def process_intermediate(self, cor_list):

# To prevent feeding
for cor in cor_list:
# if output exists as input for another cor
if cor['to'] in [temp_cor['from'] for temp_cor in cor_list]:
# assign a random, unique character as a temporary value. this could be more efficient
random_char = chr(random.randrange(9632, 9727))
# make sure character is unique
if [temp_char for temp_char in cor_list if 'temp' in list(temp_char.keys())]:
while random_char in [temp_char['temp'] for temp_char in cor_list if 'temp' in list(temp_char.keys())]:
random_char = chr(random.randrange(9632, 9727))
cor['temp'] = random_char

# preserve rule ordering with regex, then apply context free changes from largest to smallest
context_sensitive_rules = [x for x in cor_list if (x['before'] != '' or x['after'] != "")]
context_free_rules = [x for x in cor_list if x['before'] == "" and x["after"] == ""]
context_free_rules.sort(key=lambda x: len(x["from"]), reverse=True)
cor_list = context_sensitive_rules + context_free_rules
return cor_list

def rule_to_regex(self, rule):
if rule['before'] is not None:
before = rule["before"]
else:
before = ''
if rule['after'] is not None:
after = rule["after"]
else:
after = ''
fromMatch = rule["from"]
try:
ruleRX = re.compile(f"(?<={before})" + fromMatch + f"(?={after})")
except:
raise Exception('Your regex is malformed. Escape all regular expression special characters in your conversion table.')
return ruleRX

def apply_rules(self, to_parse):
for cor in self.cor_list:
if re.search(cor["match_pattern"], to_parse):
# if a temporary value was assigned
if 'temp' in list(cor.keys()):
# turn the original value into the temporary one
to_parse = re.sub(cor["match_pattern"], cor["temp"], to_parse)
else:
# else turn it into the final value
to_parse = re.sub(cor["match_pattern"], cor["to"], to_parse)
# transliterate temporary values
for cor in self.cor_list:
# transliterate temp value to final value if it exists, otherwise pass
try:
if "temp" in cor and cor['temp'] and re.search(cor['temp'], to_parse):
to_parse = re.sub(cor['temp'], cor['to'], to_parse)
else:
pass
except KeyError:
pass
return to_parse
44 changes: 44 additions & 0 deletions g2p/cors/langs/hei/doulos.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
¹, p̓
¡, ṃ́
™, ṃ
µ, m̓
², ṃ̓
†, t̓
¢, ṇ́
Þ, ṇ
ñ, n̓
¦, ṇ̓
ç, c̓
ß, λ
Ò, ƛ
ƒ, ƛ̓
æ, ɫ
Æ, Ɫ
ø, ḷ́
ª, ḷ
¬, l̓
…, ḷ̓
ð, k̓
í, í
¥, y̓
î, i̓
ðv, k̓v
ú, ú
„, w̓
ü, u̓
©v, ǧv
œv, q̓v
þv, x̌v
©, ǧ
œ, q̓
þ, x̌
á, á
Ó, h̓
å, a̓
Ô, ħ
‰, ʔ
ÿx, x̌
ÿX, X̌
Í, ⅄
Î, ⅄
Ï, ⅄̓
15 changes: 15 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
''' Setup for G2P
'''
from setuptools import setup, find_packages
import g2p

setup(
name='g2p',
python_requires='>=3.6',
version=g2p.VERSION,
long_description='indexed grapheme to phoneme conversion',
packages=find_packages(),
include_package_data=True,
zip_safe=False,
install_requires=[]
)
Empty file added tests/__init__.py
Empty file.

0 comments on commit 702fe66

Please sign in to comment.