Skip to content

Commit

Permalink
Fix bugs in detokenization of transliteration and translation
Browse files (browse the repository at this point in the history)
  • Loading branch information
gaigutherz committed Apr 4, 2022
1 parent 045d38e commit cc2c54f
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 8 deletions.
12 changes: 10 additions & 2 deletions akkadian/translate_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,30 @@ def translation(line):
return False


def detokenize_source(line):
def detokenize_cuneiform(line):
    """Rebuild a cuneiform source line from its tab-separated, tokenized form.

    The first tab-separated field is kept verbatim as a prefix. All remaining
    fields are concatenated, and every ``▁`` marker and every space is
    removed, so the signs come out as one unbroken string.

    Args:
        line: A tab-separated string.

    Returns:
        ``"<first field> <joined signs>"``. A line without any tab yields the
        whole line followed by a single space.
    """
    # NOTE(review): the first field is presumably a line label emitted by the
    # translation tool -- confirm against the caller.
    fields = line.split('\t')
    label, pieces = fields[0], fields[1:]

    # Every space is stripped below, so the pieces can be joined directly.
    signs = ''.join(pieces).replace('▁', '').replace(' ', '')
    return label + ' ' + signs


def detokenize_transliteration(line):
    """Rebuild a transliteration line from its tab-separated, tokenized form.

    The first tab-separated field is kept verbatim as a prefix. The remaining
    fields are joined with single spaces, the ``▁`` marker is removed, and
    the spaces next to ``-`` and ``.`` (either side), after ``{`` and before
    ``}`` are dropped so that those characters attach to their neighbours.

    Args:
        line: A tab-separated string.

    Returns:
        ``"<first field> <detokenized text>"``. A line without any tab yields
        the whole line followed by a single space.
    """
    # Applied strictly in this order; each rule sees the previous rule's output.
    rules = (
        ('▁', ''),
        ('- ', '-'),
        (' -', '-'),
        ('. ', '.'),
        (' .', '.'),
        ('{ ', '{'),
        (' }', '}'),
    )

    fields = line.split('\t')
    text = ' '.join(fields[1:])
    for old, new in rules:
        text = text.replace(old, new)
    return fields[0] + ' ' + text


def detokenize_translation(line, include_line_number=False):
    """Rebuild a translation line from its tab-separated, tokenized form.

    The line is split on tabs and, when it has more than one field, the
    second field is discarded. Everything after the first field is joined
    with single spaces, the ``▁`` marker is removed, and the stray spaces
    around punctuation are dropped.

    Args:
        line: A tab-separated string.
        include_line_number: When true, prefix the result with the first
            field and a space.

    Returns:
        The detokenized translation, optionally prefixed with the first field.
    """
    # NOTE(review): the discarded second field is presumably a model score --
    # confirm against the tool that produces these lines.
    fields = line.split('\t')
    if len(fields) > 1:
        del fields[1]

    # Applied strictly in this order; each rule sees the previous rule's output.
    # NOTE(review): the apostrophe rules were garbled in the source (the quote
    # characters had collapsed, leaving replace() with a single argument).
    # Both the typographic (U+2019) and the ASCII apostrophe are handled here;
    # confirm which one the translations actually contain.
    rules = (
        ('▁', ''),
        (' ,', ','),
        (' .', '.'),
        ('- ', '-'),
        (' -', '-'),
        (' !', '!'),
        (' ?', '?'),
        (' ;', ';'),
        (' \u2019', '\u2019'),
        ('\u2019 ', '\u2019'),
        (" '", "'"),
        ("' ", "'"),
    )

    translation = ' '.join(fields[1:])
    for old, new in rules:
        translation = translation.replace(old, new)

    if include_line_number:
        return fields[0] + ' ' + translation
    return translation
6 changes: 3 additions & 3 deletions akkadian/translate_from_cuneiform.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import subprocess
from pathlib import Path
from translation_tokenize import tokenize
from translate_common import source, translation, detokenize_source, detokenize_translation
from translate_common import source, translation, detokenize_cuneiform, detokenize_translation


def translate_cuneiform_base(file, capture_output=False):
Expand All @@ -25,9 +25,9 @@ def translate_cuneiform_file(file):
raw_result = translate_cuneiform_base(file, True).stdout
for line in raw_result.decode().split('\n'):
if source(line):
print(detokenize_source(line))
print(detokenize_cuneiform(line))
if translation(line):
print(detokenize_translation(line, True))
print(detokenize_translation(line, True) + "\n")


if __name__ == '__main__':
Expand Down
6 changes: 3 additions & 3 deletions akkadian/translate_from_transliteration.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import subprocess
from pathlib import Path
from translation_tokenize import tokenize
from translate_common import source, translation, detokenize_source, detokenize_translation
from translate_common import source, translation, detokenize_transliteration, detokenize_translation


def translate_transliteration_base(file, capture_output=False):
Expand All @@ -25,9 +25,9 @@ def translate_transliteration_file(file):
raw_result = translate_transliteration_base(file, True).stdout
for line in raw_result.decode().split('\n'):
if source(line):
print(detokenize_source(line))
print(detokenize_transliteration(line))
if translation(line):
print(detokenize_translation(line, True))
print(detokenize_translation(line, True) + "\n")


if __name__ == '__main__':
Expand Down

0 comments on commit cc2c54f

Please sign in to comment.