diff --git a/interactive.sh b/interactive.sh
new file mode 100644
index 0000000..a852600
--- /dev/null
+++ b/interactive.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Runs fairseq-interactive for a single en -> TARGET_LANG pair.
+#
+# Usage: bash interactive.sh TARGET_LANG INPUT_FILE BEAM NBEST
+#   TARGET_LANG  language code passed to --target-lang
+#   INPUT_FILE   path passed to --input
+#   BEAM         value passed to --beam
+#   NBEST        value passed to --nbest
+#
+# Standard output of fairseq-interactive is written to
+# output/en_TARGET_LANG.txt, which is overwritten on every run.
+
+usage='usage: interactive.sh TARGET_LANG INPUT_FILE BEAM NBEST'
+source_lang=en
+target_lang=${1:?$usage}
+input_file=${2:?$usage}
+beam=${3:?$usage}
+nbest=${4:?$usage}
+
+# The redirection below fails when the directory does not exist yet.
+mkdir -p output || exit 1
+
+fairseq-interactive corpus-bin \
+  --path transformer/checkpoint_best.pt \
+  --task translation_multi_simple_epoch \
+  --beam "$beam" \
+  --nbest "$nbest" \
+  --source-lang "$source_lang" \
+  --target-lang "$target_lang" \
+  --encoder-langtok "tgt" \
+  --lang-dict lang_list.txt \
+  --input "$input_file" \
+  --lang-pairs en-as,en-bn,en-brx,en-gom,en-gu,en-hi,en-kn,en-ks,en-mai,en-ml,en-mni,en-mr,en-ne,en-or,en-pa,en-sa,en-sd,en-si,en-ta,en-te,en-ur \
+  > "output/${source_lang}_${target_lang}.txt"
diff --git a/lang_list.txt b/lang_list.txt
new file mode 100644
index 0000000..93ea437
--- /dev/null
+++ b/lang_list.txt
@@ -0,0 +1,22 @@
+en
+as
+bn
+brx
+gom
+gu
+hi
+kn
+ks
+mai
+ml
+mni
+mr
+ne
+or
+pa
+sa
+sd
+si
+ta
+te
+ur
diff --git a/load_input.py b/load_input.py
new file mode 100644
index 0000000..3bc822f
--- /dev/null
+++ b/load_input.py
@@ -0,0 +1,25 @@
+"""Write one input line to source/source.txt as space-separated characters.
+
+Usage: python3 load_input.py TEXT
+
+The text is stripped and lower-cased, every character other than a-z and
+space is removed, and each remaining word is written on its own line with
+its letters separated by single spaces.
+"""
+import re
+import sys
+
+if len(sys.argv) < 2:
+    sys.exit('usage: python3 load_input.py TEXT')
+
+line = sys.argv[1]
+line = re.sub(r'[^a-zA-Z ]+', '', line.strip().lower())
+
+# split() with no argument skips the empty strings that split(' ') returns
+# for consecutive spaces (left behind when a stand-alone punctuation mark
+# is stripped); those used to be written out as blank lines.
+words = line.split()
+
+with open('source/source.txt', 'w') as file:
+    for word in words:
+        file.write(' '.join(word) + "\n")
diff --git a/transliterate_word.sh b/transliterate_word.sh
new file mode 100644
index 0000000..fccf600
--- /dev/null
+++ b/transliterate_word.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Runs load_input.py, interactive.sh and generate_result_files_txt.py once
+# per line of an input file.
+# Usage: bash transliterate_word.sh -l LANG -i FILE -b BEAM -n NBEST -r RERANK
+#   -l  language code passed to interactive.sh and
+#       generate_result_files_txt.py
+#   -i  file whose lines are processed one by one
+#   -b  beam value passed to interactive.sh
+#   -n  nbest value passed to interactive.sh
+#   -r  second argument passed to generate_result_files_txt.py
+
+usage() {
+  printf 'usage: %s -l lang -i input_file -b beam -n nbest -r rerank\n' \
+    "$0" >&2
+  exit 2
+}
+
+while getopts l:i:b:n:r: module; do
+  case "${module}" in
+    l) lang_abr=${OPTARG} ;;
+    i) input_file=${OPTARG} ;;
+    b) beam=${OPTARG} ;;
+    n) nbest=${OPTARG} ;;
+    r) rerank=${OPTARG} ;;
+    *) usage ;;
+  esac
+done
+
+# Without -i the redirection on the last line cannot be performed.
+[ -n "${input_file:-}" ] || usage
+
+# The "|| [ -n ... ]" clause keeps a last line that lacks a trailing newline.
+while IFS= read -r line || [ -n "$line" ]; do
+  python3 load_input.py "$line"
+  bash interactive.sh "$lang_abr" 'source/source.txt' "$beam" "$nbest"
+  # -r stays optional: the argument is omitted when no value was given.
+  python3 generate_result_files_txt.py "$lang_abr" ${rerank:+"$rerank"}
+done < "$input_file"