Adding tok.sh and installation scripts for the dependencies (facebookresearch#1339)

Summary: Adding tok.sh, needed to evaluate the performance of multilingual models. Alongside tok.sh, the installation script "install_dependencies.sh" is added, which installs all the needed dependencies except Arabic. Arabic requires downloading separate installation packages and signing licensing agreements, so it can't be automated.

# Before submitting

- [X] Was this discussed/approved via a Github issue? (no need for typos, doc improvements)
- [X] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/master/CONTRIBUTING.md)?
- [X] Did you make sure to update the docs?
- [X] Did you write any new necessary tests?

## PR review

Anyone in the community is free to review the PR once the tests have passed. If we didn't discuss your PR in Github issues there's a high chance it will not be merged.

## Did you have fun?

Make sure you had fun coding!

Pull Request resolved: fairinternal/fairseq-py#1339

Reviewed By: shruti-bh

Differential Revision: D24311526

Pulled By: edunov

fbshipit-source-id: fe9d46b0c7d7dc090e03f504e048b0c6eb616df2
1 parent 6d3712e · commit 3c118ad · 10 changed files with 286 additions and 0 deletions.
@@ -0,0 +1,18 @@
# MMMT Tokenizer

We apply different tokenization strategies for different languages, following the existing literature. Here we provide tok.sh, a tokenizer that can be used to reproduce our results.

To reproduce the results, follow these steps:

```
tgt_lang=...
reference_translation=...
cat generation_output | grep -P "^H" | sort -V | cut -f 3- | sh tok.sh $tgt_lang > hyp
cat $reference_translation | sh tok.sh $tgt_lang > ref
sacrebleu -tok 'none' ref < hyp
```

# Installation

Tools needed for all the languages except Arabic can be installed by running install_dependencies.sh.
If you want to evaluate Arabic models, please follow the instructions provided here: http://alt.qcri.org/tools/arabic-normalizer/ to install the required tools.
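For example, a minimal sketch of the full pipeline for a Korean target, assuming a fairseq-generate output file named generate-test.txt and a detokenized reference file named test.ko (both file names are illustrative):

```
# Illustrative file names; substitute your own generation output and reference.
bash install_dependencies.sh   # one-time setup (all languages except Arabic)

tgt_lang=ko
cat generate-test.txt | grep -P "^H" | sort -V | cut -f 3- | sh tok.sh $tgt_lang > hyp
cat test.ko | sh tok.sh $tgt_lang > ref
sacrebleu -tok 'none' ref < hyp
```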
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


CWD=`pwd`
INSTALL_PATH=$CWD/tokenizers/thirdparty

MOSES=$INSTALL_PATH/mosesdecoder
if [ ! -d $MOSES ]; then
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  git clone https://github.com/moses-smt/mosesdecoder.git $MOSES
  cd $MOSES
  # To deal with differences in handling ' vs "
  git checkout 03578921cc1a03402
  cd -
fi

WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts
if [ ! -d $WMT16_SCRIPTS ]; then
  echo 'Cloning Romanian tokenization scripts'
  git clone https://github.com/rsennrich/wmt16-scripts.git $WMT16_SCRIPTS
fi

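# KyTea, used by seg_ja.sh for Japanese word segmentation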
KYTEA=$INSTALL_PATH/kytea
if [ ! -f $KYTEA/bin/kytea ]; then
  git clone https://github.com/neubig/kytea.git $KYTEA
  cd $KYTEA
  autoreconf -i
  ./configure --prefix=`pwd`
  make
  make install
  cd ..
fi

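# MeCab-ko and its dictionary, used by seg_ko.sh for Korean word segmentation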
export MECAB=$INSTALL_PATH/mecab-0.996-ko-0.9.2
if [ ! -f $MECAB/bin/mecab ]; then
  cd $INSTALL_PATH
  curl -LO https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
  tar zxfv mecab-0.996-ko-0.9.2.tar.gz
  cd mecab-0.996-ko-0.9.2/
  ./configure --prefix=`pwd`
  make
  make install

  cd ..
  curl -LO https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz
  tar zxfv mecab-ko-dic-2.1.1-20180720.tar.gz
  cd mecab-ko-dic-2.1.1-20180720/
  ./autogen.sh
  ./configure --prefix=`pwd` --with-dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic --with-mecab-config=$MECAB/bin/mecab-config
  make
  sh -c 'echo "dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic" > $MECAB/etc/mecabrc'
  make install
  cd $CWD
fi

INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources
if [ ! -d $INDIC_RESOURCES_PATH ]; then
  echo 'Cloning indic_nlp_resources'
  git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git $INDIC_RESOURCES_PATH
fi

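# WAT2020 Burmese (my) segmentation script, converted to Python 3 and saved as seg_my.py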
if [ ! -f $INSTALL_PATH/seg_my.py ]; then
  cd $INSTALL_PATH
  wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip
  unzip wat2020.my-en.zip
  # switch to python3
  cat wat2020.my-en/myseg.py | sed 's/^sys.std/###sys.std/g' | sed 's/### sys/sys/g' | sed 's/unichr/chr/g' > seg_my.py
  cd $CWD
fi

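# Python dependencies: PyThaiNLP (Thai), indic-nlp-library (Indic languages), sacrebleu (Chinese tokenization and BLEU scoring)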
pip install pythainlp sacrebleu indic-nlp-library
@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

set -e

TOKENIZERS_SCRIPTS=tokenizers
INSTALL_PATH=$TOKENIZERS_SCRIPTS/thirdparty

N_THREADS=8

lg=$1

MOSES=$INSTALL_PATH/mosesdecoder
REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl

# special tokenization for Romanian
WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts

NORMALIZE_ROMANIAN=$WMT16_SCRIPTS/preprocess/normalise-romanian.py
REMOVE_DIACRITICS=$WMT16_SCRIPTS/preprocess/remove-diacritics.py

# Burmese
MY_SEGMENT=$INSTALL_PATH/seg_my.py

# Arabic
AR_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenizer_ar.sh

# Korean
KO_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ko.sh

# Japanese
JA_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ja.sh

# Indic
IN_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_indic.py
INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources

# Thai
THAI_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_thai.py

# Chinese
CHINESE_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_zh.py

# Chinese
if [ "$lg" = "zh" ]; then
  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | python $CHINESE_TOKENIZER
# Thai
elif [ "$lg" = "th" ]; then
  cat - | python $THAI_TOKENIZER
# Japanese
elif [ "$lg" = "ja" ]; then
  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | ${JA_SEGMENT}
# Korean
elif [ "$lg" = "ko" ]; then
  cat - | $REM_NON_PRINT_CHAR | ${KO_SEGMENT}
# Romanian
elif [ "$lg" = "ro" ]; then
  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $NORMALIZE_ROMANIAN | $REMOVE_DIACRITICS | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
# Burmese
elif [ "$lg" = "my" ]; then
  cat - | python ${MY_SEGMENT}
# Arabic
elif [ "$lg" = "ar" ]; then
  cat - | ${AR_TOKENIZER}
# Indic
elif [ "$lg" = "ne" ]; then
  cat - | python ${IN_TOKENIZER} $lg
elif [ "$lg" = "si" ]; then
  cat - | python ${IN_TOKENIZER} $lg
elif [ "$lg" = "hi" ]; then
  cat - | python ${IN_TOKENIZER} $lg
# other languages
else
  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
fi
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
SCRIPT=`realpath $0`
KYTEA=`dirname $SCRIPT`/thirdparty/kytea
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$KYTEA/lib:/usr/local/lib
export PATH=$PATH:"$KYTEA/bin"

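# Remove spaces and tabs, then segment Japanese text from stdin with KyTea (no POS tags)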
cat - | tr -d "[:blank:]" | kytea -notags
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
SCRIPT=`realpath $0`
MECAB=`dirname $SCRIPT`/thirdparty/mecab-0.996-ko-0.9.2

export PATH=$PATH:"$MECAB/bin":"$MECAB/lib"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"$MECAB/lib"

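# Segment Korean text from stdin with MeCab-ko in wakati (space-separated) output mode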
cat - | mecab -O wakati
@@ -0,0 +1,12 @@
seg_my.py
indic_nlp_library/
indic_nlp_resources/
kytea/
mecab-0.996-ko-0.9.2.tar.gz
mecab-0.996-ko-0.9.2/
mosesdecoder/
wat2020.my-en.zip
wat2020.my-en/
wmt16-scripts/
mecab-ko-dic-2.1.1-20180720/
mecab-ko-dic-2.1.1-20180720.tar.gz
@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Use: echo {text} | python tokenize_indic.py {language}

import sys

from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer(sys.argv[1], remove_nuktas=False, nasals_mode='do_nothing')

for line in sys.stdin:
    normalized_line = normalizer.normalize(line.strip())
    tokenized_line = ' '.join(trivial_tokenize(normalized_line, sys.argv[1]))
    print(tokenized_line)
@@ -0,0 +1,12 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

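# Use: cat {file} | python tokenize_thai.py
# Word-segments Thai text from stdin with PyThaiNLP.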
import sys

from pythainlp import word_tokenize

for line in sys.stdin:
    print(" ".join(word_tokenize(line.strip())))
@@ -0,0 +1,12 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

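# Use: cat {file} | python tokenize_zh.py
# Tokenizes Chinese text from stdin (or from file arguments) with sacrebleu's Chinese tokenizer.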
import fileinput

import sacrebleu

for line in fileinput.input():
    print(sacrebleu.tokenize_zh(line))
@@ -0,0 +1,27 @@
#!/usr/bin/env sh
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Please follow the instructions here http://alt.qcri.org/tools/arabic-normalizer/
# to install tools needed for Arabic

echo "Please install Arabic tools: http://alt.qcri.org/tools/arabic-normalizer/"
echo "Then update environment variables in tokenizer_ar.sh"
exit 1

SVMTOOL=...
GOMOSESGO=...
QCRI_ARABIC_NORMALIZER=...

export PERL5LIB="$SVMTOOL/lib":"$GOMOSESGO/bin/MADA-3.2":$PERL5LIB

tempfile=$(mktemp)
cat - > $tempfile

cd $QCRI_ARABIC_NORMALIZER

bash qcri_normalizer_mada3.2_aramorph1.2.1.sh $tempfile
cat $tempfile.mada_norm-aramorph.europarl_tok