forked from facebookresearch/fairseq
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
119 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
"""Clean the raw ai_hub parallel ko-en corpus.

Strips every '>' character and surrounding whitespace from each line of
full.en / full.ko and writes aligned clean.en / clean.ko files.
"""
import linecache

full_en = 'data/ai_hub/full.en'
full_ko = 'data/ai_hub/full.ko'


def clean(text):
    """Return *text* with all '>' removed and leading/trailing whitespace stripped."""
    return text.replace('>', '').strip()


def main():
    # Context managers guarantee all three handles are closed even on error;
    # the original used bare open() for the outputs and closed them manually.
    with open(full_en, 'r') as src_en, \
            open('data/ai_hub/clean.en', 'w') as out_en, \
            open('data/ai_hub/clean.ko', 'w') as out_ko:
        for idx, en in enumerate(src_en):
            out_en.write(clean(en) + '\n')
            # linecache.getline is 1-based; fetch the Korean line aligned
            # with the current English line.
            ko = linecache.getline(full_ko, idx + 1)
            out_ko.write(clean(ko) + '\n')
            print(idx)  # progress indicator, as in the original


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
"""Split the cleaned parallel ko-en corpus into train/valid/test files.

The original script was incomplete — ``linecache.getline()`` was called with
no arguments (a TypeError) and nothing was written. This version finishes the
intended split: the first ``train_size`` lines go to train, the next
``val_size`` to valid, the next ``test_size`` to test.
"""
import linecache

train_size = 500000
val_size = 50000
test_size = 5000

full_en = 'data/ai_hub/clean.en'
full_ko = 'data/ai_hub/clean.ko'


def split_for_index(idx, train=500000, val=50000, test=5000):
    """Return the split name ('train'/'valid'/'test') for 0-based line
    index *idx*, or None when idx falls past the end of all three splits."""
    if idx < train:
        return 'train'
    if idx < train + val:
        return 'valid'
    if idx < train + val + test:
        return 'test'
    return None


def main():
    # One (en, ko) output pair per split, opened up front.
    outs = {
        name: (open('data/ai_hub/%s.en' % name, 'w'),
               open('data/ai_hub/%s.ko' % name, 'w'))
        for name in ('train', 'valid', 'test')
    }
    try:
        with open(full_en, 'r') as fp:
            for idx, en in enumerate(fp):
                split = split_for_index(idx, train_size, val_size, test_size)
                if split is None:
                    break  # past the last split; remaining lines are unused
                # linecache is 1-based; fetch the aligned Korean line.
                ko = linecache.getline(full_ko, idx + 1)
                out_en, out_ko = outs[split]
                out_en.write(en if en.endswith('\n') else en + '\n')
                out_ko.write(ko if ko.endswith('\n') else ko + '\n')
    finally:
        for out_en, out_ko in outs.values():
            out_en.close()
            out_ko.close()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
"""Tokenize the English side of the ai_hub training data with NLTK's
word_tokenize, writing one space-joined token line per input line."""
# from konlpy.tag import Kkma, Okt
from nltk.tokenize import word_tokenize

# Renamed from `input`/`output`, which shadowed Python builtins.
input_path = "data/ai_hub/train.en"
output_path = "data/ai_hub_tok/train.en"


def main():
    # `with` guarantees both handles are closed; streaming the source file
    # line-by-line avoids materializing the whole corpus via readlines().
    with open(input_path) as src, open(output_path, "w") as dst:
        for idx, line in enumerate(src):
            dst.write(' '.join(word_tokenize(line)) + '\n')
            print(idx)  # progress indicator, as in the original


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
"""Tokenize the English side of the ai_hub training data with a trained
SentencePiece BPE model, writing space-joined subword pieces per line."""
from konlpy.tag import Kkma, Okt  # NOTE(review): unused in this script — candidate for removal
# from nltk.tokenize import word_tokenize
import sentencepiece as spm

# Renamed from `input`/`output`, which shadowed Python builtins.
input_path = "data/ai_hub/train.en"
output_path = "data/ai_hub_sp/train.en"


def main():
    # Load the model produced by the companion SentencePiece training script.
    # Loading inside main() avoids doing file I/O at import time.
    sp = spm.SentencePieceProcessor()
    sp.Load('data/sp/subword_tokenizer_en.model')
    # `with` guarantees both handles are closed even if encoding fails.
    with open(input_path) as src, open(output_path, "w") as dst:
        for idx, line in enumerate(src):
            # out_type=str yields subword piece strings rather than ids.
            pieces = sp.encode(line, out_type=str)
            dst.write(' '.join(pieces) + '\n')
            print(idx)  # progress indicator, as in the original


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
# Binarize the SentencePiece-tokenized ko-en parallel data for fairseq.
# (Post-commit state: the scraped diff interleaved the old data/ai_hub paths
# with the new data/ai_hub_sp ones; this is the coherent new version.)
TEXT=data/ai_hub_sp

fairseq-preprocess --source-lang ko --target-lang en \
    --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
    --destdir data-bin/ai_hub_sp.ko-en
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,4 +5,7 @@ hydra-core | |
omegaconf | ||
bitarray | ||
tensorboard | ||
tensorboardX | ||
tensorboardX | ||
konlpy | ||
nltk | ||
sentencepiece |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
"""Train an English SentencePiece BPE subword tokenizer on the ai_hub
training data."""

input_file = 'data/ai_hub/train.en'
vocab_size = 32000
model_name = 'data/sp/subword_tokenizer_en'
model_type = 'bpe'
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5]'


def build_train_command(input_file, model_prefix, vocab_size, symbols, model_type):
    """Assemble the flag string SentencePieceTrainer.Train expects.

    Kept as a separate pure function so the argument wiring (which the
    original built inline with %-formatting) can be checked independently.
    """
    return ('--input=%s --model_prefix=%s --vocab_size=%s '
            '--user_defined_symbols=%s --model_type=%s') % (
        input_file, model_prefix, vocab_size, symbols, model_type)


def main():
    # Local import: heavy optional dependency, only needed when actually training.
    import sentencepiece as spm
    cmd = build_train_command(input_file, model_name, vocab_size,
                              user_defined_symbols, model_type)
    spm.SentencePieceTrainer.Train(cmd)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,13 @@ | ||
# Decode the bound test set with a trained ko-en checkpoint.
# Swap the DATA/CKPT pair to evaluate the SentencePiece model instead.
# DATA=data-bin/ai_hub_sp.ko-en
# CKPT=checkpoints/koen_sp2/checkpoint_best.pt
DATA=data-bin/ai_hub_tok.ko-en
CKPT=checkpoints/koen_tok/checkpoint_best.pt

# NOTE(review): the original ended the command with a trailing backslash
# followed by a commented "# --remove-bpe" line, silently folding the comment
# into the command; the continuation is removed here.
fairseq-generate $DATA \
    --path $CKPT \
    --batch-size 128 --beam 5
    # --remove-bpe   # enable to strip subword markers when scoring BPE output

# fairseq-interactive \
#     --path checkpoints/koen/checkpoint_best.pt checkpoints/koen \
#     --beam 5 --source-lang ko --target-lang en
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,10 @@ | ||
# Train a tiny Transformer on the SentencePiece-binarized ko-en data.
# (Post-commit state: the scraped diff interleaved old/new OUTPUT and log
# lines and dropped --lr-scheduler in favor of --max-epoch, per the diff.)
OUTPUT=checkpoints/koen_sp2
DATE=20220927_2
DATA=data-bin/ai_hub_sp.ko-en

mkdir -p $OUTPUT
CUDA_VISIBLE_DEVICES=0 fairseq-train $DATA \
    --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
    --arch transformer_tiny --save-dir $OUTPUT --batch-size 256 \
    --tensorboard-logdir log/tfboard --log-file log/$DATE.log \
    --max-epoch 40