Commit dcb9adf

update

yms9654 committed Sep 27, 2022
1 parent 7d1d9ba commit dcb9adf
Showing 9 changed files with 119 additions and 13 deletions.
27 changes: 27 additions & 0 deletions data_clean.py
@@ -0,0 +1,27 @@
import linecache

full_en = 'data/ai_hub/full.en'
full_ko = 'data/ai_hub/full.ko'

output_en = open('data/ai_hub/clean.en', 'w')
output_ko = open('data/ai_hub/clean.ko', 'w')

def clean(line):
    # Drop stray '>' characters and surrounding whitespace.
    ret = line.replace('>', '')
    ret = ret.strip()
    return ret

with open(full_en, 'r') as fp:
    for idx, en in enumerate(fp):
        en = clean(en)
        output_en.write(en + '\n')

        # linecache.getline is 1-indexed; fetch the parallel Korean line.
        ko = linecache.getline(full_ko, idx + 1)
        ko = clean(ko)
        output_ko.write(ko + '\n')

        print(idx)

output_en.close()
output_ko.close()
14 changes: 14 additions & 0 deletions data_split.py
@@ -0,0 +1,14 @@
import linecache

train_size = 500000
val_size = 50000
test_size = 5000

full_en = 'data/ai_hub/clean.en'
full_ko = 'data/ai_hub/clean.ko'

# The committed script stops short of the split logic; a sequential split
# into the train/valid/test files preprocess.sh expects is assumed here.
out_en = {s: open('data/ai_hub/%s.en' % s, 'w') for s in ('train', 'valid', 'test')}
out_ko = {s: open('data/ai_hub/%s.ko' % s, 'w') for s in ('train', 'valid', 'test')}

with open(full_en, 'r') as fp:
    for idx, en in enumerate(fp):
        if idx < train_size:
            split = 'train'
        elif idx < train_size + val_size:
            split = 'valid'
        elif idx < train_size + val_size + test_size:
            split = 'test'
        else:
            break
        ko = linecache.getline(full_ko, idx + 1)  # getline is 1-indexed
        out_en[split].write(en)
        out_ko[split].write(ko)

for f in (*out_en.values(), *out_ko.values()):
    f.close()
18 changes: 18 additions & 0 deletions en_token.py
@@ -0,0 +1,18 @@
# from konlpy.tag import Kkma, Okt
from nltk.tokenize import word_tokenize

input = "data/ai_hub/train.en"
output = "data/ai_hub_tok/train.en"

output_fp = open(output, "w")

with open(input) as fp:
    lines = fp.readlines()
    for idx, line in enumerate(lines):
        # Tokenize with NLTK's word tokenizer and re-join with single spaces.
        tok = word_tokenize(line)
        tok = ' '.join(tok)
        output_fp.write(tok + '\n')
        print(idx)

output_fp.close()
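
Note: word_tokenize needs NLTK's Punkt models and raises a LookupError if they are missing. A one-time download (standard NLTK usage, not part of this commit):

import nltk
nltk.download('punkt')  # fetches the Punkt tokenizer models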
24 changes: 24 additions & 0 deletions ko_token.py
@@ -0,0 +1,24 @@
from konlpy.tag import Kkma, Okt
# from nltk.tokenize import word_tokenize
import sentencepiece as spm

# Note: despite the file name, this run points at the English side.
input = "data/ai_hub/train.en"
output = "data/ai_hub_sp/train.en"

# tokenizer = Okt()
sp = spm.SentencePieceProcessor()
sp.Load('data/sp/subword_tokenizer_en.model')
output_fp = open(output, "w")

with open(input) as fp:
    lines = fp.readlines()
    for idx, line in enumerate(lines):
        # Earlier morpheme-based (Okt) variant, kept for reference:
        # tok = tokenizer.morphs(line)
        # tok = ' '.join(tok)
        # output_fp.write(tok)
        tok = sp.encode(line, out_type=str)  # encode into subword pieces
        tok = ' '.join(tok)
        output_fp.write(tok + '\n')
        print(idx)

output_fp.close()
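
For the Korean side that preprocess.sh expects (train.ko and friends under data/ai_hub_sp), presumably the same script is run with the .ko paths and a Korean SentencePiece model. The settings below are hypothetical, mirroring the English ones:

import sentencepiece as spm

# Hypothetical Korean-side settings (not in this commit).
input = "data/ai_hub/train.ko"
output = "data/ai_hub_sp/train.ko"

sp = spm.SentencePieceProcessor()
sp.Load('data/sp/subword_tokenizer_ko.model')  # assumed Korean model path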
4 changes: 2 additions & 2 deletions preprocess.sh
@@ -1,5 +1,5 @@
-TEXT=data/ai_hub
+TEXT=data/ai_hub_sp
 
 fairseq-preprocess --source-lang ko --target-lang en \
     --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
-    --destdir data-bin/ai_hub.ko-en
+    --destdir data-bin/ai_hub_sp.ko-en
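
If this runs cleanly, fairseq-preprocess writes the source/target dictionaries (dict.ko.txt, dict.en.txt) and the binarized train/valid/test shards into data-bin/ai_hub_sp.ko-en, which train.sh and test.sh below consume.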
5 changes: 4 additions & 1 deletion requirements.txt
@@ -5,4 +5,7 @@ hydra-core
 omegaconf
 bitarray
 tensorboard
-tensorboardX
+tensorboardX
+konlpy
+nltk
+sentencepiece
12 changes: 12 additions & 0 deletions sp_token.py
@@ -0,0 +1,12 @@
import sentencepiece as spm

input_file = 'data/ai_hub/train.en'
vocab_size = 32000
model_name = 'data/sp/subword_tokenizer_en'
model_type = 'bpe'
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5]'

# Train a 32k-vocab BPE model; this writes model_name + '.model'
# (loaded by ko_token.py above) and model_name + '.vocab'.
input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s'
cmd = input_argument % (input_file, model_name, vocab_size, user_defined_symbols, model_type)

spm.SentencePieceTrainer.Train(cmd)
18 changes: 12 additions & 6 deletions test.sh
@@ -1,7 +1,13 @@
-# fairseq-generate data-bin/ai_hub.ko-en \
-#     --path checkpoints/koen/checkpoint_best.pt \
-#     --batch-size 128 --beam 5 --cpu
+# DATA=data-bin/ai_hub_sp.ko-en
+# CKPT=checkpoints/koen_sp2/checkpoint_best.pt
+DATA=data-bin/ai_hub_tok.ko-en
+CKPT=checkpoints/koen_tok/checkpoint_best.pt
 
-fairseq-interactive \
-    --path checkpoints/koen/checkpoint_best.pt \
-    --cpu --beam 5 --source-lang ko --target-lang en \
+fairseq-generate $DATA \
+    --path $CKPT \
+    --batch-size 128 --beam 5 \
+    # --remove-bpe
+
+# fairseq-interactive \
+#     --path checkpoints/koen/checkpoint_best.pt checkpoints/koen \
+#     --beam 5 --source-lang ko --target-lang en
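
Note: with SentencePiece-tokenized data, generated hypotheses keep their subword pieces unless post-processed; fairseq's --remove-bpe flag accepts a sentencepiece mode for this (flag spellings vary across fairseq versions, so treat this as a sketch):

fairseq-generate $DATA --path $CKPT --batch-size 128 --beam 5 --remove-bpe=sentencepiece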
10 changes: 6 additions & 4 deletions train.sh
@@ -1,8 +1,10 @@
-OUTPUT=checkpoints/koen
+OUTPUT=checkpoints/koen_sp2
+DATE=20220927_2
+DATA=data-bin/ai_hub_sp.ko-en
 
 mkdir -p $OUTPUT
-CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/ai_hub.ko-en \
+CUDA_VISIBLE_DEVICES=0 fairseq-train $DATA \
     --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
     --arch transformer_tiny --save-dir $OUTPUT --batch-size 256 \
-    --tensorboard-logdir log/tfboard --log-file log/20220923.log \
-    --lr-scheduler reduce_lr_on_plateau
+    --tensorboard-logdir log/tfboard --log-file log/$DATE.log \
+    --max-epoch 40
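
Checkpoints are saved under checkpoints/koen_sp2 and TensorBoard event files under log/tfboard; the curves can be inspected with the standard tensorboard --logdir log/tfboard.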
