Skip to content

Commit

Permalink
Add source case information to json file
Browse files (browse the repository at this point in the history)
  • Loading branch information
hirofumi0810 committed Jul 23, 2020
1 parent 76c448b commit 246f891
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 56 deletions.
16 changes: 8 additions & 8 deletions egs/fisher_callhome_spanish/st1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ backend=pytorch # chainer or pytorch
stage=0 # start from 0 if you need to start from data preparation
stop_stage=100
ngpu=1 # number of gpus ("0" uses cpu, otherwise use gpu)
nj=4 # numebr of parallel jobs for decoding
nj=4 # number of parallel jobs for decoding
debugmode=1
dumpdir=dump # directory to dump full features
N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
Expand Down Expand Up @@ -218,13 +218,13 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

echo "make json files"
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang en \
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang en \
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang en \
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
done

# Fisher has 4 references per utterance
Expand All @@ -241,7 +241,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
feat_dir=${dumpdir}/${x}/delta${do_delta}
data_dir=data/$(echo ${x} | cut -f 1 -d ".").es
update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json ${data_dir} ${dict}
${feat_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json ${data_dir} ${dict}
done
for x in fisher_dev.en fisher_dev2.en fisher_test.en; do
feat_dir=${dumpdir}/${x}/delta${do_delta}
Expand Down Expand Up @@ -293,8 +293,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--seed ${seed} \
--verbose ${verbose} \
--resume ${resume} \
--train-json ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--valid-json ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--train-json ${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json \
--valid-json ${feat_dt_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json \
--enc-init ${asr_model} \
--dec-init ${mt_model}
fi
Expand Down Expand Up @@ -325,7 +325,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}

# split data
splitjson.py --parts ${nj} ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
splitjson.py --parts ${nj} ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json

#### use CPU for decoding
ngpu=0
Expand Down
16 changes: 8 additions & 8 deletions egs/how2/st1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ backend=pytorch # chainer or pytorch
stage=0 # start from -1 if you need to start from data download
stop_stage=100
ngpu=1 # number of gpus ("0" uses cpu, otherwise use gpu)
nj=16 # numebr of parallel jobs for decoding
nj=16 # number of parallel jobs for decoding
debugmode=1
dumpdir=dump # directory to dump full features
N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
Expand Down Expand Up @@ -164,21 +164,21 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

echo "make json files"
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang pt \
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang pt \
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang pt \
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
done

# update json (add source references)
for x in ${train_set} ${train_dev}; do
feat_dir=${dumpdir}/${x}/delta${do_delta}
data_dir=data/$(echo ${x} | cut -f 1 -d ".").en
update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json ${data_dir} ${dict}
${feat_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json ${data_dir} ${dict}
done
fi

Expand Down Expand Up @@ -222,8 +222,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--seed ${seed} \
--verbose ${verbose} \
--resume ${resume} \
--train-json ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--valid-json ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--train-json ${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json \
--valid-json ${feat_dt_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json \
--enc-init ${asr_model} \
--dec-init ${mt_model}
fi
Expand Down Expand Up @@ -254,7 +254,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}

# split data
splitjson.py --parts ${nj} ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
splitjson.py --parts ${nj} ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json

#### use CPU for decoding
ngpu=0
Expand Down
24 changes: 12 additions & 12 deletions egs/libri_trans/st1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -206,29 +206,29 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

echo "make json files"
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang fr \
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set_prefix}.fr.gtranslate/text.${tgt_case} --bpecode ${bpemodel}.model --lang fr \
data/${train_set_prefix}.fr.gtranslate ${dict} > ${feat_tr_dir}/data_gtranslate${bpemode}${nbpe}.${tgt_case}.json
data/${train_set_prefix}.fr.gtranslate ${dict} > ${feat_tr_dir}/data_gtranslate${bpemode}${nbpe}.${src_case}_${tgt_case}.json
data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang fr \
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang fr \
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
done

# update json (add source references)
update_json.sh --text data/"$(echo ${train_set} | cut -f 1 -d ".")".en/text.${src_case} --bpecode ${bpemodel}.model \
${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json data/"$(echo ${train_set} | cut -f 1 -d ".")".en ${dict}
${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json data/"$(echo ${train_set} | cut -f 1 -d ".")".en ${dict}
update_json.sh --text data/"$(echo ${train_set} | cut -f 1 -d ".")".en/text.${src_case} --bpecode ${bpemodel}.model \
${feat_tr_dir}/data_gtranslate${bpemode}${nbpe}.${tgt_case}.json data/"$(echo ${train_set} | cut -f 1 -d ".")".en ${dict}
${feat_tr_dir}/data_gtranslate${bpemode}${nbpe}.${src_case}_${tgt_case}.json data/"$(echo ${train_set} | cut -f 1 -d ".")".en ${dict}
update_json.sh --text data/"$(echo ${train_dev} | cut -f 1 -d ".")".en/text.${src_case} --bpecode ${bpemodel}.model \
${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json data/"$(echo ${train_dev} | cut -f 1 -d ".")".en ${dict}
${feat_dt_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json data/"$(echo ${train_dev} | cut -f 1 -d ".")".en ${dict}

# concatenate Fr and Fr (Google translation) jsons
local/concat_json_multiref.py \
${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
${feat_tr_dir}/data_gtranslate${bpemode}${nbpe}.${tgt_case}.json > ${feat_tr_dir}/data_2ref_${bpemode}${nbpe}.${tgt_case}.json
${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json \
${feat_tr_dir}/data_gtranslate${bpemode}${nbpe}.${src_case}_${tgt_case}.json > ${feat_tr_dir}/data_2ref_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
fi

# NOTE: skip stage 3: LM Preparation
Expand Down Expand Up @@ -271,8 +271,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--seed ${seed} \
--verbose ${verbose} \
--resume ${resume} \
--train-json ${feat_tr_dir}/data_2ref_${bpemode}${nbpe}.${tgt_case}.json \
--valid-json ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--train-json ${feat_tr_dir}/data_2ref_${bpemode}${nbpe}.${src_case}_${tgt_case}.json \
--valid-json ${feat_dt_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json \
--enc-init ${asr_model} \
--dec-init ${mt_model}
fi
Expand Down Expand Up @@ -303,7 +303,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}

# split data
splitjson.py --parts ${nj} ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
splitjson.py --parts ${nj} ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json

#### use CPU for decoding
ngpu=0
Expand Down
20 changes: 10 additions & 10 deletions egs/mboshi_french/st1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -190,20 +190,20 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

echo "make json files"
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --nlsyms ${nlsyms} --lang fr \
data/${train_set} ${dict} > ${feat_tr_dir}/data.${tgt_case}.json
data/${train_set} ${dict} > ${feat_tr_dir}/data.${src_case}_${tgt_case}.json
data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --nlsyms ${nlsyms} --lang fr \
data/${train_dev} ${dict} > ${feat_dt_dir}/data.${tgt_case}.json
data/${train_dev} ${dict} > ${feat_dt_dir}/data.${src_case}_${tgt_case}.json
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --nlsyms ${nlsyms} --lang fr \
data/${ttask} ${dict} > ${feat_trans_dir}/data.${tgt_case}.json
data/${ttask} ${dict} > ${feat_trans_dir}/data.${src_case}_${tgt_case}.json
done

# update json (add source references)
update_json.sh --text data/"$(echo ${train_set} | cut -f -1 -d ".")".mb/text.${src_case} --nlsyms ${nlsyms} --lang mb \
${feat_tr_dir}/data.${tgt_case}.json data/"$(echo ${train_set} | cut -f -1 -d ".")".mb ${dict}
update_json.sh --text data/"$(echo ${train_dev} | cut -f -1 -d ".")".mb/text.${src_case} --nlsyms ${nlsyms} --lang mb \
${feat_dt_dir}/data.${tgt_case}.json data/"$(echo ${train_dev} | cut -f -1 -d ".")".mb ${dict}
update_json.sh --text data/"$(echo ${train_set} | cut -f -1 -d ".")".mb/text.${src_case} --nlsyms ${nlsyms} \
${feat_tr_dir}/data.${src_case}_${tgt_case}.json data/"$(echo ${train_set} | cut -f -1 -d ".")".mb ${dict}
update_json.sh --text data/"$(echo ${train_dev} | cut -f -1 -d ".")".mb/text.${src_case} --nlsyms ${nlsyms} \
${feat_dt_dir}/data.${src_case}_${tgt_case}.json data/"$(echo ${train_dev} | cut -f -1 -d ".")".mb ${dict}
fi

# NOTE: skip stage 3: LM Preparation
Expand Down Expand Up @@ -246,8 +246,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--seed ${seed} \
--verbose ${verbose} \
--resume ${resume} \
--train-json ${feat_tr_dir}/data.${tgt_case}.json \
--valid-json ${feat_dt_dir}/data.${tgt_case}.json \
--train-json ${feat_tr_dir}/data.${src_case}_${tgt_case}.json \
--valid-json ${feat_dt_dir}/data.${src_case}_${tgt_case}.json \
--enc-init ${asr_model} \
--dec-init ${mt_model}
fi
Expand Down Expand Up @@ -278,7 +278,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}

# split data
splitjson.py --parts ${nj} ${feat_trans_dir}/data.${tgt_case}.json
splitjson.py --parts ${nj} ${feat_trans_dir}/data.${src_case}_${tgt_case}.json

#### use CPU for decoding
ngpu=0
Expand Down
36 changes: 18 additions & 18 deletions egs/must_c/st1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -192,35 +192,35 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_1spm/

echo "make a non-linguistic symbol list for all languages"
grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -o -P '&[^;]*;'| sort | uniq > ${nlsyms}
cat ${nlsyms}

echo "make a joint source and target dictionary"
echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
offset=$(wc -l < ${dict})
grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' > data/lang_1spm/input.txt
spm_train --user_defined_symbols="$(tr "\n" "," < ${nlsyms})" --input=data/lang_1spm/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_1spm/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
wc -l ${dict}
# echo "make a non-linguistic symbol list for all languages"
# grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -o -P '&[^;]*;'| sort | uniq > ${nlsyms}
# cat ${nlsyms}
#
# echo "make a joint source and target dictionary"
# echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
# offset=$(wc -l < ${dict})
# grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' > data/lang_1spm/input.txt
# spm_train --user_defined_symbols="$(tr "\n" "," < ${nlsyms})" --input=data/lang_1spm/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
# spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_1spm/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
# wc -l ${dict}

echo "make json files"
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json
done

# update json (add source references)
for x in ${train_set} ${train_dev}; do
feat_dir=${dumpdir}/${x}/delta${do_delta}
data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-${tgt_lang}.en
update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json ${data_dir} ${dict}
${feat_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json ${data_dir} ${dict}
done
fi

Expand Down Expand Up @@ -264,8 +264,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--seed ${seed} \
--verbose ${verbose} \
--resume ${resume} \
--train-json ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--valid-json ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--train-json ${feat_tr_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json \
--valid-json ${feat_dt_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json \
--enc-init ${asr_model} \
--dec-init ${mt_model}
fi
Expand Down Expand Up @@ -296,7 +296,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}

# split data
splitjson.py --parts ${nj} ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
splitjson.py --parts ${nj} ${feat_trans_dir}/data_${bpemode}${nbpe}.${src_case}_${tgt_case}.json

#### use CPU for decoding
ngpu=0
Expand Down

0 comments on commit 246f891

Please sign in to comment.