support multi gpu in pytorch lm
ShigekiKarita committed Aug 8, 2019
1 parent 36bdb9a commit bdc13d2
Showing 3 changed files with 15 additions and 21 deletions.
4 changes: 0 additions & 4 deletions egs/an4/asr1/run.sh
@@ -175,10 +175,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         | cut -f 2- -d" " > ${lmdatadir}/test.txt
     fi
 
-    # use only 1 gpu
-    if [ ${ngpu} -gt 1 ]; then
-        echo "LM training does not support multi-gpu. signle gpu will be used."
-    fi
     ${cuda_cmd} --gpu ${ngpu} ${lmexpdir}/train.log \
         lm_train.py \
         --config ${lm_config} \
15 changes: 7 additions & 8 deletions egs/tedlium3/asr1/run.sh
@@ -161,15 +161,14 @@ mkdir -p ${lmexpdir}
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     echo "stage 3: LM Preparation"
     lmdatadir=data/local/lm_train_${bpemode}${nbpe}
-    [ ! -e ${lmdatadir} ] && mkdir -p ${lmdatadir}
-    gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | local/join_suffix.py |\
-        spm_encode --model=${bpemodel}.model --output_format=piece > ${lmdatadir}/train.txt
-    cut -f 2- -d" " data/${train_dev}/text | spm_encode --model=${bpemodel}.model --output_format=piece \
-        > ${lmdatadir}/valid.txt
-    # use only 1 gpu
-    if [ ${ngpu} -gt 1 ]; then
-        echo "LM training does not support multi-gpu. signle gpu will be used."
+    if [ ! -e ${lmdatadir} ]; then
+        mkdir -p ${lmdatadir}
+        gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | local/join_suffix.py |\
+            spm_encode --model=${bpemodel}.model --output_format=piece > ${lmdatadir}/train.txt
+        cut -f 2- -d" " data/${train_dev}/text | spm_encode --model=${bpemodel}.model --output_format=piece \
+            > ${lmdatadir}/valid.txt
     fi
+
     ${cuda_cmd} --gpu ${ngpu} ${lmexpdir}/train.log \
         lm_train.py \
         --config ${lm_config} \
17 changes: 8 additions & 9 deletions espnet/lm/pytorch_backend/lm.py
@@ -269,10 +269,10 @@ def update_core(self):
             loss += loss_batch * non_zeros
             count += int(non_zeros)
 
+        loss = loss.mean()
         reporter.report({'loss': float(loss.detach())}, optimizer.target)
         reporter.report({'count': count}, optimizer.target)
         # update
-        loss = loss / batch_size  # normalized by batch size
         self.model.zero_grad()  # Clear the parameter gradients
         loss.backward()  # Backprop
         if self.gradclip is not None:
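Under torch.nn.DataParallel the losses computed on each replica are gathered along dim 0, so the wrapped model returns a 1-D tensor with one entry per GPU rather than a scalar. A minimal sketch of why the mean() reduction has to happen before the report and the backward pass; the tensor values here are made up:

    import torch

    # Stand-in for the gathered DataParallel output: one loss value per GPU.
    per_gpu_loss = torch.tensor([1.25, 1.75], requires_grad=True)

    loss = per_gpu_loss.mean()   # reduce to a scalar, as update_core() now does
    print(float(loss.detach()))  # 1.5; float() raises on a multi-element tensor
    loss.backward()              # backward() without arguments needs a scalar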
@@ -313,7 +313,7 @@ def evaluate(self):
         # report validation loss
         observation = {}
         with reporter.report_scope(observation):
-            reporter.report({'loss': float(loss / count)}, self.model.reporter)
+            reporter.report({'loss': float(loss.mean() / count)}, self.model.reporter)
         return observation
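The same reduction appears in evaluate(): the accumulated loss keeps one entry per GPU, so it is averaged across devices before being normalized by the token count. A hedged sketch of the reported quantity, with invented numbers, assuming loss holds per-GPU accumulated token losses; the per-token negative log-likelihood is what exponentiates to the perplexity the recipes log:

    import math

    import torch

    loss = torch.tensor([210.4, 198.2])  # hypothetical per-GPU accumulated loss
    count = 96                           # tokens scored in this validation pass

    per_token_nll = float(loss.mean() / count)
    print('perplexity: %.2f' % math.exp(per_token_nll))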


@@ -350,21 +350,20 @@ def train(args):
 
     use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
     # Create the dataset iterators
-    train_iter = ParallelSentenceIterator(train, args.batchsize,
+    batch_size = args.batchsize * max(args.ngpu, 1)
+    train_iter = ParallelSentenceIterator(train, batch_size,
                                           max_length=args.maxlen, sos=eos, eos=eos, shuffle=not use_sortagrad)
-    val_iter = ParallelSentenceIterator(val, args.batchsize,
+    val_iter = ParallelSentenceIterator(val, batch_size,
                                         max_length=args.maxlen, sos=eos, eos=eos, repeat=False)
     logging.info('#iterations per epoch = ' + str(len(train_iter.batch_indices)))
     logging.info('#total iterations = ' + str(args.epoch * len(train_iter.batch_indices)))
     # Prepare an RNNLM model
     rnn = RNNLM(args.n_vocab, args.layer, args.unit, args.type, args.dropout_rate)
     model = ClassifierWithState(rnn)
-    if args.ngpu > 1:
-        logging.warning("currently, multi-gpu is not supported. use single gpu.")
     if args.ngpu > 0:
-        # Make the specified GPU current
+        model = torch.nn.DataParallel(model).cuda()
+        setattr(model, "reporter", model.module.reporter)
         gpu_id = 0
-        model.cuda(gpu_id)
     else:
         gpu_id = -1

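Two DataParallel details drive the train() changes: the wrapper splits each minibatch along dim 0 across the visible devices, which is why the iterator batch size is scaled by ngpu so that every GPU still sees args.batchsize sentences, and it does not proxy custom attributes of the wrapped module, which is why reporter has to be re-attached to the wrapper by hand. A self-contained sketch; ToyLM and all sizes are invented for illustration:

    import torch


    class ToyLM(torch.nn.Module):
        """Hypothetical stand-in for ClassifierWithState(RNNLM(...))."""

        def __init__(self):
            super(ToyLM, self).__init__()
            self.embed = torch.nn.Embedding(100, 16)
            self.reporter = "reporter object"  # custom attribute, as in espnet

        def forward(self, x):
            return self.embed(x).sum()


    batchsize, ngpu = 64, 2
    batch_size = batchsize * max(ngpu, 1)  # 128 sentences, i.e. 64 per device

    model = ToyLM()
    if ngpu > 0 and torch.cuda.is_available():
        model = torch.nn.DataParallel(model).cuda()
        # nn.DataParallel only exposes Module machinery; plain attributes of
        # the wrapped model now live on model.module, hence the re-attachment.
        setattr(model, "reporter", model.module.reporter)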
@@ -424,7 +423,7 @@ def train(args):
     logging.info('#sentences in the test data = ' + str(len(test)))
     logging.info('#tokens in the test data = ' + str(n_test_tokens))
     logging.info('oov rate in the test data = %.2f %%' % (n_test_oovs / n_test_tokens * 100))
-    test_iter = ParallelSentenceIterator(test, args.batchsize,
+    test_iter = ParallelSentenceIterator(test, batch_size,
                                          max_length=args.maxlen, sos=eos, eos=eos, repeat=False)
     evaluator = LMEvaluator(test_iter, model, reporter, device=gpu_id)
     result = evaluator()
