Merge yizhong's updates
Change-Id: Ie1562ec64085e1d4157ee112f6b3efb7959aad14
liuyuuan committed Nov 15, 2017
2 parents 96bc65b + cf9388e commit 15c9517
Showing 4 changed files with 16 additions and 8 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -19,13 +19,13 @@ For more details about DuReader dataset please refer to [DuReader Homepage](http
### Preprocess the Data
After the dataset is downloaded, some preprocessing is still required before the baseline systems can be run. The DuReader dataset offers a rich set of documents for every user question, and these documents are too long for popular RC models to cope with. In our baseline models, we therefore preprocess the train and development sets by selecting the paragraph that is most related to the answer string; for inference (where no gold answer is available), we select the paragraph that is most related to the question string. This preprocessing strategy is implemented in `utils/preprocess.py`. To preprocess the raw data, run:
```
- cat data/raw/search.train.json | python utils/preprocess.py > data/preprocessed/search.train.json
+ cat data/raw/trainset/search.train.json | python utils/preprocess.py > data/preprocessed/trainset/search.train.json
```
The preprocessing step is already included in `data/download.sh`; the preprocessed data is stored under `data/preprocessed`, and the downloaded raw data is under `data/raw`.
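
As a rough illustration of the selection step, the sketch below scores every paragraph of a document by how much of a reference string it covers and keeps the highest-scoring one, using the answer as the reference for the train/dev sets and the question for inference. The function names and the token-recall measure are assumptions for illustration, not the exact logic of `utils/preprocess.py`:
```
# Illustrative sketch of most-related-paragraph selection; the recall measure
# and the function names are assumptions, not the exact logic of utils/preprocess.py.
from collections import Counter


def token_recall(paragraph_tokens, reference_tokens):
    """Fraction of reference tokens that also occur in the paragraph."""
    if not reference_tokens:
        return 0.0
    para_counts = Counter(paragraph_tokens)
    ref_counts = Counter(reference_tokens)
    overlap = sum(min(para_counts[t], ref_counts[t]) for t in ref_counts)
    return overlap / float(sum(ref_counts.values()))


def select_paragraph(paragraphs, reference_tokens):
    """Return the paragraph (a token list) with the highest recall of the reference."""
    return max(paragraphs, key=lambda p: token_recall(p, reference_tokens))
```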

Once the preprocessed data is ready, you can run `utils/get_vocab.py` to generate the vocabulary file. For example, if you want to train a model with the Baidu Search data:
```
- python utils/get_vocab.py --files data/preprocess/search.train.json data/preprocess/search.dev.json --vocab data/vocab.search
+ python utils/get_vocab.py --files data/preprocessed/trainset/search.train.json data/preprocessed/devset/search.dev.json --vocab data/vocab.search
```
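
Conceptually, the vocabulary file is just a frequency-sorted list of the tokens seen in the preprocessed files. The sketch below captures that idea; the JSON field name, the tab-separated output format, and the 200k cap are assumptions, and the real `utils/get_vocab.py` may differ:
```
# Hypothetical sketch of vocabulary building, not the repository's get_vocab.py;
# the JSON field and the output format are assumptions.
import json
from collections import Counter


def build_vocab(file_names, vocab_path, max_size=200000):
    counter = Counter()
    for file_name in file_names:
        with open(file_name) as f:
            for line in f:
                sample = json.loads(line)
                # Assumes each preprocessed sample carries segmented question tokens.
                counter.update(sample.get('segmented_question', []))
    with open(vocab_path, 'w') as out:
        for token, count in counter.most_common(max_size):
            out.write('{}\t{}\n'.format(token, count))
```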

### Run PaddlePaddle
@@ -95,7 +95,7 @@ To train the reading comprehension model, you can specify the model type by usin
python run.py --task zhidao --algo BIDAF --epochs 10
```

- The training process includes an evaluation on the dev set after each training epoch. By default, the model with the least bleu4 score on the dev set will be saved.
+ The training process includes an evaluation on the dev set after each training epoch. By default, the model with the least Bleu-4 score on the dev set will be saved.
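
In outline, the training driver is a loop over epochs with a dev-set evaluation after each one, keeping the checkpoint whose dev score is preferred. The sketch below is schematic only: `train_epoch`, `evaluate`, and `save` are placeholder callables rather than the baseline's actual API, and keeping the highest dev Bleu-4 checkpoint is an assumed convention; the real criterion is whatever `run.py` implements.
```
# Schematic training driver; train_epoch/evaluate/save are placeholders, and
# keeping the highest dev Bleu-4 checkpoint is an assumed convention.
def train_with_dev_eval(model, train_set, dev_set, epochs, save_dir):
    best_bleu4 = None
    for epoch in range(1, epochs + 1):
        model.train_epoch(train_set)           # one pass over the training data
        metrics = model.evaluate(dev_set)      # e.g. {'Bleu-4': ..., 'Rouge-L': ...}
        if best_bleu4 is None or metrics['Bleu-4'] > best_bleu4:
            best_bleu4 = metrics['Bleu-4']
            model.save(save_dir)               # keep only the preferred checkpoint
    return best_bleu4
```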

#### Evaluation
To conduct a single evaluation on the dev set with the model already trained, you can run the following command:
6 changes: 3 additions & 3 deletions paddle/dataset.py
@@ -38,7 +38,7 @@ class Dataset(object):
Base dataset class for various tasks.
"""
def __init__(self,
- file_name=None,
+ file_names=None,
vocab_file=None,
vocab_size=0,
shuffle=False,
@@ -47,7 +47,7 @@ def __init__(self,
append_raw=False,
is_infer=False,
max_p_len=500):
- self.file_names = file_name
+ self.file_names = file_names
self.data = []
self.raw = []
self.vocab = self.read_vocab(vocab_file, vocab_size) \
@@ -140,7 +140,7 @@ def _reader_preload():

def _reader_stream():
for file_name in self.file_names:
- with open(self.file_name, 'r') as fn:
+ with open(file_name, 'r') as fn:
for line in fn:
data = self.parse(line.strip())
if not data:
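
The change above fixes the streaming reader so that it opens each file in `self.file_names` rather than a stale `self.file_name` attribute. A stand-alone sketch of the intended pattern, with `parse` standing in for the dataset's per-line parser:
```
# Simplified stand-alone version of the multi-file streaming reader; `parse`
# is a placeholder for the Dataset's per-line parsing logic.
def stream_records(file_names, parse):
    """Yield parsed records from each file in turn, skipping unparseable lines."""
    for file_name in file_names:
        with open(file_name, 'r') as fn:
            for line in fn:
                data = parse(line.strip())
                if not data:
                    continue
                yield data
```
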
4 changes: 2 additions & 2 deletions paddle/run.sh
Expand Up @@ -52,8 +52,8 @@ vocab_size=200000
train() {
cp *.py $env_dir/
PYTHONPATH=$PWD:$ROOT CUDA_VISIBLE_DEVICES=0 python $env_dir/run.py \
- --trainset ../data/preprocessed/search.train.json \
- --testset ../data/preprocessed/search.dev.json \
+ --trainset ../data/preprocessed/trainset/search.train.json \
+ --testset ../data/preprocessed/devset/search.dev.json \
--vocab_file ../data/vocab.search \
--emb_dim $emb_dim \
--batch_size 32 \
Expand Down
8 changes: 8 additions & 0 deletions tensorflow/rc_model.py
@@ -51,6 +51,7 @@ def __init__(self, vocab, args):
self.optim_type = args.optim
self.learning_rate = args.learning_rate
self.weight_decay = args.weight_decay
+ self.use_dropout = args.dropout_keep_prob < 1

# length limit
self.max_p_num = args.max_p_num
@@ -125,6 +126,9 @@ def _encode(self):
self.sep_p_encodes, _ = rnn('bi-lstm', self.p_emb, self.p_length, self.hidden_size)
with tf.variable_scope('question_encoding'):
self.sep_q_encodes, _ = rnn('bi-lstm', self.q_emb, self.q_length, self.hidden_size)
+ if self.use_dropout:
+     self.sep_p_encodes = tf.nn.dropout(self.sep_p_encodes, self.dropout_keep_prob)
+     self.sep_q_encodes = tf.nn.dropout(self.sep_q_encodes, self.dropout_keep_prob)

def _match(self):
"""
@@ -138,6 +142,8 @@ def _match(self):
raise NotImplementedError('The algorithm {} is not implemented.'.format(self.algo))
self.match_p_encodes, _ = match_layer.match(self.sep_p_encodes, self.sep_q_encodes,
self.p_length, self.q_length)
+ if self.use_dropout:
+     self.match_p_encodes = tf.nn.dropout(self.match_p_encodes, self.dropout_keep_prob)

def _fuse(self):
"""
@@ -146,6 +152,8 @@ def _fuse(self):
with tf.variable_scope('fusion'):
self.fuse_p_encodes, _ = rnn('bi-lstm', self.match_p_encodes, self.p_length,
self.hidden_size, layer_num=1)
+ if self.use_dropout:
+     self.fuse_p_encodes = tf.nn.dropout(self.fuse_p_encodes, self.dropout_keep_prob)

def _decode(self):
"""
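
The additions to `rc_model.py` all follow the same pattern: when a dropout keep probability below 1 is configured, the output of an encoding stage is wrapped in `tf.nn.dropout`. A minimal stand-alone illustration of that pattern in TensorFlow 1.x style; the input tensor and the keep probability here are placeholders:
```
# Stand-alone illustration of the conditional-dropout pattern added above
# (TensorFlow 1.x API); the input tensor and keep probability are placeholders.
import tensorflow as tf

dropout_keep_prob = 0.8
use_dropout = dropout_keep_prob < 1            # mirrors self.use_dropout

encodings = tf.random_normal([32, 100, 300])   # stand-in for [batch, time, hidden] activations
if use_dropout:
    # Only wrap the tensor when dropout is actually requested, so a keep
    # probability of 1 leaves the graph unchanged.
    encodings = tf.nn.dropout(encodings, keep_prob=dropout_keep_prob)
```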
