Merge yizhong's updates
Change-Id: Ie1562ec64085e1d4157ee112f6b3efb7959aad14
liuyuuan committed Nov 15, 2017
2 parents 96bc65b + cf9388e commit 15c9517
Showing 4 changed files with 16 additions and 8 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -19,13 +19,13 @@ For more details about DuReader dataset please refer to [DuReader Homepage](http
### Preprocess the Data
After the dataset is downloaded, some preprocessing is still required before the baseline systems can be run. The DuReader dataset offers a rich set of documents for every user question, and these documents are too long for popular RC models to cope with. In our baseline models, we therefore preprocess the train and development sets by selecting the paragraph that is most related to the answer string; for inference (where no gold answer is available), we select the paragraph that is most related to the question string. This preprocessing strategy is implemented in `utils/preprocess.py`. To preprocess the raw data, run:
```
- cat data/raw/search.train.json | python utils/preprocess.py > data/preprocessed/search.train.json
+ cat data/raw/trainset/search.train.json | python utils/preprocess.py > data/preprocessed/trainset/search.train.json
```
The preprocessing step is already included in `data/download.sh`; the preprocessed data is stored under `data/preprocessed`, and the downloaded raw data is under `data/raw`.
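
As a rough illustration of the selection step, the sketch below scores every paragraph of a document by how much of a reference string it covers and keeps the highest-scoring one, using the answer as the reference for the train/dev sets and the question for inference. The function names and the token-recall measure are assumptions for illustration, not the exact logic of `utils/preprocess.py`:
```
# Illustrative sketch of most-related-paragraph selection; the recall measure
# and the function names are assumptions, not the exact logic of utils/preprocess.py.
from collections import Counter


def token_recall(paragraph_tokens, reference_tokens):
    """Fraction of reference tokens that also occur in the paragraph."""
    if not reference_tokens:
        return 0.0
    para_counts = Counter(paragraph_tokens)
    ref_counts = Counter(reference_tokens)
    overlap = sum(min(para_counts[t], ref_counts[t]) for t in ref_counts)
    return overlap / float(sum(ref_counts.values()))


def select_paragraph(paragraphs, reference_tokens):
    """Return the paragraph (a token list) with the highest recall of the reference."""
    return max(paragraphs, key=lambda p: token_recall(p, reference_tokens))
```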

Once the preprocessed data is ready, you can run `utils/get_vocab.py` to generate the vocabulary file. For example, if you want to train a model with the Baidu Search data:
```
- python utils/get_vocab.py --files data/preprocess/search.train.json data/preprocess/search.dev.json --vocab data/vocab.search
+ python utils/get_vocab.py --files data/preprocessed/trainset/search.train.json data/preprocessed/devset/search.dev.json --vocab data/vocab.search
```
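
Conceptually, the vocabulary file is just a frequency-sorted list of the tokens seen in the preprocessed files. The sketch below captures that idea; the JSON field name, the tab-separated output format, and the 200k cap are assumptions, and the real `utils/get_vocab.py` may differ:
```
# Hypothetical sketch of vocabulary building, not the repository's get_vocab.py;
# the JSON field and the output format are assumptions.
import json
from collections import Counter


def build_vocab(file_names, vocab_path, max_size=200000):
    counter = Counter()
    for file_name in file_names:
        with open(file_name) as f:
            for line in f:
                sample = json.loads(line)
                # Assumes each preprocessed sample carries segmented question tokens.
                counter.update(sample.get('segmented_question', []))
    with open(vocab_path, 'w') as out:
        for token, count in counter.most_common(max_size):
            out.write('{}\t{}\n'.format(token, count))
```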

### Run PaddlePaddle
@@ -95,7 +95,7 @@ To train the reading comprehension model, you can specify the model type by usin
python run.py --task zhidao --algo BIDAF --epochs 10
```

- The training process includes an evaluation on the dev set after each training epoch. By default, the model with the least bleu4 score on the dev set will be saved.
+ The training process includes an evaluation on the dev set after each training epoch. By default, the model with the least Bleu-4 score on the dev set will be saved.
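
In outline, the training driver is a loop over epochs with a dev-set evaluation after each one, keeping the checkpoint whose dev score is preferred. The sketch below is schematic only: `train_epoch`, `evaluate`, and `save` are placeholder callables rather than the baseline's actual API, and keeping the highest dev Bleu-4 checkpoint is an assumed convention; the real criterion is whatever `run.py` implements.
```
# Schematic training driver; train_epoch/evaluate/save are placeholders, and
# keeping the highest dev Bleu-4 checkpoint is an assumed convention.
def train_with_dev_eval(model, train_set, dev_set, epochs, save_dir):
    best_bleu4 = None
    for epoch in range(1, epochs + 1):
        model.train_epoch(train_set)           # one pass over the training data
        metrics = model.evaluate(dev_set)      # e.g. {'Bleu-4': ..., 'Rouge-L': ...}
        if best_bleu4 is None or metrics['Bleu-4'] > best_bleu4:
            best_bleu4 = metrics['Bleu-4']
            model.save(save_dir)               # keep only the preferred checkpoint
    return best_bleu4
```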

#### Evaluation
To conduct a single evaluation on the dev set with the model already trained, you can run the following command:
6 changes: 3 additions & 3 deletions paddle/dataset.py
@@ -38,7 +38,7 @@ class Dataset(object):
Base dataset class for various tasks.
"""
def __init__(self,
- file_name=None,
+ file_names=None,
vocab_file=None,
vocab_size=0,
shuffle=False,
@@ -47,7 +47,7 @@ def __init__(self,
append_raw=False,
is_infer=False,
max_p_len=500):
- self.file_names = file_name
+ self.file_names = file_names
self.data = []
self.raw = []
self.vocab = self.read_vocab(vocab_file, vocab_size) \
@@ -140,7 +140,7 @@ def _reader_preload():

def _reader_stream():
for file_name in self.file_names:
- with open(self.file_name, 'r') as fn:
+ with open(file_name, 'r') as fn:
for line in fn:
data = self.parse(line.strip())
if not data:
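
The change above fixes the streaming reader so that it opens each file in `self.file_names` rather than a stale `self.file_name` attribute. A stand-alone sketch of the intended pattern, with `parse` standing in for the dataset's per-line parser:
```
# Simplified stand-alone version of the multi-file streaming reader; `parse`
# is a placeholder for the Dataset's per-line parsing logic.
def stream_records(file_names, parse):
    """Yield parsed records from each file in turn, skipping unparseable lines."""
    for file_name in file_names:
        with open(file_name, 'r') as fn:
            for line in fn:
                data = parse(line.strip())
                if not data:
                    continue
                yield data
```
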
4 changes: 2 additions & 2 deletions paddle/run.sh
Expand Up @@ -52,8 +52,8 @@ vocab_size=200000
train() {
cp *.py $env_dir/
PYTHONPATH=$PWD:$ROOT CUDA_VISIBLE_DEVICES=0 python $env_dir/run.py \
- --trainset ../data/preprocessed/search.train.json \
- --testset ../data/preprocessed/search.dev.json \
+ --trainset ../data/preprocessed/trainset/search.train.json \
+ --testset ../data/preprocessed/devset/search.dev.json \
--vocab_file ../data/vocab.search \
--emb_dim $emb_dim \
--batch_size 32 \
Expand Down
8 changes: 8 additions & 0 deletions tensorflow/rc_model.py
@@ -51,6 +51,7 @@ def __init__(self, vocab, args):
self.optim_type = args.optim
self.learning_rate = args.learning_rate
self.weight_decay = args.weight_decay
+ self.use_dropout = args.dropout_keep_prob < 1

# length limit
self.max_p_num = args.max_p_num
@@ -125,6 +126,9 @@ def _encode(self):
self.sep_p_encodes, _ = rnn('bi-lstm', self.p_emb, self.p_length, self.hidden_size)
with tf.variable_scope('question_encoding'):
self.sep_q_encodes, _ = rnn('bi-lstm', self.q_emb, self.q_length, self.hidden_size)
+ if self.use_dropout:
+     self.sep_p_encodes = tf.nn.dropout(self.sep_p_encodes, self.dropout_keep_prob)
+     self.sep_q_encodes = tf.nn.dropout(self.sep_q_encodes, self.dropout_keep_prob)

def _match(self):
"""
@@ -138,6 +142,8 @@ def _match(self):
raise NotImplementedError('The algorithm {} is not implemented.'.format(self.algo))
self.match_p_encodes, _ = match_layer.match(self.sep_p_encodes, self.sep_q_encodes,
self.p_length, self.q_length)
+ if self.use_dropout:
+     self.match_p_encodes = tf.nn.dropout(self.match_p_encodes, self.dropout_keep_prob)

def _fuse(self):
"""
@@ -146,6 +152,8 @@ def _fuse(self):
with tf.variable_scope('fusion'):
self.fuse_p_encodes, _ = rnn('bi-lstm', self.match_p_encodes, self.p_length,
self.hidden_size, layer_num=1)
+ if self.use_dropout:
+     self.fuse_p_encodes = tf.nn.dropout(self.fuse_p_encodes, self.dropout_keep_prob)

def _decode(self):
"""
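
The additions to `rc_model.py` all follow the same pattern: when a dropout keep probability below 1 is configured, the output of an encoding stage is wrapped in `tf.nn.dropout`. A minimal stand-alone illustration of that pattern in TensorFlow 1.x style; the input tensor and the keep probability here are placeholders:
```
# Stand-alone illustration of the conditional-dropout pattern added above
# (TensorFlow 1.x API); the input tensor and keep probability are placeholders.
import tensorflow as tf

dropout_keep_prob = 0.8
use_dropout = dropout_keep_prob < 1            # mirrors self.use_dropout

encodings = tf.random_normal([32, 100, 300])   # stand-in for [batch, time, hidden] activations
if use_dropout:
    # Only wrap the tensor when dropout is actually requested, so a keep
    # probability of 1 leaves the graph unchanged.
    encodings = tf.nn.dropout(encodings, keep_prob=dropout_keep_prob)
```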
