Commit a2d0fff

The YesNo type now works correctly.

Note that the evaluation logic for this type has changed.

Change-Id: I302d8ba29ea5c749c8af2b33046bdf184534ebef
liuyuuan committed Nov 3, 2017
1 parent af47ae4 commit a2d0fff
Showing 5 changed files with 308 additions and 37 deletions.
83 changes: 66 additions & 17 deletions paddle/dataset.py
@@ -150,38 +150,85 @@ class DuReaderYesNo(Dataset):
     Implements parser for yesno task.
     """
     def __init__(self, *args, **kwargs):
+        self.labels = {'None': 0, 'Yes': 1, 'No': 2, 'Depends': 3}
         super(DuReaderYesNo, self).__init__(*args, **kwargs)
         self.schema = ['q_ids', 'a_ids', 'label']
         self.feeding = {name: i for i, name in enumerate(self.schema)}
         if self.is_infer:
             assert self.shuffle == False, 'Shuffling is forbidden for inference'
 
-    def parse(self, line):
+    def __get_id(self, s):
+        s_ids = []
+        if not isinstance(s, list):
+            s = s.split(' ')
+        for t in s:
+            s_ids.append(self.vocab.get(t, self.unk_id))
+        return s_ids
+
+    def parse_train(self, line):
         """
-        Parses one line.
+        Parses one line for training.
         Args:
            line: A legal json string.
         Returns:
            A record as self.schema describes.
         """
-
         obj = json.loads(line.strip())
-        label = obj['label']
+        ret = []
+        if obj['query_type'] != 'YES_NO':
+            return ret
+        label_ids = [self.labels[l] for l in obj['yesno_answers']]
         query = [
             self.vocab.get(x, self.unk_id)
             for x in obj['segmented_query']]
-        para = [
-            self.vocab.get(x, self.unk_id)
-            for x in obj['segmented_answer']]
+        paras = map(self.__get_id, obj['segmented_answers'])
 
-        ret = []
-        if not query or not para or label not in set(range(4)):
+        if not query or not paras:
             return ret
-        record = [query, para, label]
-        if self.is_infer:
-            record.append(obj)
-        ret.append(record)
+        for para, lbl in zip(paras, label_ids):
+            ret.append((query, para, lbl))
         return ret
 
+    def parse_infer(self, line):
+        """
+        Parses one line for inferring.
+        Args:
+            line: A legal json string.
+        Returns:
+            A record as self.schema describes.
+        """
+        obj = json.loads(line.strip())
+        ret = []
+        paras = map(self.__get_id, obj['answers'])
+        query = [self.vocab.get(x, self.unk_id) for x in obj['query']]
+        fake_label = 0
+        for idx, para in enumerate(paras):
+            info = copy.deepcopy(obj)
+            info['answer_idx'] = idx
+            info['yesno_answers_ref'] = info['yesno_answers_ref']
+            info['yesno_answers'] = []
+            ret.append((query, para, fake_label, info))
+        return ret
+
+    def parse(self, line):
+        """
+        Parses one line, dispatching to parse_train or parse_infer.
+        Args:
+            line: A legal json string.
+        Returns:
+            A record as self.schema describes.
+        """
+        if self.is_infer:
+            return self.parse_infer(line)
+        return self.parse_train(line)
 
 
 class DuReaderQA(Dataset):
     """
@@ -259,11 +306,13 @@ def __get_label(idx, ref):
     def __get_infer_info(self, obj, paras):
         info = {}
         info['tokens'] = list(itertools.chain(*paras))
-        info['answers'] = obj.get('answers', [])
-        info['query'] = obj['query']
+        info['answers'] = []
+        info['answers_ref'] = obj.get('segmented_answers', [])
+        info['query'] = obj['segmented_query']
         info['query_id'] = obj['query_id']
         info['query_type'] = obj['query_type']
-        info['yesno_answers'] = obj.get('yesno_answers', [])
+        info['yesno_answers_ref'] = obj.get('yesno_answers', [])
+        info['yesno_answers'] = []
         info['entities'] = obj.get('entity_answers', [[]])
         return info
 
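For reference, a sketch of the per-record info dict after this hunk: gold data moves under *_ref keys and the prediction slots start empty (keys follow the diff above; values are illustrative):

# Illustrative shape only; values are made up.
info = {
    'tokens': ['flattened', 'paragraph', 'tokens'],
    'answers': [],                         # predictions are filled in later
    'answers_ref': [['gold', 'answer']],   # from obj['segmented_answers']
    'query': ['segmented', 'query'],       # from obj['segmented_query']
    'query_id': 1,
    'query_type': 'YES_NO',
    'yesno_answers_ref': ['Yes'],          # gold labels kept for evaluation
    'yesno_answers': [],                   # predicted labels go here
    'entities': [[]],
}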
@@ -306,7 +355,7 @@ def parse(self, line):
 if __name__ == '__main__':
     data = sys.argv[1]
     vocab = sys.argv[2]
-    baidu = DuReaderQA(file_name=data,
+    dataset = DuReaderYesNo(file_name=data,
             vocab_file=vocab,
             preload=False,
             max_p_len=300,
@@ -315,7 +364,7 @@ def parse(self, line):
             vocab_size=218967)
 
     # test reader
-    reader = baidu.create_reader()
+    reader = dataset.create_reader()
     for r in reader():
         print r
 
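If the smoke test above is pointed at a yesno training file, each record from reader() should follow self.schema; a hypothetical extension of the smoke test (not part of this commit) could check that:

# Hypothetical check of the reader's record layout against the schema.
for r in reader():
    q_ids, a_ids, label = r[:3]
    assert len(r) >= len(dataset.schema)
    assert label in (0, 1, 2, 3)   # None / Yes / No / Depends
    break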
22 changes: 13 additions & 9 deletions paddle/qa_model.py
@@ -229,14 +229,16 @@ def __parse_infer_ret(self, infer_ret):
                 end_prob_slice)
             pred_tokens = [] if start_idx > end_idx \
                     else ins['tokens'][start_idx:end_idx + 1]
-            pred = normalize([' '.join(pred_tokens)])
-            ref = normalize(ins['answers'])
+
+            pred = [' '.join(pred_tokens)]
+            ref = ins['answers_ref']
+
             idx_len += self.doc_num
             idx_prob += prob_len * 2
             pred_obj = {ins['query_id']: pred}
             ref_obj = {ins['query_id']: ref}
             stored_obj = copy.deepcopy(ins)
-            stored_obj['answers_pred'] = pred
+            stored_obj['answers'] = pred
             objs.append(stored_obj)
             pred_list.append(pred_obj)
             ref_list.append(ref_obj)
@@ -249,8 +251,8 @@ def __read_list(self, infer_file):
         with open(infer_file, 'r') as inf:
             for line in inf:
                 obj = json.loads(line.strip())
-                ref_obj = {obj['query_id']: obj['answers']}
-                pred_obj = {obj['query_id']: obj['answers_pred']}
+                ref_obj = {obj['query_id']: obj['answers_ref']}
+                pred_obj = {obj['query_id']: obj['answers']}
                 ref_list.append(ref_obj)
                 pred_list.append(pred_obj)
         return ref_list, pred_list
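With this renaming, a stored infer-file line carries predictions under 'answers' and gold answers under 'answers_ref'; a sketch of what __read_list consumes (values illustrative, real lines carry more fields):

import json

# Hypothetical stored line from a previous inference run.
line = '{"query_id": 42, "answers": ["predicted answer"], "answers_ref": ["gold answer"]}'
obj = json.loads(line)
ref_obj = {obj['query_id']: obj['answers_ref']}   # {42: ['gold answer']}
pred_obj = {obj['query_id']: obj['answers']}      # {42: ['predicted answer']}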
@@ -291,10 +293,11 @@ def evaluate(self,
             the ret as input for evaluation.
         """
-        def __merge_dict(obj_list):
+        def __merge_and_normalize(obj_list):
             ret = {}
             for obj in obj_list:
-                ret.update(obj)
+                normalized = {k: normalize(v) for k, v in obj.items()}
+                ret.update(normalized)
             return ret
 
         pred_list = []
@@ -308,8 +311,9 @@ def __merge_dict(obj_list):
         with open(infer_file, 'w') as of:
             for o in objs:
                 print >> of, json.dumps(o, ensure_ascii=False).encode('utf8')
-        metrics = compute_bleu_rouge(__merge_dict(pred_list),
-                __merge_dict(ref_list))
+        metrics = compute_bleu_rouge(
+                __merge_and_normalize(pred_list),
+                __merge_and_normalize(ref_list))
         res_str = '{} {}'.format(infer_file,
                 ' '.join('{}={}'.format(k, v) for k, v in metrics.items()))
         logger.info(res_str)
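The behavioral change in evaluate() is that normalization now happens at merge time rather than when predictions are parsed. A standalone sketch of the pattern, assuming normalize maps a list of answer strings to a cleaned list (the real normalize comes from the repo's utilities):

# Standalone sketch; the lambda below stands in for the repo's normalize.
def merge_and_normalize(obj_list, normalize):
    ret = {}
    for obj in obj_list:
        ret.update({k: normalize(v) for k, v in obj.items()})
    return ret

pred_list = [{1: ['  The Answer ']}, {2: ['No.']}]
merged = merge_and_normalize(pred_list, lambda v: [s.strip().lower() for s in v])
# merged == {1: ['the answer'], 2: ['no.']}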
12 changes: 5 additions & 7 deletions paddle/run.py
@@ -20,7 +20,7 @@
 
 from bidaf import BiDAF
 from match_lstm import MatchLstm
-from yesno import TypeCls
+from yesno import OpinionClassifier
 
 from trainer import Trainer
 from inferer import Inferer
@@ -73,9 +73,9 @@ def __prepare(self):
                 emb_dim=self.args.emb_dim)
         elif self.args.algo == Algos.YESNO:
             self.__create_yesno_data()
-            self.model = TypeCls(
+            self.model = OpinionClassifier(
                 Algos.YESNO,
-                train_reader.schema,
+                self.datasets[1].schema,
                 is_infer=self.args.is_infer,
                 vocab_size=self.args.vocab_size,
                 static_emb=(self.args.pre_emb.strip() != ''),
@@ -109,18 +109,16 @@ def __create_yesno_data(self):
         if self.args.is_infer:
             train_reader = None
         else:
-            train_reader = dataset.BaiduYesNo(
+            train_reader = dataset.DuReaderYesNo(
                     file_name=self.args.trainset,
                     vocab_file=self.args.vocab_file,
                     vocab_size=self.args.vocab_size,
-                    keep_raw=False,
                     preload=True,
                     shuffle=True)
-        test_reader = dataset.BaiduYesNo(
+        test_reader = dataset.DuReaderYesNo(
                 file_name=self.args.testset,
                 vocab_file=self.args.vocab_file,
                 vocab_size=self.args.vocab_size,
-                keep_raw=False,
                 is_infer=self.args.is_infer,
                 preload=(not self.args.is_infer),
                 shuffle=False)
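Since the yesno inference input is expected to be the JSON written by the QA model's evaluate() step, the test reader ends up configured roughly like this (paths hypothetical; only flags touched by this commit shown):

# Hypothetical configuration after this change; note keep_raw is gone.
test_reader = dataset.DuReaderYesNo(
    file_name='qa_infer_output.json',   # produced by the QA inference step
    vocab_file='../data/vocab.search',
    vocab_size=218967,
    is_infer=True,
    preload=False,                      # preload only when training
    shuffle=False)                      # shuffling is forbidden for inference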
6 changes: 2 additions & 4 deletions paddle/run.sh
@@ -56,8 +56,6 @@ train() {
 infer() {
     model_name=`basename $2`
     PYTHONPATH=$PWD:$ROOT CUDA_VISIBLE_DEVICES=3 python $env_dir/run.py \
-        --trainset ../data/preprocessed/search.train.json \
-        --testset ../data/preprocessed/search.dev.json \
         --vocab_file ../data/vocab.search \
         --emb_dim $emb_dim \
         --batch_size 32 \
@@ -74,7 +72,7 @@ dir_infer() {
     for f in $( ls -t $dir );
     do
         model_file=$dir/$f
-        infer --model_file $model_file
+        infer --model_file $model_file $@
     done
 }

@@ -83,5 +81,5 @@ echo "rest args: $@"
 if [ $job == "train" ]; then
     train $@
 else
-    dir_infer $model_dir
+    dir_infer $model_dir $@
 fi