diff --git a/paddle/dataset.py b/paddle/dataset.py index 35bd0ac..316e9af 100644 --- a/paddle/dataset.py +++ b/paddle/dataset.py @@ -150,13 +150,24 @@ class DuReaderYesNo(Dataset): Implements parser for yesno task. """ def __init__(self, *args, **kwargs): + self.labels = {'None': 0, 'Yes': 1, 'No': 2, 'Depends': 3} super(DuReaderYesNo, self).__init__(*args, **kwargs) self.schema = ['q_ids', 'a_ids', 'label'] self.feeding = {name: i for i, name in enumerate(self.schema)} + if self.is_infer: + assert self.shuffle == False, 'Shuffling is forbidden for inference' - def parse(self, line): + def __get_id(self, s): + s_ids = [] + if not isinstance(s, list): + s = s.split(' ') + for t in s: + s_ids.append(self.vocab.get(t, self.unk_id)) + return s_ids + + def parse_train(self, line): """ - Parses one line. + Parses one line for training. Args: line: A legal json string. @@ -164,24 +175,60 @@ def parse(self, line): Returns: A record as self.schema describes. """ + obj = json.loads(line.strip()) - label = obj['label'] + ret = [] + if obj['query_type'] != 'YES_NO': + return ret + label_ids = [self.labels[l] for l in obj['yesno_answers']] query = [ self.vocab.get(x, self.unk_id) for x in obj['segmented_query']] - para = [ - self.vocab.get(x, self.unk_id) - for x in obj['segmented_answer']] + paras = map(self.__get_id, obj['segmented_answers']) - ret = [] - if not query or not para or label not in set(range(4)): + if not query or not paras: return ret - record = [query, para, label] - if self.is_infer: - record.append(obj) - ret.append(record) + for para, lbl in zip(paras, label_ids): + ret.append((query, para, lbl)) return ret + def parse_infer(self, line): + """ + Parses one line for inferring. + + Args: + line: A legal json string. + + Returns: + A record as self.schema describes. 
+ """ + obj = json.loads(line.strip()) + ret = [] + paras = map(self.__get_id, obj['answers']) + query = [self.vocab.get(x, self.unk_id) for x in obj['query']] + fake_label = 0 + for idx, para in enumerate(paras): + info = copy.deepcopy(obj) + info['answer_idx'] = idx + info['yesno_answers_ref'] = info['yesno_answers_ref'] + info['yesno_answers'] = [] + ret.append((query, para, fake_label, info)) + return ret + + def parse(self, line): + """ + Parses one line for inferring or training, depending on `self.is_infer`. + + Args: + line: A legal json string. + + Returns: + A record as self.schema describes. + """ + if self.is_infer: + return self.parse_infer(line) + return self.parse_train(line) + class DuReaderQA(Dataset): """ @@ -259,11 +306,13 @@ def __get_label(idx, ref): def __get_infer_info(self, obj, paras): info = {} info['tokens'] = list(itertools.chain(*paras)) - info['answers'] = obj.get('answers', []) - info['query'] = obj['query'] + info['answers'] = [] + info['answers_ref'] = obj.get('segmented_answers', []) + info['query'] = obj['segmented_query'] info['query_id'] = obj['query_id'] info['query_type'] = obj['query_type'] - info['yesno_answers'] = obj.get('yesno_answers', []) + info['yesno_answers_ref'] = obj.get('yesno_answers', []) + info['yesno_answers'] = [] info['entities'] = obj.get('entity_answers', [[]]) return info @@ -306,7 +355,7 @@ def parse(self, line): if __name__ == '__main__': data = sys.argv[1] vocab = sys.argv[2] - baidu = DuReaderQA(file_name=data, + dataset = DuReaderYesNo(file_name=data, vocab_file=vocab, preload=False, max_p_len=300, @@ -315,7 +364,7 @@ def parse(self, line): vocab_size=218967) # test reader - reader = baidu.create_reader() + reader = dataset.create_reader() for r in reader(): print r diff --git a/paddle/qa_model.py b/paddle/qa_model.py index 290a603..1634585 100644 --- a/paddle/qa_model.py +++ b/paddle/qa_model.py @@ -229,14 +229,16 @@ def __parse_infer_ret(self, infer_ret): end_prob_slice) pred_tokens = [] if start_idx > end_idx \ else
ins['tokens'][start_idx:end_idx + 1] - pred = normalize([' '.join(pred_tokens)]) - ref = normalize(ins['answers']) + + pred = [' '.join(pred_tokens)] + ref = ins['answers_ref'] + idx_len += self.doc_num idx_prob += prob_len * 2 pred_obj = {ins['query_id']: pred} ref_obj = {ins['query_id']: ref} stored_obj = copy.deepcopy(ins) - stored_obj['answers_pred'] = pred + stored_obj['answers'] = pred objs.append(stored_obj) pred_list.append(pred_obj) ref_list.append(ref_obj) @@ -249,8 +251,8 @@ def __read_list(self, infer_file): with open(infer_file, 'r') as inf: for line in inf: obj = json.loads(line.strip()) - ref_obj = {obj['query_id']: obj['answers']} - pred_obj = {obj['query_id']: obj['answers_pred']} + ref_obj = {obj['query_id']: obj['answers_ref']} + pred_obj = {obj['query_id']: obj['answers']} ref_list.append(ref_obj) pred_list.append(pred_obj) return ref_list, pred_list @@ -291,10 +293,11 @@ def evaluate(self, the ret as input for evaluation. """ - def __merge_dict(obj_list): + def __merge_and_normalize(obj_list): ret = {} for obj in obj_list: - ret.update(obj) + normalized = {k: normalize(v) for k, v in obj.items()} + ret.update(normalized) return ret pred_list = [] @@ -308,8 +311,9 @@ def __merge_dict(obj_list): with open(infer_file, 'w') as of: for o in objs: print >> of, json.dumps(o, ensure_ascii=False).encode('utf8') - metrics = compute_bleu_rouge(__merge_dict(pred_list), - __merge_dict(ref_list)) + metrics = compute_bleu_rouge( + __merge_and_normalize(pred_list), + __merge_and_normalize(ref_list)) res_str = '{} {}'.format(infer_file, ' '.join('{}={}'.format(k, v) for k, v in metrics.items())) logger.info(res_str) diff --git a/paddle/run.py b/paddle/run.py index 9ee16ab..bcebad6 100644 --- a/paddle/run.py +++ b/paddle/run.py @@ -20,7 +20,7 @@ from bidaf import BiDAF from match_lstm import MatchLstm -from yesno import TypeCls +from yesno import OpinionClassifier from trainer import Trainer from inferer import Inferer @@ -73,9 +73,9 @@ def __prepare(self): 
emb_dim=self.args.emb_dim) elif self.args.algo == Algos.YESNO: self.__create_yesno_data() - self.model = TypeCls( + self.model = OpinionClassifier( Algos.YESNO, - train_reader.schema, + self.datasets[1].schema, is_infer=self.args.is_infer, vocab_size=self.args.vocab_size, static_emb=(self.args.pre_emb.strip() != ''), @@ -109,18 +109,16 @@ def __create_yesno_data(self): if self.args.is_infer: train_reader = None else: - train_reader = dataset.BaiduYesNo( + train_reader = dataset.DuReaderYesNo( file_name=self.args.trainset, vocab_file=self.args.vocab_file, vocab_size=self.args.vocab_size, - keep_raw=False, preload=True, shuffle=True) - test_reader = dataset.BaiduYesNo( + test_reader = dataset.DuReaderYesNo( file_name=self.args.testset, vocab_file=self.args.vocab_file, vocab_size=self.args.vocab_size, - keep_raw=False, is_infer=self.args.is_infer, preload=(not self.args.is_infer), shuffle=False) diff --git a/paddle/run.sh b/paddle/run.sh index d963068..b462a2d 100644 --- a/paddle/run.sh +++ b/paddle/run.sh @@ -56,8 +56,6 @@ train() { infer() { model_name=`basename $2` PYTHONPATH=$PWD:$ROOT CUDA_VISIBLE_DEVICES=3 python $env_dir/run.py \ - --trainset ../data/preprocessed/search.train.json \ - --testset ../data/preprocessed/search.dev.json \ --vocab_file ../data/vocab.search \ --emb_dim $emb_dim \ --batch_size 32 \ @@ -74,7 +72,7 @@ dir_infer() { for f in $( ls -t $dir ); do model_file=$dir/$f - infer --model_file $model_file + infer --model_file $model_file $@ done } @@ -83,5 +81,5 @@ echo "rest args: $@" if [ $job == "train" ]; then train $@ else - dir_infer $model_dir + dir_infer $model_dir $@ fi diff --git a/paddle/yesno.py b/paddle/yesno.py new file mode 100644 index 0000000..611c323 --- /dev/null +++ b/paddle/yesno.py @@ -0,0 +1,222 @@ +# -*- coding:utf-8 -*- +############################################################################### +# +# Copyright (c) 2017 Baidu.com, Inc. 
All Rights Reserved +# +############################################################################### +""" +This module implements an opinion classification model to classify a +query answer pair into 4 categories: None(no opinion), Yes(positive opinion), +No(negative opinion), Depends(depends on conditions). + +Authors: liuyuan(liuyuan04@baidu.com) +Date: 2017/09/20 12:00:00 +""" +import hashlib +import logging +import json +import sys +import paddle.v2.layer as layer +import paddle.v2.attr as Attr +import paddle.v2.activation as Act +import paddle.v2.data_type as data_type +import paddle.v2 as paddle + +from match_lstm import MatchLstm + +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + + +class OpinionClassifier(MatchLstm): + """ + Implements an opinion classifier model, replacing the pointer net of MatchLstm + with a one-layered classifier. Inherits from `MatchLstm`. + """ + def __init__(self, name, inputs, *args, **kwargs): + self.name = name + self.inputs = inputs + self.emb_dim = kwargs['emb_dim'] + self.vocab_size = kwargs['vocab_size'] + self.is_infer = kwargs['is_infer'] + self.label_dim = 4 + self.static_emb = kwargs['static_emb'] + self.labels = ['None', 'Yes', 'No', 'Depends'] + self.label_dict = {v: idx for idx, v in enumerate(self.labels)} + super(OpinionClassifier, self).__init__(name, inputs, *args, **kwargs) + + def check_and_create_data(self): + """ + Checks if the input data is legal and creates the data layers + according to the input fields.
+ """ + if self.is_infer: + expected = ['q_ids', 'a_ids'] + if len(self.inputs) < 2: + raise ValueError('''Input schema: expected vs given: + {} vs {}'''.format(expected, self.inputs)) + else: + expected = ['q_ids', 'a_ids', 'label'] + if len(self.inputs) < 3: + raise ValueError('''Input schema: expected vs given: + {} vs {}'''.format(expected, self.inputs)) + self.label = layer.data(name=self.inputs[2], + type=data_type.integer_value(4)) + + self.q_ids = layer.data( + name=self.inputs[0], + type=data_type.integer_value_sequence(self.vocab_size)) + + self.a_ids = layer.data( + name=self.inputs[1], + type=data_type.integer_value_sequence(self.vocab_size)) + + def network(self): + """ + Implements the detail of the model. + """ + self.check_and_create_data() + self.create_shared_params() + q_enc = self.get_enc(self.q_ids, type='q') + a_enc = self.get_enc(self.a_ids, type='q') + + q_proj_left = layer.fc(size=self.emb_dim * 2, + bias_attr=False, + param_attr=Attr.Param(self.name + '_left.wq'), + input=q_enc) + q_proj_right = layer.fc(size=self.emb_dim * 2, + bias_attr=False, + param_attr=Attr.Param(self.name + '_right.wq'), + input=q_enc) + left_match = self.recurrent_group(self.name + '_left', + [layer.StaticInput(q_enc), + layer.StaticInput(q_proj_left), a_enc], + reverse=False) + right_match = self.recurrent_group(self.name + '_right', + [layer.StaticInput(q_enc), + layer.StaticInput(q_proj_right), a_enc], + reverse=True) + match_seq = layer.concat(input=[left_match, right_match]) + with layer.mixed(size=match_seq.size, + act=Act.Identity(), + layer_attr=Attr.ExtraLayerAttribute(drop_rate=0.2), + bias_attr=False) as dropped: + dropped += layer.identity_projection(match_seq) + match_result = layer.pooling(input=dropped, + pooling_type=paddle.pooling.Max()) + cls = layer.fc(input=match_result, + act=Act.Softmax(), + size=self.label_dim) + return cls + + def train(self): + """ + Trains the model. 
+ """ + cls = self.network() + loss = layer.cross_entropy_cost(input=cls, + label=self.label, + name=self.name + '_cost') + evaluator = paddle.evaluator.precision_recall( + input=cls, name='label0', label=self.label, positive_label=0) + evaluator = paddle.evaluator.precision_recall( + input=cls, name='label1', label=self.label, positive_label=1) + evaluator = paddle.evaluator.precision_recall( + input=cls, name='label2', label=self.label, positive_label=2) + evaluator = paddle.evaluator.precision_recall( + input=cls, name='label3', label=self.label, positive_label=3) + evaluator = paddle.evaluator.precision_recall( + input=cls, name='label_all', label=self.label) + return loss + + def infer(self): + """ + Infers with the trained models. + """ + cls = self.network() + return cls + + def evaluate(self, + infer_file, + ret=None, + from_file=False): + """ + Processes and evaluates the inferred result of one batch. + """ + results, stored_objs = self.__parse_infer_ret(ret) + #print >> sys.stderr, '+++:', len(results), len(stored_objs) + with open(infer_file, 'w') as inf: + for obj in stored_objs: + sorted_ans = sorted(obj['yesno_answers'], key=lambda x: x[0]) + obj['yesno_answers'] = [x[1] for x in sorted_ans] + print >> inf, json.dumps(obj, ensure_ascii=False).encode('utf8') + self.__calc_pr(results) + + def __parse_infer_ret(self, infer_ret): + results = [] + stored_objs = [] + if not infer_ret: + return results, stored_objs + for batch_input, batch_output in infer_ret: + pred_labels = map(int, batch_output[0].argmax(axis=1)) + for ins, pred in zip(batch_input, pred_labels): + obj = ins[-1] + obj['yesno_answers'] = [(obj['answer_idx'], self.labels[pred])] + stored_objs.append(obj) + return results, self.__merge_objs(stored_objs) + + def __getid(self, query): + if isinstance(query, unicode): + query = query.encode('utf8') + m = hashlib.md5() + m.update(query) + return m.hexdigest() + + def __merge_objs(self, obj_list): + merged_objs = [] + last_id = None + + for obj in 
obj_list: + qid = obj['query_id'] + if last_id != qid: + merged_objs.append(obj) + last_id = qid + continue + merged_objs[-1]['yesno_answers'].append(obj['yesno_answers'][0]) + + return merged_objs + + def __calc_pr(self, results): + # {label: [true, pred, real]} + labels = {} + if len(results) > 0: + acc = 1.0 * len([(x, y) for x, y in results if x == y]) / len(results) + else: + acc = 0.0 + for label, pred in results: + labels[label] = labels.get(label, [0, 0, 0]) + labels[label][2] += 1 + if label == pred: + labels[label][0] += 1 + labels[pred] = labels.get(pred, [0, 0, 0]) + labels[pred][1] += 1 + + eval_result = {} + for label, counts in labels.items(): + true, pred, real = counts + recall = 1.0 * true / real if real > 0 else 0.0 + precision = 1.0 * true / pred if pred > 0 else 0.0 + f1 = 2 * recall * precision / (recall + precision) \ + if recall + precision > 0 \ + else 0.0 + eval_result['label_{}_recall'.format(label)] = recall + eval_result['label_{}_precision'.format(label)] = precision + eval_result['label_{}_f1'.format(label)] = f1 + eval_result['accuracy'] = acc + logger.info('eval resulsts: {}'.format( + ' '.join(["{}={}".format(x, y) for x, y in eval_result.items()]))) + + def __call__(self): + if self.is_infer: + return self.infer() + return self.train()