# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import numpy as np
import paddle
from scipy.special import softmax

from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab

# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--model_file", type=str, required=True, default='./static_graph_params.pdmodel', help="The path to model info in static graph.")
parser.add_argument("--params_file", type=str, required=True, default='./static_graph_params.pdiparams', help="The path to parameters in static graph.")
parser.add_argument('--network', choices=['bow', 'lstm', 'bilstm', 'gru', 'bigru', 'rnn', 'birnn', 'bilstm_attn', 'cnn', 'textcnn'], default="bilstm", help="Select which network to train, defaults to bilstm.")
parser.add_argument("--vocab_path", type=str, default="./vocab.json", help="The file path to save vocabulary.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=2, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# fmt: on
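
# Example invocation (the file paths are the parser defaults above; adjust them
# to wherever the exported static graph and vocabulary actually live):
#     python predict.py --model_file ./static_graph_params.pdmodel \
#         --params_file ./static_graph_params.pdiparams \
#         --vocab_path ./vocab.json --network bilstm --device gpu
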
def preprocess_prediction_data(text, tokenizer):
    """
    Processes the prediction text into the same format used in training.

    Args:
        text (obj:`str`): The input text.
        tokenizer (obj:`paddlenlp.data.JiebaTokenizer`): It uses jieba to segment the Chinese text into words.

    Returns:
        input_id (obj:`list[int]`): The word ids of the `text`.
        seq_len (obj:`int`): The number of words in the `text`.
    """
    input_id = tokenizer.encode(text)
    seq_len = len(input_id)
    return input_id, seq_len
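

# A minimal sketch of what the helper returns, with hypothetical word ids: if
# jieba segments "苹果很好吃" into two vocabulary entries with ids 23 and 445,
# preprocess_prediction_data("苹果很好吃", tokenizer) returns ([23, 445], 2).
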

class Predictor(object):
    def __init__(self, model_file, params_file, device, max_seq_length):
        self.max_seq_length = max_seq_length

        config = paddle.inference.Config(model_file, params_file)
        if device == "gpu":
            # set GPU configs accordingly
            config.enable_use_gpu(100, 0)
        elif device == "cpu":
            # set CPU configs accordingly,
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
        elif device == "xpu":
            # set XPU configs accordingly
            config.enable_xpu(100)
        config.switch_use_feed_fetch_ops(False)
        self.predictor = paddle.inference.create_predictor(config)
        self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()]
        self.output_handle = self.predictor.get_output_handle(self.predictor.get_output_names()[0])
    def predict(self, data, tokenizer, label_map, batch_size=1, network="bilstm"):
        """
        Predicts the data labels.

        Args:
            data (obj:`list[str]`): The raw input texts to classify.
            tokenizer (obj:`paddlenlp.data.JiebaTokenizer`): It uses jieba to segment the Chinese text into words.
            label_map (obj:`dict`): The label id (key) to label str (value) map.
            batch_size (obj:`int`, defaults to 1): The number of samples per batch.
            network (obj:`str`, defaults to bilstm): The network used by the exported model.

        Returns:
            results (obj:`list[str]`): The predicted labels, one per input text.
        """
        examples = []
        for text in data:
            input_id, seq_len = preprocess_prediction_data(text, tokenizer)
            examples.append((input_id, seq_len))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.vocab.token_to_idx.get("[PAD]", 0)),  # input_id
            Stack(),  # seq_len
        ): fn(samples)
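        # A minimal sketch of what batchify_fn produces, assuming hypothetical
        # word ids and a [PAD] id of 0: batchify_fn([([1, 2, 3], 3), ([4, 5], 2)])
        # yields input_ids [[1, 2, 3], [4, 5, 0]] and seq_lens [3, 2].
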
        # Separates data into some batches.
        batches = [examples[idx : idx + batch_size] for idx in range(0, len(examples), batch_size)]

        results = []
        for batch in batches:
            input_ids, seq_lens = batchify_fn(batch)
            self.input_handles[0].copy_from_cpu(input_ids)
            # Only the recurrent networks take the sequence lengths as a second input.
            if network in ["lstm", "bilstm", "gru", "bigru", "rnn", "birnn", "bilstm_attn"]:
                self.input_handles[1].copy_from_cpu(seq_lens)
            self.predictor.run()
            logits = self.output_handle.copy_to_cpu()
            probs = softmax(logits, axis=1)
            idx = np.argmax(probs, axis=1)
            idx = idx.tolist()
            labels = [label_map[i] for i in idx]
            results.extend(labels)
        return results


if __name__ == "__main__":
    # Define predictor to do prediction.
    predictor = Predictor(args.model_file, args.params_file, args.device, args.max_seq_length)

    # First pre-process the prediction data, then run prediction.
    data = [
        # "Very nice: the service is great, it is in the city center with convenient
        # transport, but the price is high too!"
        "非常不错,服务很好,位于市中心区,交通方便,不过价格也高!",
        # "Started watching with great excitement, only to find that after the film
        # finished, an episode of a Mickey Mouse cartoon came on."
        "怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片",
        # "For an old four-star hotel, the rooms are still very tidy, quite good. The
        # airport pickup service is great; you can check in on the car and save time."
        "作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。",
    ]
    vocab = Vocab.from_json(args.vocab_path)
    tokenizer = JiebaTokenizer(vocab)
    label_map = {0: "negative", 1: "positive"}

    results = predictor.predict(data, tokenizer, label_map, batch_size=args.batch_size, network=args.network)
    for idx, text in enumerate(data):
        print("Data: {} \t Label: {}".format(text, results[idx]))