"""
Simple example that show how to use Avalanche for Question Answering on
Squad by using T5
"""
from avalanche.benchmarks.utils import DataAttribute, ConstantSequence
from avalanche.training.plugins import ReplayPlugin
from transformers import DataCollatorForSeq2Seq
import torch
import avalanche
import torch.nn
from avalanche.benchmarks import CLScenario, CLStream, CLExperience
import avalanche.training.templates.base
from avalanche.benchmarks.utils import AvalancheDataset
from transformers import AutoTokenizer
from transformers import T5ForConditionalGeneration
from datasets import load_dataset
import numpy as np


class HGNaive(avalanche.training.Naive):
    """Only a couple of modifications are needed to use HuggingFace models:
    - we add a bunch of attributes corresponding to the batch items,
      redefining mb_x and mb_y too
    - _unpack_minibatch sends the dictionary values to the GPU device
    - forward and criterion are adapted for sequence-to-sequence tasks.
    """
    @property
    def mb_attention_mask(self):
        return self.mbatch["attention_mask"]

    @property
    def mb_x(self):
        """Current mini-batch input."""
        return self.mbatch["input_ids"]

    @property
    def mb_y(self):
        """Current mini-batch target."""
        return self.mbatch["labels"]

    @property
    def mb_decoder_in_ids(self):
        """Current mini-batch decoder input ids."""
        return self.mbatch["decoder_input_ids"]

    @property
    def mb_token_type_ids(self):
        return self.mbatch["token_type_ids"]
    def _unpack_minibatch(self):
        """HuggingFace minibatches are dictionaries of tensors.
        Move the tensors to the current device."""
        for k in self.mbatch.keys():
            self.mbatch[k] = self.mbatch[k].to(self.device)
    def forward(self):
        out = self.model(
            input_ids=self.mb_x,
            attention_mask=self.mb_attention_mask,
            labels=self.mb_y,
        )
        return out.logits
    def criterion(self):
        # Flatten the logits and labels so the token-level loss can be applied.
        mb_output = self.mb_output.view(-1, self.mb_output.size(-1))
        ll = self._criterion(mb_output, self.mb_y.view(-1))
        return ll


def main():
    # Load the SQuAD dataset splits from HuggingFace
    squad_tr = load_dataset("squad", split="train")
    squad_val = load_dataset("squad", split="validation")

    tokenizer = AutoTokenizer.from_pretrained("t5-small")
    encoder_max_len = tokenizer.model_max_length
    decoder_max_len = 60
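    # The encoder keeps the tokenizer's default maximum length, while 60 tokens
    # is more than enough for the short answer spans found in SQuAD.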
"""
Define a preprocessing function (code from HuggingFace) to:
1. Convert squad dataset to be used in a text 2 text setting
2. Encode the question and context with the tokenizer of T5 model
"""
def t2t_converter(example):
example["input_text"] = f"question: {example['question']}"
+f"context: {example['context']} </s>"
example["target_text"] = f"{example['answers']['text'][0]} </s>"
return example
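    # After conversion, each example carries an "input_text" field
    # ("question: ... context: ... </s>") and a "target_text" field holding the
    # first gold answer.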
    def preprocess_function(
        examples, encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len
    ):
        encoder_inputs = tokenizer(
            examples["input_text"],
            truncation=True,
            return_tensors="pt",
            max_length=encoder_max_len,
            padding="max_length",
        )
        decoder_inputs = tokenizer(
            examples["target_text"],
            truncation=True,
            return_tensors="pt",
            max_length=decoder_max_len,
            padding="max_length",
        )
        input_ids = encoder_inputs["input_ids"]
        input_attention = encoder_inputs["attention_mask"]
        target_ids = decoder_inputs["input_ids"]
        target_attention = decoder_inputs["attention_mask"]
        outputs = {
            "input_ids": input_ids,
            "attention_mask": input_attention,
            "labels": target_ids,
            "decoder_attention_mask": target_attention,
        }
        return outputs
    # Map the preprocessing function to the dataset so that it is applied to
    # all examples
    squad_tr = squad_tr.map(t2t_converter)
    squad_tr = squad_tr.map(preprocess_function, batched=True)
    squad_tr = squad_tr.remove_columns(
        ["id", "title", "context", "question", "answers", "input_text", "target_text"]
    )
    squad_val = squad_val.map(t2t_converter)
    squad_val = squad_val.map(preprocess_function, batched=True)
    squad_val = squad_val.remove_columns(
        ["id", "title", "context", "question", "answers", "input_text", "target_text"]
    )
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    # Use the standard seq2seq data collator to batch the examples
    data_collator = DataCollatorForSeq2Seq(tokenizer)
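    # DataCollatorForSeq2Seq pads "input_ids", "attention_mask", and "labels"
    # to the longest sequence in each minibatch.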
    train_exps = []
    for i in range(2):
        # We use very small experiences only to showcase the library.
        # Adapt this to your own benchmark.
        exp_data = squad_tr.select(range(30 * i, 30 * (i + 1)))

        tl = DataAttribute(ConstantSequence(i, len(exp_data)), "targets_task_labels")

        exp = CLExperience()
        exp.dataset = AvalancheDataset(
            [exp_data], data_attributes=[tl], collate_fn=data_collator
        )
        train_exps.append(exp)
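    # The validation split becomes a single experience with its own task label (2).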
    tl = DataAttribute(ConstantSequence(2, len(squad_val)), "targets_task_labels")

    val_exp = CLExperience()
    val_exp.dataset = AvalancheDataset(
        [squad_val], data_attributes=[tl], collate_fn=data_collator
    )
    val_exps = [val_exp]

    benchmark = CLScenario(
        [
            CLStream("train", train_exps),
            CLStream("valid", val_exps),
            # add more streams here (validation, test, out-of-domain, ...)
        ]
    )
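    # Report the loss per epoch, per experience, and per stream on an
    # interactive logger.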
    eval_plugin = avalanche.training.plugins.EvaluationPlugin(
        avalanche.evaluation.metrics.loss_metrics(
            epoch=True, experience=True, stream=True
        ),
        loggers=[avalanche.logging.InteractiveLogger()],
        strict_checks=False,
    )
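    # A small replay buffer is kept across experiences to mitigate forgetting.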
    plugins = [ReplayPlugin(mem_size=200)]
    # NOTE: the learning rate is only illustrative; tune it for real runs.
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
    strategy = HGNaive(
        model,
        optimizer,
        torch.nn.CrossEntropyLoss(ignore_index=-100),
        evaluator=eval_plugin,
        train_epochs=1,
        train_mb_size=10,
        plugins=plugins,
    )
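    # Continual training loop: train on each experience in order, then
    # evaluate on the whole training stream.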
    for experience in benchmark.train_stream:
        strategy.train(experience, collate_fn=data_collator)
        strategy.eval(benchmark.train_stream)

    # Test the model:
    model.eval()
    question = "Which libraries is Avalanche based upon?"
    context = """
    Avalanche is an End-to-End Continual Learning Library
    based on PyTorch, born within ContinualAI with the goal of providing
    a shared and collaborative open-source (MIT licensed) codebase for fast
    prototyping, training and reproducible evaluation of continual learning
    algorithms.
    """
    # Use the same prompt format seen during fine-tuning.
    input_text = f"question: {question} context: {context} </s>"

    encoded_query = tokenizer(
        input_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=250,
    )
    input_ids = encoded_query["input_ids"]
    attention_mask = encoded_query["attention_mask"]

    # Greedy decoding; top_p and top_k only take effect when do_sample=True.
    generated_answer = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=50,
        top_p=0.95,
        top_k=50,
        repetition_penalty=2.0,
    )
    decoded_answer = tokenizer.batch_decode(generated_answer, skip_special_tokens=True)
    print(f"Answer: {decoded_answer}")


if __name__ == "__main__":
    main()