Skip to content

Commit

Permalink
Revert "Xlnet outputs (huggingface#5881)" (huggingface#5882)
Browse files Browse the repository at this point in the history
This reverts commit 13be487.
  • Loading branch information
TevenLeScao authored Jul 18, 2020
1 parent 13be487 commit a558092
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 133 deletions.
2 changes: 1 addition & 1 deletion src/transformers/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class PretrainedConfig(object):
Whether or not the model should return all hidden-states.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all attentions.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
use_cache (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return the last key/values attentions (not used by all models).
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return tuples instead of :obj:`ModelOutput` objects.
Expand Down
2 changes: 0 additions & 2 deletions src/transformers/configuration_xlnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,6 @@ class XLNetConfig(PretrainedConfig):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Differs slightly from other models as it is always turned on at training time.
Example::
Expand Down
54 changes: 19 additions & 35 deletions src/transformers/modeling_xlnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ class XLNetModelOutput(ModelOutput):
``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
``num_predict`` corresponds to ``sequence_length``.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Expand Down Expand Up @@ -611,7 +611,7 @@ class XLNetLMHeadModelOutput(ModelOutput):
``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
``num_predict`` corresponds to ``sequence_length``.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Expand Down Expand Up @@ -645,7 +645,7 @@ class XLNetForSequenceClassificationOutput(ModelOutput):
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Expand Down Expand Up @@ -679,7 +679,7 @@ class XLNetForTokenClassificationOutput(ModelOutput):
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Expand Down Expand Up @@ -715,7 +715,7 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
Classification scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Expand Down Expand Up @@ -751,7 +751,7 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Expand Down Expand Up @@ -794,7 +794,7 @@ class XLNetForQuestionAnsweringOutput(ModelOutput):
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the ``is_impossible`` label of the answers.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Expand Down Expand Up @@ -850,7 +850,7 @@ class XLNetForQuestionAnsweringOutput(ModelOutput):
`What are attention masks? <../glossary.html#attention-mask>`__
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states as computed by the model
Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
given to this model should not be passed as input ids as they have already been computed.
`use_cache` has to be set to `True` to make use of `mems`.
Expand Down Expand Up @@ -964,19 +964,10 @@ def cache_mem(self, curr_out, prev_mem):
if self.reuse_len is not None and self.reuse_len > 0:
curr_out = curr_out[: self.reuse_len]

if self.mem_len is None or self.mem_len == 0:
# If `use_cache` is active but no `mem_len` is defined, the model behaves like GPT-2 at inference time
# and returns all of the past and current hidden states.
cutoff = 0
else:
# If `use_cache` is active and `mem_len` is defined, the model returns the last `mem_len` hidden
# states. This is the preferred setting for training and long-form generation.
cutoff = -self.mem_len
if prev_mem is None:
# if `use_cache` is active and `mem_len` is defined, the model
new_mem = curr_out[cutoff:]
new_mem = curr_out[-self.mem_len :]
else:
new_mem = torch.cat([prev_mem, curr_out], dim=0)[cutoff:]
new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len :]

return new_mem.detach()

Expand Down Expand Up @@ -1048,7 +1039,7 @@ def forward(
input_mask=None,
head_mask=None,
inputs_embeds=None,
use_cache=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
Expand All @@ -1058,7 +1049,6 @@ def forward(
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)

# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension
Expand Down Expand Up @@ -1189,7 +1179,7 @@ def forward(
attentions = [] if output_attentions else None
hidden_states = [] if output_hidden_states else None
for i, layer_module in enumerate(self.layer):
if use_cache:
if self.mem_len is not None and self.mem_len > 0 and use_cache is True:
# cache new mems
new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
if output_hidden_states:
Expand Down Expand Up @@ -1221,7 +1211,7 @@ def forward(
output = output.permute(1, 0, 2).contiguous()

# TODO Teven: fix this test to only use use_cache.
if not use_cache:
if not (self.mem_len is not None and self.mem_len > 0 and use_cache is True):
new_mems = None

if output_hidden_states:
Expand Down Expand Up @@ -1322,7 +1312,7 @@ def forward(
head_mask=None,
inputs_embeds=None,
labels=None,
use_cache=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
Expand Down Expand Up @@ -1370,7 +1360,6 @@ def forward(
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)

transformer_outputs = self.transformer(
input_ids,
Expand Down Expand Up @@ -1444,7 +1433,7 @@ def forward(
head_mask=None,
inputs_embeds=None,
labels=None,
use_cache=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
Expand All @@ -1457,7 +1446,6 @@ def forward(
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)

transformer_outputs = self.transformer(
input_ids,
Expand Down Expand Up @@ -1536,7 +1524,7 @@ def forward(
head_mask=None,
inputs_embeds=None,
labels=None,
use_cache=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
Expand All @@ -1548,7 +1536,6 @@ def forward(
of the input tensors. (see `input_ids` above)
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)

outputs = self.transformer(
input_ids,
Expand Down Expand Up @@ -1631,7 +1618,7 @@ def forward(
head_mask=None,
inputs_embeds=None,
labels=None,
use_cache=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
Expand All @@ -1643,7 +1630,6 @@ def forward(
of the input tensors. (see `input_ids` above)
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
Expand Down Expand Up @@ -1731,7 +1717,7 @@ def forward(
inputs_embeds=None,
start_positions=None,
end_positions=None,
use_cache=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
Expand All @@ -1747,7 +1733,6 @@ def forward(
Positions outside of the sequence are not taken into account for computing the loss.
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)

outputs = self.transformer(
input_ids,
Expand Down Expand Up @@ -1839,7 +1824,7 @@ def forward(
is_impossible=None,
cls_index=None,
p_mask=None,
use_cache=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
Expand Down Expand Up @@ -1879,7 +1864,6 @@ def forward(
>>> loss = outputs[0]
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)

transformer_outputs = self.transformer(
input_ids,
Expand Down
Loading

0 comments on commit a558092

Please sign in to comment.