Forbid PretrainedConfig from saving generate parameters; Update deprecations in `generate`-related code 🧹 (huggingface#32659)

Co-authored-by: amyeroberts <[email protected]>
gante and amyeroberts authored Aug 23, 2024
1 parent 22e6f14 commit 970a16e
Showing 53 changed files with 195 additions and 670 deletions.
3 changes: 0 additions & 3 deletions docs/source/en/internal/generation_utils.md
@@ -140,9 +140,6 @@ generation.
[[autodoc]] ForcedEOSTokenLogitsProcessor
- __call__

[[autodoc]] ForceTokensLogitsProcessor
- __call__

[[autodoc]] HammingDiversityLogitsProcessor
- __call__

3 changes: 0 additions & 3 deletions docs/source/ja/internal/generation_utils.md
@@ -139,9 +139,6 @@ generation_output[:2]
[[autodoc]] ForcedEOSTokenLogitsProcessor
- __call__

[[autodoc]] ForceTokensLogitsProcessor
- __call__

[[autodoc]] HammingDiversityLogitsProcessor
- __call__

3 changes: 0 additions & 3 deletions docs/source/zh/internal/generation_utils.md
@@ -133,9 +133,6 @@ generation_output[:2]
[[autodoc]] ForcedEOSTokenLogitsProcessor
- __call__

[[autodoc]] ForceTokensLogitsProcessor
- __call__

[[autodoc]] HammingDiversityLogitsProcessor
- __call__

2 changes: 0 additions & 2 deletions src/transformers/__init__.py
@@ -1276,7 +1276,6 @@
"ExponentialDecayLengthPenalty",
"ForcedBOSTokenLogitsProcessor",
"ForcedEOSTokenLogitsProcessor",
"ForceTokensLogitsProcessor",
"GenerationMixin",
"HammingDiversityLogitsProcessor",
"InfNanRemoveLogitsProcessor",
@@ -6059,7 +6058,6 @@
ExponentialDecayLengthPenalty,
ForcedBOSTokenLogitsProcessor,
ForcedEOSTokenLogitsProcessor,
ForceTokensLogitsProcessor,
GenerationMixin,
HammingDiversityLogitsProcessor,
InfNanRemoveLogitsProcessor,
303 changes: 4 additions & 299 deletions src/transformers/commands/pt_to_tf.py

Large diffs are not rendered by default.

146 changes: 58 additions & 88 deletions src/transformers/configuration_utils.py
@@ -81,6 +81,15 @@ class PretrainedConfig(PushToHubMixin):
model.
- **num_hidden_layers** (`int`) -- The number of blocks in the model.
<Tip warning={true}>
Setting parameters for sequence generation in the model config is deprecated. For backward compatibility, loading
some of them will still be possible, but attempting to overwrite them will throw an exception -- you should set
them in a [`~transformers.GenerationConfig`]. Check the documentation of [`~transformers.GenerationConfig`] for more
information about the individual parameters.
</Tip>
Arg:
name_or_path (`str`, *optional*, defaults to `""`):
Store the string that was passed to [`PreTrainedModel.from_pretrained`] or
@@ -117,77 +126,6 @@ class PretrainedConfig(PushToHubMixin):
sequence_length embeddings at a time. For more information on feed forward chunking, see [How does Feed
Forward Chunking work?](../glossary.html#feed-forward-chunking).
> Parameters for sequence generation
max_length (`int`, *optional*, defaults to 20):
Maximum length that will be used by default in the `generate` method of the model.
min_length (`int`, *optional*, defaults to 0):
Minimum length that will be used by default in the `generate` method of the model.
do_sample (`bool`, *optional*, defaults to `False`):
Flag that will be used by default in the `generate` method of the model. Whether or not to use sampling ;
use greedy decoding otherwise.
early_stopping (`bool`, *optional*, defaults to `False`):
Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
when at least `num_beams` sentences are finished per batch or not.
num_beams (`int`, *optional*, defaults to 1):
Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means
no beam search.
num_beam_groups (`int`, *optional*, defaults to 1):
Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams
that will be used by default in the `generate` method of the model. 1 means no group beam search.
diversity_penalty (`float`, *optional*, defaults to 0.0):
Value to control diversity for group beam search. that will be used by default in the `generate` method of
the model. 0 means no diversity penalty. The higher the penalty, the more diverse are the outputs.
temperature (`float`, *optional*, defaults to 1.0):
The value used to module the next token probabilities that will be used by default in the `generate` method
of the model. Must be strictly positive.
top_k (`int`, *optional*, defaults to 50):
Number of highest probability vocabulary tokens to keep for top-k-filtering that will be used by default in
the `generate` method of the model.
top_p (`float`, *optional*, defaults to 1):
Value that will be used by default in the `generate` method of the model for `top_p`. If set to float < 1,
only the most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.
typical_p (`float`, *optional*, defaults to 1):
Local typicality measures how similar the conditional probability of predicting a target token next is to
the expected conditional probability of predicting a random token next, given the partial text already
generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that
add up to `typical_p` or higher are kept for generation. See [this
paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.
repetition_penalty (`float`, *optional*, defaults to 1):
Parameter for repetition penalty that will be used by default in the `generate` method of the model. 1.0
means no penalty.
length_penalty (`float`, *optional*, defaults to 1):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
`length_penalty` < 0.0 encourages shorter sequences.
no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the
`generate` method of the model for `no_repeat_ngram_size`. If set to int > 0, all ngrams of that size can
only occur once.
encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by
default in the `generate` method of the model for `encoder_no_repeat_ngram_size`. If set to int > 0, all
ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.
bad_words_ids (`List[int]`, *optional*):
List of token ids that are not allowed to be generated that will be used by default in the `generate`
method of the model. In order to get the tokens of the words that should not appear in the generated text,
use `tokenizer.encode(bad_word, add_prefix_space=True)`.
num_return_sequences (`int`, *optional*, defaults to 1):
Number of independently computed returned sequences for each element in the batch that will be used by
default in the `generate` method of the model.
output_scores (`bool`, *optional*, defaults to `False`):
Whether the model should return the logits when used for generation.
return_dict_in_generate (`bool`, *optional*, defaults to `False`):
Whether the model should return a [`~transformers.utils.ModelOutput`] instead of a `torch.LongTensor`.
forced_bos_token_id (`int`, *optional*):
The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
language token.
forced_eos_token_id (`int`, *optional*):
The id of the token to force as the last generated token when `max_length` is reached.
remove_invalid_values (`bool`, *optional*):
Whether to remove possible _nan_ and _inf_ outputs of the model to prevent the generation method to crash.
Note that using `remove_invalid_values` can slow down generation.
> Parameters for fine-tuning tasks
architectures (`List[str]`, *optional*):
@@ -287,7 +225,7 @@ def __init__(self, **kwargs):

# Retrocompatibility: Parameters for sequence generation. While we will keep the ability to load these
# parameters, saving them will be deprecated. In a distant future, we won't need to load them.
for parameter_name, default_value in self._get_generation_defaults().items():
for parameter_name, default_value in self._get_global_generation_defaults().items():
setattr(self, parameter_name, kwargs.pop(parameter_name, default_value))

# Fine-tuning task arguments
@@ -440,16 +378,13 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub:
if os.path.isfile(save_directory):
raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

non_default_generation_parameters = {}
for parameter_name, default_value in self._get_generation_defaults().items():
if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value:
non_default_generation_parameters[parameter_name] = getattr(self, parameter_name)
non_default_generation_parameters = self._get_non_default_generation_parameters()
if len(non_default_generation_parameters) > 0:
logger.warning(
"Some non-default generation parameters are set in the model config. These should go into a "
"GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
"instead. This warning will be raised to an exception in v4.41.\n"
f"Non-default generation parameters: {str(non_default_generation_parameters)}"
raise ValueError(
"Some non-default generation parameters are set in the model config. These should go into either a) "
"`model.generation_config` (as opposed to `model.config`); OR b) a GenerationConfig file "
"(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
f"\nNon-default generation parameters: {str(non_default_generation_parameters)}"
)

os.makedirs(save_directory, exist_ok=True)
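
For context, a minimal sketch (not part of the diff) of how callers are expected to adapt after this change: decoding parameters belong on `model.generation_config` rather than `model.config`. The checkpoint name and parameter values below are illustrative only.

# Illustrative only: any generative checkpoint works the same way.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")

# Discouraged: writing decoding parameters into the model config now raises a
# ValueError at save time if the values differ from the defaults.
#   model.config.temperature = 0.7
#   model.save_pretrained("my-model")

# Recommended: store decoding parameters on the generation config instead.
model.generation_config.do_sample = True
model.generation_config.temperature = 0.7
model.save_pretrained("my-model")  # writes generation_config.json alongside config.json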
@@ -1049,7 +984,7 @@ def register_for_auto_class(cls, auto_class="AutoConfig"):
cls._auto_class = auto_class

@staticmethod
def _get_generation_defaults() -> Dict[str, Any]:
def _get_global_generation_defaults() -> Dict[str, Any]:
return {
"max_length": 20,
"min_length": 0,
@@ -1078,14 +1013,49 @@ def _get_generation_defaults() -> Dict[str, Any]:
"begin_suppress_tokens": None,
}

def _has_non_default_generation_parameters(self) -> bool:
def _get_non_default_generation_parameters(self) -> Dict[str, Any]:
"""
Whether or not this instance holds non-default generation parameters.
Gets the non-default generation parameters on the PretrainedConfig instance
"""
for parameter_name, default_value in self._get_generation_defaults().items():
if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value:
return True
return False
non_default_generation_parameters = {}
decoder_attribute_name = None
default_config = None

# Composite models don't have a default config, use their decoder config as a fallback for default values
# If no known pattern is matched, then `default_config = None` -> check against the global generation defaults
try:
default_config = self.__class__()
except ValueError:
for decoder_attribute_name in ("decoder", "generator", "text_config"):
if hasattr(self, decoder_attribute_name):
default_config = getattr(self, decoder_attribute_name).__class__()
break

# If it is a composite model, we want to check the subconfig that will be used for generation
self_decoder_config = self if decoder_attribute_name is None else getattr(self, decoder_attribute_name)

for parameter_name, default_global_value in self._get_global_generation_defaults().items():
if hasattr(self_decoder_config, parameter_name):
is_default_in_config = is_default_generation_value = None
parameter_value = getattr(self_decoder_config, parameter_name)
# Three cases in which it is okay for the model config to hold generation config parameters:
# 1. The parameter is set to `None`, effectively delegating its value to the generation config
if parameter_value is None:
continue
# 2. If we have a default config, then the instance should hold the same generation defaults
if default_config is not None:
is_default_in_config = parameter_value == getattr(default_config, parameter_name)
# 3. If we don't have a default config, then the instance should hold the global generation defaults
else:
is_default_generation_value = parameter_value == default_global_value

is_non_default = (is_default_in_config is False) or (
is_default_in_config is None and is_default_generation_value is False
)
if is_non_default:
non_default_generation_parameters[parameter_name] = getattr(self_decoder_config, parameter_name)

return non_default_generation_parameters
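
A rough sketch of what the new helper is intended to return, assuming the retrocompatibility path in `__init__` above still accepts legacy generation kwargs; the config class and values are illustrative only.

from transformers import GPT2Config

config = GPT2Config(temperature=0.5, top_k=50)
# top_k=50 equals the global generation default, while temperature=0.5 does not,
# so only temperature should be reported as a non-default generation parameter.
print(config._get_non_default_generation_parameters())
# expected: {"temperature": 0.5}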


def get_configuration_file(configuration_files: List[str]) -> str:
2 changes: 0 additions & 2 deletions src/transformers/generation/__init__.py
@@ -55,7 +55,6 @@
"ExponentialDecayLengthPenalty",
"ForcedBOSTokenLogitsProcessor",
"ForcedEOSTokenLogitsProcessor",
"ForceTokensLogitsProcessor",
"HammingDiversityLogitsProcessor",
"InfNanRemoveLogitsProcessor",
"LogitNormalization",
@@ -201,7 +200,6 @@
ExponentialDecayLengthPenalty,
ForcedBOSTokenLogitsProcessor,
ForcedEOSTokenLogitsProcessor,
ForceTokensLogitsProcessor,
HammingDiversityLogitsProcessor,
InfNanRemoveLogitsProcessor,
LogitNormalization,
29 changes: 0 additions & 29 deletions src/transformers/generation/logits_process.py
@@ -15,7 +15,6 @@

import inspect
import math
import warnings
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np
@@ -1844,34 +1843,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
return scores


class ForceTokensLogitsProcessor(LogitsProcessor):
r"""
This processor takes a list of pairs of integers which indicates a mapping from generation indices to token
indices that will be forced before generation. The processor will set their log probs to `inf` so that they are
sampled at their corresponding index. Originally created for
[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
"""

def __init__(self, force_token_map: List[List[int]], _has_warned: Optional[bool] = False):
self.force_token_map = dict(force_token_map)
if not _has_warned:
# TODO(Sanchit): remove this processor entirely in v4.40
warnings.warn(
"This `ForceTokensLogitsProcessor` has been deprecated and will be removed in v4.40. Should you need to provide prompt ids for generation, specify `input_ids` to the generate method for decoder-only models, or `decoder_input_ids` for encoder-decoder models.",
FutureWarning,
)

@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
generation_idx = input_ids.shape[-1]
current_token = self.force_token_map.get(generation_idx, None)
scores_processed = scores
if current_token is not None:
scores_processed = torch.full_like(scores, -float("inf"))
scores_processed[:, current_token] = 0
return scores_processed
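
As the removed class's own deprecation message suggests, the replacement is to pass the forced prefix directly to `generate` (`decoder_input_ids` for encoder-decoder models, `input_ids` for decoder-only models). A hedged sketch; the checkpoint and prompt below are placeholders, not taken from this commit.

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # any encoder-decoder checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt")

# Seed the decoder with the tokens that would previously have been forced via
# ForceTokensLogitsProcessor; extend the tensor with any additional forced ids.
decoder_prompt = torch.tensor([[model.config.decoder_start_token_id]])
outputs = model.generate(**inputs, decoder_input_ids=decoder_prompt, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))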


class WhisperTimeStampLogitsProcessor(LogitsProcessor):
r"""
32 changes: 0 additions & 32 deletions src/transformers/generation/stopping_criteria.py
@@ -85,36 +85,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
return torch.full((input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool)


class MaxNewTokensCriteria(StoppingCriteria):
"""
This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`. Keep in
mind for decoder-only type of transformers, this will **not** include the initial prompted tokens. This is very
close to `MaxLengthCriteria` but ignores the number of initial tokens.
Args:
start_length (`int`):
The number of initial tokens.
max_new_tokens (`int`):
The maximum number of tokens to generate.
"""

def __init__(self, start_length: int, max_new_tokens: int):
warnings.warn(
"The class `MaxNewTokensCriteria` is deprecated and will be removed in v4.43. "
f"Please use `MaxLengthCriteria(max_length={start_length + max_new_tokens})` "
"with `max_length = start_length + max_new_tokens` instead.",
FutureWarning,
)
self.start_length = start_length
self.max_new_tokens = max_new_tokens
self.max_length = start_length + max_new_tokens

@add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
is_done = input_ids.shape[-1] >= self.max_length
return torch.full((input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool)
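
The deprecation message in the removed class already names the migration path; a minimal sketch with illustrative lengths follows.

from transformers import MaxLengthCriteria, StoppingCriteriaList

start_length = 8       # number of prompt tokens (illustrative)
max_new_tokens = 32    # tokens to generate on top of the prompt (illustrative)

# Previously: MaxNewTokensCriteria(start_length=start_length, max_new_tokens=max_new_tokens)
# Now: fold both into one absolute length limit.
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=start_length + max_new_tokens)])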


class MaxTimeCriteria(StoppingCriteria):
"""
This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
@@ -516,8 +486,6 @@ def max_length(self) -> Optional[int]:
for stopping_criterium in self:
if isinstance(stopping_criterium, MaxLengthCriteria):
return stopping_criterium.max_length
elif isinstance(stopping_criterium, MaxNewTokensCriteria):
return stopping_criterium.max_length
return None

