From 4f78bcb2871e0c51bec55edb87aadcaedce58069 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Tue, 6 Dec 2022 12:18:58 -0800
Subject: [PATCH] add missing is_decoder param (#20631)

---
 src/transformers/models/bert/configuration_bert.py | 2 ++
 .../models/bert_generation/configuration_bert_generation.py | 2 ++
 src/transformers/models/big_bird/configuration_big_bird.py | 2 ++
 src/transformers/models/camembert/configuration_camembert.py | 2 ++
 src/transformers/models/data2vec/configuration_data2vec_text.py | 2 ++
 src/transformers/models/ernie/configuration_ernie.py | 2 ++
 src/transformers/models/esm/configuration_esm.py | 2 ++
 .../models/megatron_bert/configuration_megatron_bert.py | 2 ++
 src/transformers/models/nezha/configuration_nezha.py | 2 ++
 src/transformers/models/qdqbert/configuration_qdqbert.py | 2 ++
 src/transformers/models/rembert/configuration_rembert.py | 2 ++
 src/transformers/models/roberta/configuration_roberta.py | 2 ++
 src/transformers/models/roc_bert/configuration_roc_bert.py | 2 ++
 src/transformers/models/roformer/configuration_roformer.py | 2 ++
 .../models/xlm_roberta/configuration_xlm_roberta.py | 2 ++
 15 files changed, 30 insertions(+)

diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py
index b3dfcdff75e9..b2d64b7fde67 100644
--- a/src/transformers/models/bert/configuration_bert.py
+++ b/src/transformers/models/bert/configuration_bert.py
@@ -114,6 +114,8 @@ class BertConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py
index ca6792bd0cf0..d602de22f044 100644
--- a/src/transformers/models/bert_generation/configuration_bert_generation.py
+++ b/src/transformers/models/bert_generation/configuration_bert_generation.py
@@ -60,6 +60,8 @@ class BertGenerationConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/big_bird/configuration_big_bird.py b/src/transformers/models/big_bird/configuration_big_bird.py
index d387bab2881c..d9bcbfef081b 100644
--- a/src/transformers/models/big_bird/configuration_big_bird.py
+++ b/src/transformers/models/big_bird/configuration_big_bird.py
@@ -70,6 +70,8 @@ class BigBirdConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/camembert/configuration_camembert.py b/src/transformers/models/camembert/configuration_camembert.py
index f37a351ef313..09989f1cb85f 100644
--- a/src/transformers/models/camembert/configuration_camembert.py
+++ b/src/transformers/models/camembert/configuration_camembert.py
@@ -81,6 +81,8 @@ class CamembertConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py
index c6da31294a35..a990e933bc67 100644
--- a/src/transformers/models/data2vec/configuration_data2vec_text.py
+++ b/src/transformers/models/data2vec/configuration_data2vec_text.py
@@ -73,6 +73,8 @@ class Data2VecTextConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/ernie/configuration_ernie.py b/src/transformers/models/ernie/configuration_ernie.py
index 7fd1c861da77..71dfde96dafb 100644
--- a/src/transformers/models/ernie/configuration_ernie.py
+++ b/src/transformers/models/ernie/configuration_ernie.py
@@ -87,6 +87,8 @@ class ErnieConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py
index af6ee73c93c5..90662c5e7b48 100644
--- a/src/transformers/models/esm/configuration_esm.py
+++ b/src/transformers/models/esm/configuration_esm.py
@@ -79,6 +79,8 @@ class EsmConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py
index a0ec5750fb9c..577f7868d5ae 100644
--- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py
+++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py
@@ -70,6 +70,8 @@ class MegatronBertConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/nezha/configuration_nezha.py b/src/transformers/models/nezha/configuration_nezha.py
index d6d92f983710..6e8adc0e84a6 100644
--- a/src/transformers/models/nezha/configuration_nezha.py
+++ b/src/transformers/models/nezha/configuration_nezha.py
@@ -48,6 +48,8 @@ class NezhaConfig(PretrainedConfig):
             The epsilon used by the layer normalization layers.
         classifier_dropout (`float`, optional, defaults to 0.1):
             The dropout ratio for attached classifiers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.

     Example:

diff --git a/src/transformers/models/qdqbert/configuration_qdqbert.py b/src/transformers/models/qdqbert/configuration_qdqbert.py
index ffe89cf113b8..090617a6308f 100644
--- a/src/transformers/models/qdqbert/configuration_qdqbert.py
+++ b/src/transformers/models/qdqbert/configuration_qdqbert.py
@@ -65,6 +65,8 @@ class QDQBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/rembert/configuration_rembert.py b/src/transformers/models/rembert/configuration_rembert.py
index 6a2d8a13c5e8..550722569178 100644
--- a/src/transformers/models/rembert/configuration_rembert.py
+++ b/src/transformers/models/rembert/configuration_rembert.py
@@ -76,6 +76,8 @@ class RemBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/roberta/configuration_roberta.py b/src/transformers/models/roberta/configuration_roberta.py
index bebacab01169..3d911f5847dc 100644
--- a/src/transformers/models/roberta/configuration_roberta.py
+++ b/src/transformers/models/roberta/configuration_roberta.py
@@ -79,6 +79,8 @@ class RobertaConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py
index 7ac989d00524..83bc35b6ff7e 100644
--- a/src/transformers/models/roc_bert/configuration_roc_bert.py
+++ b/src/transformers/models/roc_bert/configuration_roc_bert.py
@@ -64,6 +64,8 @@ class RoCBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/roformer/configuration_roformer.py b/src/transformers/models/roformer/configuration_roformer.py
index ea547ca52d1b..cbd92b412ba3 100644
--- a/src/transformers/models/roformer/configuration_roformer.py
+++ b/src/transformers/models/roformer/configuration_roformer.py
@@ -84,6 +84,8 @@ class RoFormerConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
diff --git a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
index 9a11afbe1936..c8bad5905a48 100644
--- a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
@@ -88,6 +88,8 @@ class XLMRobertaConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
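
A minimal sketch (not part of the diff) of how the `is_decoder` flag documented above is typically set together with the related `use_cache` option; `BertLMHeadModel` and the `bert-base-uncased` checkpoint are used purely for illustration, and the same pattern applies to the other configurations touched by this patch:

```python
from transformers import BertConfig, BertLMHeadModel

# is_decoder defaults to False, i.e. the model runs as a bidirectional encoder.
config = BertConfig.from_pretrained("bert-base-uncased")
config.is_decoder = True  # switch to causal (decoder-style) self-attention
config.use_cache = True   # only relevant when is_decoder=True: cache past key/value states

# A decoder-style head such as BertLMHeadModel expects config.is_decoder=True.
model = BertLMHeadModel.from_pretrained("bert-base-uncased", config=config)
```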