Allow `# Ignore copy` (huggingface#27328)

* fix

---------

Co-authored-by: ydshieh <[email protected]>
Co-authored-by: Arthur <[email protected]>
jquesnelle · Dec 7, 2023 · 5274692 · 5274692
1 parent 44b5506
commit 5274692
Show file tree

Hide file tree

Showing 3 changed files with 617 additions and 52 deletions.
diff --git a/tests/models/longformer/test_tokenization_longformer.py b/tests/models/longformer/test_tokenization_longformer.py
@@ -28,7 +28,9 @@
 
 
 @require_tokenizers
+# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest with roberta-base->allenai/longformer-base-4096,Roberta->Longformer,roberta->longformer,
 class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    # Ignore copy
     tokenizer_class = LongformerTokenizer
     test_slow_tokenizer = True
     rust_tokenizer_class = LongformerTokenizerFast
@@ -71,23 +73,19 @@ def setUp(self):
         with open(self.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_tokenizer
     def get_tokenizer(self, **kwargs):
         kwargs.update(self.special_tokens_map)
         return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_rust_tokenizer
     def get_rust_tokenizer(self, **kwargs):
         kwargs.update(self.special_tokens_map)
         return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_input_output_texts
     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
         output_text = "lower newer"
         return input_text, output_text
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_full_tokenizer
     def test_full_tokenizer(self):
         tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
         text = "lower newer"
@@ -99,7 +97,6 @@ def test_full_tokenizer(self):
         input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.roberta_dict_integration_testing with roberta->longformer
     def longformer_dict_integration_testing(self):
         tokenizer = self.get_tokenizer()
 
@@ -110,7 +107,6 @@ def longformer_dict_integration_testing(self):
         )
 
     @slow
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_sequence_builders with roberta-base->allenai/longformer-base-4096
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("allenai/longformer-base-4096")
 
@@ -130,7 +126,6 @@ def test_sequence_builders(self):
         assert encoded_sentence == encoded_text_from_decode
         assert encoded_pair == encoded_pair_from_decode
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_space_encoding
     def test_space_encoding(self):
         tokenizer = self.get_tokenizer()
 
@@ -171,11 +166,9 @@ def test_space_encoding(self):
         first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
         self.assertNotEqual(first_char, space_encoding)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_pretokenized_inputs
     def test_pretokenized_inputs(self):
         pass
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_embeded_special_tokens
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -208,7 +201,6 @@ def test_embeded_special_tokens(self):
                     tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                 )
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_change_add_prefix_space_and_trim_offsets_args
     def test_change_add_prefix_space_and_trim_offsets_args(self):
         for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
             tokenizer_r = self.rust_tokenizer_class.from_pretrained(
@@ -223,7 +215,6 @@ def test_change_add_prefix_space_and_trim_offsets_args(self):
             self.assertEqual(post_processor_state["add_prefix_space"], add_prefix_space)
             self.assertEqual(post_processor_state["trim_offsets"], trim_offsets)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments
     def test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments(self):
         # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` and
         # `trim_offsets`

diff --git a/tests/repo_utils/test_check_copies.py b/tests/repo_utils/test_check_copies.py
@@ -95,13 +95,156 @@ def forward(self, x):
 """
 
 
+MOCK_DUMMY_BERT_CODE_MATCH = """
+class BertDummyModel:
+    attr_1 = 1
+    attr_2 = 2
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def existing_common(self, c):
+        return 4
+
+    def existing_diff_to_be_ignored(self, c):
+        return 9
+"""
+
+
+MOCK_DUMMY_ROBERTA_CODE_MATCH = """
+# Copied from transformers.models.dummy_bert_match.modeling_dummy_bert_match.BertDummyModel with BertDummy->RobertaBertDummy
+class RobertaBertDummyModel:
+
+    attr_1 = 1
+    attr_2 = 2
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Ignore copy
+    def only_in_roberta_to_be_ignored(self, c):
+        return 3
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def existing_common(self, c):
+        return 4
+
+    # Ignore copy
+    def existing_diff_to_be_ignored(self, c):
+        return 6
+"""
+
+
+MOCK_DUMMY_BERT_CODE_NO_MATCH = """
+class BertDummyModel:
+    attr_1 = 1
+    attr_2 = 2
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def only_in_bert(self, c):
+        return 7
+
+    def existing_common(self, c):
+        return 4
+
+    def existing_diff_not_ignored(self, c):
+        return 8
+
+    def existing_diff_to_be_ignored(self, c):
+        return 9
+"""
+
+
+MOCK_DUMMY_ROBERTA_CODE_NO_MATCH = """
+# Copied from transformers.models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel with BertDummy->RobertaBertDummy
+class RobertaBertDummyModel:
+
+    attr_1 = 1
+    attr_2 = 3
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Ignore copy
+    def only_in_roberta_to_be_ignored(self, c):
+        return 3
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def only_in_roberta_not_ignored(self, c):
+        return 2
+
+    def existing_common(self, c):
+        return 4
+
+    def existing_diff_not_ignored(self, c):
+        return 5
+
+    # Ignore copy
+    def existing_diff_to_be_ignored(self, c):
+        return 6
+"""
+
+
+EXPECTED_REPLACED_CODE = """
+# Copied from transformers.models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel with BertDummy->RobertaBertDummy
+class RobertaBertDummyModel:
+    attr_1 = 1
+    attr_2 = 2
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def only_in_bert(self, c):
+        return 7
+
+    def existing_common(self, c):
+        return 4
+
+    def existing_diff_not_ignored(self, c):
+        return 8
+
+    # Ignore copy
+    def existing_diff_to_be_ignored(self, c):
+        return 6
+
+    # Ignore copy
+    def only_in_roberta_to_be_ignored(self, c):
+        return 3
+"""
+
+
 def replace_in_file(filename, old, new):
     with open(filename, "r", encoding="utf-8") as f:
         content = f.read()
 
     content = content.replace(old, new)
 
-    with open(filename, "w", encoding="utf-8") as f:
+    with open(filename, "w", encoding="utf-8", newline="\n") as f:
         f.write(content)
 
 
@@ -117,11 +260,18 @@ def create_tmp_repo(tmp_dir):
     model_dir = tmp_dir / "src" / "transformers" / "models"
     model_dir.mkdir(parents=True, exist_ok=True)
 
-    models = {"bert": MOCK_BERT_CODE, "bertcopy": MOCK_BERT_COPY_CODE}
+    models = {
+        "bert": MOCK_BERT_CODE,
+        "bertcopy": MOCK_BERT_COPY_CODE,
+        "dummy_bert_match": MOCK_DUMMY_BERT_CODE_MATCH,
+        "dummy_roberta_match": MOCK_DUMMY_ROBERTA_CODE_MATCH,
+        "dummy_bert_no_match": MOCK_DUMMY_BERT_CODE_NO_MATCH,
+        "dummy_roberta_no_match": MOCK_DUMMY_ROBERTA_CODE_NO_MATCH,
+    }
     for model, code in models.items():
         model_subdir = model_dir / model
         model_subdir.mkdir(exist_ok=True)
-        with open(model_subdir / f"modeling_{model}.py", "w", encoding="utf-8") as f:
+        with open(model_subdir / f"modeling_{model}.py", "w", encoding="utf-8", newline="\n") as f:
             f.write(code)
 
 
@@ -176,11 +326,47 @@ def test_is_copy_consistent(self):
                 diffs = is_copy_consistent(file_to_check)
                 self.assertEqual(diffs, [["models.bert.modeling_bert.BertModel", 22]])
 
-                diffs = is_copy_consistent(file_to_check, overwrite=True)
+                _ = is_copy_consistent(file_to_check, overwrite=True)
 
                 with open(file_to_check, "r", encoding="utf-8") as f:
                     self.assertEqual(f.read(), MOCK_BERT_COPY_CODE)
 
+    def test_is_copy_consistent_with_ignored_match(self):
+        path_to_check = ["src", "transformers", "models", "dummy_roberta_match", "modeling_dummy_roberta_match.py"]
+        with tempfile.TemporaryDirectory() as tmp_folder:
+            # Base check
+            create_tmp_repo(tmp_folder)
+            with patch_transformer_repo_path(tmp_folder):
+                file_to_check = os.path.join(tmp_folder, *path_to_check)
+                diffs = is_copy_consistent(file_to_check)
+                self.assertEqual(diffs, [])
+
+    def test_is_copy_consistent_with_ignored_no_match(self):
+        path_to_check = [
+            "src",
+            "transformers",
+            "models",
+            "dummy_roberta_no_match",
+            "modeling_dummy_roberta_no_match.py",
+        ]
+        with tempfile.TemporaryDirectory() as tmp_folder:
+            # Base check with an inconsistency
+            create_tmp_repo(tmp_folder)
+            with patch_transformer_repo_path(tmp_folder):
+                file_to_check = os.path.join(tmp_folder, *path_to_check)
+
+                diffs = is_copy_consistent(file_to_check)
+                # line 6: `attr_2 = 3` in `MOCK_DUMMY_ROBERTA_CODE_NO_MATCH`.
+                # (which has a leading `\n`.)
+                self.assertEqual(
+                    diffs, [["models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel", 6]]
+                )
+
+                _ = is_copy_consistent(file_to_check, overwrite=True)
+
+                with open(file_to_check, "r", encoding="utf-8") as f:
+                    self.assertEqual(f.read(), EXPECTED_REPLACED_CODE)
+
     def test_convert_to_localized_md(self):
         localized_readme = check_copies.LOCALIZED_READMES["README_zh-hans.md"]