fix flake8 issues (facebookresearch#2570)
Summary:
# Before submitting

- [ ] Was this discussed/approved via a GitHub issue? (not needed for typos or doc improvements)
- [ ] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/main/CONTRIBUTING.md)?
- [ ] Did you make sure to update the docs?
- [ ] Did you write any new necessary tests?

## What does this PR do?
- [x] Applies flake8 fixes to the main branch (fairinternal/fairseq-py#2546); more issues remain to be fixed.

Fix GPU tests:
- [x] When the `torch.ao.quantization` import fails, fall back to `torch.quantization` (a minimal sketch of the pattern follows this list)
- [x] Build apex from an earlier commit in CircleCI so that it's compatible with PyTorch 1.8 and 1.9
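A minimal sketch of that import fallback, mirroring the pattern the diffs below apply in `emformer.py` and `quantization/scalar/ops.py`:

```python
# Prefer the torch.ao.quantization namespace (PyTorch >= 1.10); fall back to
# torch.quantization on older releases such as 1.8 and 1.9.
try:
    import torch.ao.quantization as quantization
except ImportError:
    import torch.quantization as quantization

# Downstream code refers only to the `quantization` alias, e.g.
# quantization.quantize_dynamic(...), so the same line runs on either version.
```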

## PR review
Anyone in the community is free to review the PR once the tests have passed.
If we didn't discuss your PR in GitHub issues, there's a high chance it will not be merged.

## Did you have fun?
Make sure you had fun coding 🙃

Pull Request resolved: fairinternal/fairseq-py#2570

Reviewed By: Mortimerp9

Differential Revision: D32955312

Pulled By: dianaml0

fbshipit-source-id: e163cbd4998f171f819e31b0682c1c0f1986f9e1
dianaml0 authored and facebook-github-bot committed Dec 9, 2021
1 parent c620ed0 commit 88e7d25
Showing 22 changed files with 73 additions and 61 deletions.
5 changes: 3 additions & 2 deletions .circleci/config.yml
@@ -10,7 +10,7 @@ gpu: &gpu
machine:
image: ubuntu-1604-cuda-11.1:202012-01
resource_class: gpu.nvidia.medium.multi


# -------------------------------------------------------------------------------------
# Re-usable commands
@@ -25,7 +25,7 @@ install_dep_common: &install_dep_common
pip install --upgrade setuptools
pip install bitarray boto3 deepspeed editdistance fastBPE iopath ipdb ipython pyarrow pytest sacremoses sentencepiece subword-nmt hydra-core==1.0.7 omegaconf==2.0.6
pip install --progress-bar off pytest
pip install --progress-bar off fairscale==0.4.1
pip install --progress-bar off fairscale
pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U
python -c 'import torch; print("Torch version:", torch.__version__)'
python -m torch.utils.collect_env
@@ -38,6 +38,7 @@ install_dep_fused_ops: &install_dep_fused_ops
source activate fairseq
git clone https://github.com/NVIDIA/apex
cd apex
git checkout e2083df5eb96643c61613b9df48dd4eea6b07690
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
cd ~/
git clone --depth=1 --branch v2.4 https://github.com/NVIDIA/Megatron-LM.git
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
- id: end-of-file-fixer

- repo: https://github.com/ambv/black
rev: 20.8b1
rev: 21.12b0
hooks:
- id: black
language_version: python3.8
2 changes: 0 additions & 2 deletions fairseq/logging/meters.py
@@ -8,7 +8,6 @@
from collections import OrderedDict
from typing import Dict, Optional


try:
import torch

@@ -18,7 +17,6 @@ def type_as(a, b):
else:
return a


except ImportError:
torch = None

37 changes: 22 additions & 15 deletions fairseq/models/speech_to_text/modules/emformer.py
@@ -14,23 +14,30 @@

import torch
import torch.nn as nn
from fairseq.models import (
FairseqEncoder,
)
from torch import Tensor
from torch import device as Device

from fairseq.models import FairseqEncoder
from fairseq.models.speech_to_text.utils import (
NoOp,
lengths_to_padding_mask,
segments_to_sequence,
)
from fairseq.models.speech_to_text.utils import (
attention_suppression,
layer_norm_backward_hook,
lengths_to_padding_mask,
segments_to_sequence,
)
from torch import Tensor, device as Device
from torch.ao.quantization.qconfig import (
default_dynamic_qconfig,
per_channel_dynamic_qconfig,
)

try:
import torch.ao.quantization as quantization
from torch.ao.quantization.qconfig import (
default_dynamic_qconfig,
per_channel_dynamic_qconfig,
)
except ImportError:
import torch.quantization as quantization
from torch.quantization.qconfig import (
default_dynamic_qconfig,
per_channel_dynamic_qconfig,
)


class RelativePositionEmbedding(nn.Module):
@@ -140,7 +147,7 @@ def quantize_(self, params=None):
qconfig = per_channel_dynamic_qconfig
else:
qconfig = default_dynamic_qconfig
torch.ao.quantization.quantize_dynamic(
quantization.quantize_dynamic(
self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
)
return self
@@ -728,7 +735,7 @@ def quantize_(self, params=None):
qconfig = per_channel_dynamic_qconfig
else:
qconfig = default_dynamic_qconfig
torch.ao.quantization.quantize_dynamic(
quantization.quantize_dynamic(
self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
)
return self
@@ -1771,7 +1778,7 @@ def quantize_(self, params=None):
qconfig = per_channel_dynamic_qconfig
else:
qconfig = default_dynamic_qconfig
torch.ao.quantization.quantize_dynamic(
quantization.quantize_dynamic(
self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
)
return self
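For context, a self-contained sketch of the dynamic-quantization call that the `quantize_` methods above wrap; the toy model here is illustrative, not from fairseq:

```python
import torch
import torch.nn as nn

try:
    import torch.ao.quantization as quantization
    from torch.ao.quantization.qconfig import per_channel_dynamic_qconfig
except ImportError:
    import torch.quantization as quantization
    from torch.quantization.qconfig import per_channel_dynamic_qconfig

# A stand-in model; emformer's quantize_ applies the same call to itself.
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 8))

# Swap every nn.Linear for its dynamically quantized (qint8) counterpart, in place.
quantization.quantize_dynamic(
    model, {nn.Linear: per_channel_dynamic_qconfig}, dtype=torch.qint8, inplace=True
)
print(model)  # the Linear layers are now dynamic-quantized modules
```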
2 changes: 0 additions & 2 deletions fairseq/modules/cross_entropy.py
@@ -8,7 +8,6 @@
import torch
import torch.nn.functional as F


logger = logging.getLogger(__name__)


@@ -54,7 +53,6 @@ def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
else:
raise NotImplementedError


except ImportError:

def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
2 changes: 0 additions & 2 deletions fairseq/modules/layer_norm.py
@@ -7,7 +7,6 @@
import torch.nn as nn
import torch.nn.functional as F


try:
from apex.normalization import FusedLayerNorm as _FusedLayerNorm

@@ -22,7 +21,6 @@ def forward(self, x):
with torch.cuda.device(x.device):
return super().forward(x)


except ImportError:
has_fused_layernorm = False

11 changes: 8 additions & 3 deletions fairseq/modules/quantization/scalar/ops.py
@@ -5,6 +5,11 @@

import torch

try:
import torch.ao.quantization as quantization
except ImportError:
import torch.quantization as quantization


def emulate_int(w, bits, method, scale=None, zero_point=None):
q = globals()[f"emulate_int8_{method}"]
@@ -21,7 +26,7 @@ def quantize(w, scale, zero_point, bits=8):

def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):
if scale is None:
obs = torch.ao.quantization.observer.HistogramObserver()
obs = quantization.observer.HistogramObserver()
obs.to(device=w.device)
_ = obs(w.float())
scale, zero_point = obs.calculate_qparams()
@@ -32,7 +37,7 @@ def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):

def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):
if scale is None:
obs = torch.ao.quantization.observer.PerChannelMinMaxObserver(
obs = quantization.observer.PerChannelMinMaxObserver(
ch_axis=-1, qscheme=torch.per_channel_symmetric
)
obs.to(device=w.device)
@@ -45,7 +50,7 @@ def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):

def emulate_int8_tensor(w, scale=None, zero_point=None, bits=8):
if scale is None:
obs = torch.ao.quantization.observer.MinMaxObserver()
obs = quantization.observer.MinMaxObserver()
obs.to(device=w.device)
_ = obs(w)
scale, zero_point = obs.calculate_qparams()
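A short sketch of what the observers in `ops.py` compute, using only stock PyTorch (the random tensor is illustrative):

```python
import torch

try:
    import torch.ao.quantization as quantization
except ImportError:
    import torch.quantization as quantization

w = torch.randn(64, 64)

# The observer records the tensor's value distribution on a forward pass...
obs = quantization.observer.HistogramObserver()
_ = obs(w.float())

# ...then derives the affine int8 mapping used above: q = round(w / scale) + zero_point.
scale, zero_point = obs.calculate_qparams()
print(float(scale), int(zero_point))
```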
5 changes: 2 additions & 3 deletions fairseq/optim/fused_adam.py
@@ -27,8 +27,8 @@ def get_fused_adam_class():
except ImportError:
try:
# fallback to the newer interface
from apex.optimizers import FusedAdam as _FusedAdam # noqa
from apex.multi_tensor_apply import multi_tensor_applier
from apex.optimizers import FusedAdam as _FusedAdam # noqa

if multi_tensor_applier.available:
return FusedAdamV2
@@ -252,8 +252,8 @@ def inf_norm(t):


try:
from apex.optimizers import FusedAdam
from apex.multi_tensor_apply import multi_tensor_applier
from apex.optimizers import FusedAdam

class FusedAdamV2(FusedAdam):
"""
@@ -382,6 +382,5 @@ def step(

return loss


except ImportError:
pass
14 changes: 8 additions & 6 deletions scripts/average_checkpoints.py
@@ -108,16 +108,18 @@ def main():
help='Write the new checkpoint containing the averaged weights to this path.')
num_group = parser.add_mutually_exclusive_group()
num_group.add_argument('--num-epoch-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
'and average last this many of them.')
help='if set, will try to find checkpoints with names checkpoint_xx.pt in the '
'path specified by input, and average last this many of them.')
num_group.add_argument('--num-update-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
'and average last this many of them.')
help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by'
' input, and average last this many of them.')
parser.add_argument('--checkpoint-upper-bound', type=int,
help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
'when using --num-update-checkpoints, this will set an upper bound on which update to use'
'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500'
'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be'
' averaged.'
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would'
' be averaged assuming --save-interval-updates 500'
)
# fmt: on
args = parser.parse_args()
2 changes: 0 additions & 2 deletions scripts/constraints/extract.py
@@ -11,8 +11,6 @@
import random
import sys

from sacrebleu import extract_ngrams


def get_phrase(words, index, length):
assert index < len(words) - length + 1
8 changes: 4 additions & 4 deletions scripts/spm_decode.py
@@ -26,13 +26,13 @@ def main():

if args.input_format == "piece":

def decode(l):
return "".join(sp.DecodePieces(l))
def decode(input):
return "".join(sp.DecodePieces(input))

elif args.input_format == "id":

def decode(l):
return "".join(sp.DecodeIds(l))
def decode(input):
return "".join(sp.DecodeIds(input))

else:
raise NotImplementedError
8 changes: 4 additions & 4 deletions scripts/spm_encode.py
@@ -49,13 +49,13 @@ def main():

if args.output_format == "piece":

def encode(l):
return sp.EncodeAsPieces(l)
def encode(input):
return sp.EncodeAsPieces(input)

elif args.output_format == "id":

def encode(l):
return list(map(str, sp.EncodeAsIds(l)))
def encode(input):
return list(map(str, sp.EncodeAsIds(input)))

else:
raise NotImplementedError
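The `l` → `input` renames in `spm_decode.py` and `spm_encode.py` above (and in `tests/test_multi_corpus_sampled_dataset.py` below) fix flake8's E741 warning, which flags identifiers easily mistaken for `1` or `I`. A tiny illustration with hypothetical function names:

```python
def join_pieces(l):  # flake8 reports: E741 ambiguous variable name 'l'
    return "".join(l)

def join_pieces_fixed(pieces):  # a descriptive name passes cleanly
    return "".join(pieces)
```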
1 change: 1 addition & 0 deletions setup.cfg
@@ -1,3 +1,4 @@
[flake8]
max-line-length = 127
extend-ignore = E203, W503
extend-exclude = fairseq/model_parallel/megatron
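For context: E203 and W503 are commonly ignored when flake8 runs alongside black, since black's output can trigger both. Illustrative snippets (not from the repository):

```python
gross_wages = 1000
taxable_interest = 50

# W503: black breaks long lines *before* binary operators, which default
# flake8 reports as "line break before binary operator".
income = (gross_wages
          + taxable_interest)

items = list(range(10))
offset = 3

# E203: black inserts a space before ":" in slices with complex bounds,
# which default flake8 reports as "whitespace before ':'".
chunk = items[offset + 1 :]
```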
1 change: 0 additions & 1 deletion tests/distributed/test_bmuf.py
@@ -140,7 +140,6 @@ def setup_args():
@unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs")
class TestBMUF(unittest.TestCase):
def bmuf_process(self, cfg, args, iterations):
processes = []
results = Manager().dict()
torch.multiprocessing.spawn(
fn=functools.partial(single_gpu_training, cfg, args),
3 changes: 3 additions & 0 deletions tests/gpu/test_binaries_gpu.py
@@ -399,6 +399,9 @@ def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False):
train.main(quantize_args)


@unittest.skipIf(
int(torch.__version__[2]) < 10, reason="quantized kernels are only supported on CPU"
)
@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestQuantization(unittest.TestCase):
def setUp(self):
12 changes: 9 additions & 3 deletions tests/test_constraints.py
@@ -3,11 +3,17 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import sys
import unittest
from typing import List

import torch
from fairseq.token_generation_constraints import *

from fairseq.token_generation_constraints import (
ConstraintNode,
OrderedConstraintState,
UnorderedConstraintState,
pack_constraints,
)


def tensorize(constraints: List[List[int]]) -> torch.Tensor:
@@ -53,7 +59,7 @@ def setUp(self):
self.examples = [
(
tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]),
"([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))",
"([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))", # noqa
{1: 4, 2: 1, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1},
),
([], "[None].False#0", {}),
2 changes: 1 addition & 1 deletion tests/test_file_io.py
@@ -49,7 +49,7 @@ def test_file_io_oss(self):
def test_file_io_async(self):
# ioPath `PathManager` is initialized after the first `opena` call.
try:
from fairseq.file_io import IOPathManager, PathManager
from fairseq.file_io import PathManager

_asyncfile = os.path.join(self._tmpdir, "async.txt")
f = PathManager.opena(_asyncfile, "wb")
1 change: 0 additions & 1 deletion tests/test_fp16_optimizer.py
@@ -3,7 +3,6 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import copy
import logging
import unittest
2 changes: 1 addition & 1 deletion tests/test_multi_corpus_sampled_dataset.py
@@ -79,7 +79,7 @@ def test_multi_corpus_sampled_dataset_uniform_sample(self):

def test_multi_corpus_sampled_dataset_weighted_sample(self):
def naive_weighted_sample(weights):
def f(l):
def f(input):
v = np.random.random()
agg = 0
for i, weight in enumerate(weights):
2 changes: 0 additions & 2 deletions tests/test_reproducibility.py
@@ -3,12 +3,10 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import contextlib
import json
import os
import tempfile
import unittest
from io import StringIO

import torch
