diff --git a/.circleci/config.yml b/.circleci/config.yml
index 25f3a73881..de40a6e9c5 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -10,7 +10,7 @@ gpu: &gpu
   machine:
     image: ubuntu-1604-cuda-11.1:202012-01
   resource_class: gpu.nvidia.medium.multi
-
+
 
 # -------------------------------------------------------------------------------------
 # Re-usable commands
@@ -25,7 +25,7 @@ install_dep_common: &install_dep_common
         pip install --upgrade setuptools
         pip install bitarray boto3 deepspeed editdistance fastBPE iopath ipdb ipython pyarrow pytest sacremoses sentencepiece subword-nmt hydra-core==1.0.7 omegaconf==2.0.6
         pip install --progress-bar off pytest
-        pip install --progress-bar off fairscale==0.4.1
+        pip install --progress-bar off fairscale
        pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U
         python -c 'import torch; print("Torch version:", torch.__version__)'
         python -m torch.utils.collect_env
@@ -38,6 +38,7 @@ install_dep_fused_ops: &install_dep_fused_ops
         source activate fairseq
         git clone https://github.com/NVIDIA/apex
         cd apex
+        git checkout e2083df5eb96643c61613b9df48dd4eea6b07690
         pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
         cd ~/
         git clone --depth=1 --branch v2.4 https://github.com/NVIDIA/Megatron-LM.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f9efb2e96e..4817f6e876 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
       - id: end-of-file-fixer
 
 - repo: https://github.com/ambv/black
-  rev: 20.8b1
+  rev: 21.12b0
   hooks:
     - id: black
       language_version: python3.8
diff --git a/fairseq/logging/meters.py b/fairseq/logging/meters.py
index 2100b1fa0b..d5f7c775d9 100644
--- a/fairseq/logging/meters.py
+++ b/fairseq/logging/meters.py
@@ -8,7 +8,6 @@
 from collections import OrderedDict
 from typing import Dict, Optional
 
-
 try:
     import torch
 
@@ -18,7 +17,6 @@ def type_as(a, b):
         else:
             return a
 
-
 except ImportError:
     torch = None
 
diff --git a/fairseq/models/speech_to_text/modules/emformer.py b/fairseq/models/speech_to_text/modules/emformer.py
index 6233546ab8..70339788f7 100644
--- a/fairseq/models/speech_to_text/modules/emformer.py
+++ b/fairseq/models/speech_to_text/modules/emformer.py
@@ -14,23 +14,30 @@
 import torch
 import torch.nn as nn
-from fairseq.models import (
-    FairseqEncoder,
-)
+from torch import Tensor
+from torch import device as Device
+
+from fairseq.models import FairseqEncoder
 from fairseq.models.speech_to_text.utils import (
     NoOp,
-    lengths_to_padding_mask,
-    segments_to_sequence,
-)
-from fairseq.models.speech_to_text.utils import (
     attention_suppression,
     layer_norm_backward_hook,
+    lengths_to_padding_mask,
+    segments_to_sequence,
 )
-from torch import Tensor, device as Device
-from torch.ao.quantization.qconfig import (
-    default_dynamic_qconfig,
-    per_channel_dynamic_qconfig,
-)
+
+try:
+    import torch.ao.quantization as quantization
+    from torch.ao.quantization.qconfig import (
+        default_dynamic_qconfig,
+        per_channel_dynamic_qconfig,
+    )
+except ImportError:
+    import torch.quantization as quantization
+    from torch.quantization.qconfig import (
+        default_dynamic_qconfig,
+        per_channel_dynamic_qconfig,
+    )
 
 
 class RelativePositionEmbedding(nn.Module):
@@ -140,7 +147,7 @@ def quantize_(self, params=None):
             qconfig = per_channel_dynamic_qconfig
         else:
             qconfig = default_dynamic_qconfig
-        torch.ao.quantization.quantize_dynamic(
+        quantization.quantize_dynamic(
             self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
         )
         return self
@@ -728,7 +735,7 @@ def quantize_(self, params=None):
             qconfig = per_channel_dynamic_qconfig
         else:
             qconfig = default_dynamic_qconfig
-        torch.ao.quantization.quantize_dynamic(
+        quantization.quantize_dynamic(
             self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
         )
         return self
@@ -1771,7 +1778,7 @@ def quantize_(self, params=None):
             qconfig = per_channel_dynamic_qconfig
         else:
             qconfig = default_dynamic_qconfig
-        torch.ao.quantization.quantize_dynamic(
+        quantization.quantize_dynamic(
             self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
         )
         return self
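
PyTorch 1.10 renamed the torch.quantization namespace to torch.ao.quantization, so the try/except import above lets emformer.py run on either side of the rename. Below is a minimal, self-contained sketch of the same pattern applied to a toy module (TinyEncoder is hypothetical, not part of fairseq):

import torch
import torch.nn as nn

try:  # PyTorch >= 1.10
    import torch.ao.quantization as quantization
    from torch.ao.quantization.qconfig import default_dynamic_qconfig
except ImportError:  # PyTorch < 1.10
    import torch.quantization as quantization
    from torch.quantization.qconfig import default_dynamic_qconfig


class TinyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(16, 16)

    def forward(self, x):
        return self.proj(x).relu()


model = TinyEncoder()
# In-place dynamic quantization of every nn.Linear, mirroring the
# quantize_() methods changed in emformer.py above.
quantization.quantize_dynamic(
    model, {torch.nn.Linear: default_dynamic_qconfig}, dtype=torch.qint8, inplace=True
)
print(type(model.proj))  # a dynamically quantized Linear on either torch version
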
diff --git a/fairseq/modules/cross_entropy.py b/fairseq/modules/cross_entropy.py
index 6f33c24cb5..286c00eecc 100644
--- a/fairseq/modules/cross_entropy.py
+++ b/fairseq/modules/cross_entropy.py
@@ -8,7 +8,6 @@
 import torch
 import torch.nn.functional as F
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -54,7 +53,6 @@ def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
         else:
             raise NotImplementedError
 
-
 except ImportError:
 
     def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
diff --git a/fairseq/modules/layer_norm.py b/fairseq/modules/layer_norm.py
index 234609d9e2..78332d1749 100644
--- a/fairseq/modules/layer_norm.py
+++ b/fairseq/modules/layer_norm.py
@@ -7,7 +7,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-
 try:
     from apex.normalization import FusedLayerNorm as _FusedLayerNorm
 
@@ -22,7 +21,6 @@ def forward(self, x):
         with torch.cuda.device(x.device):
             return super().forward(x)
 
-
 except ImportError:
     has_fused_layernorm = False
diff --git a/fairseq/modules/quantization/scalar/ops.py b/fairseq/modules/quantization/scalar/ops.py
index 9144083ac7..ad1e14e051 100644
--- a/fairseq/modules/quantization/scalar/ops.py
+++ b/fairseq/modules/quantization/scalar/ops.py
@@ -5,6 +5,11 @@
 
 import torch
 
+try:
+    import torch.ao.quantization as quantization
+except ImportError:
+    import torch.quantization as quantization
+
 
 def emulate_int(w, bits, method, scale=None, zero_point=None):
     q = globals()[f"emulate_int8_{method}"]
@@ -21,7 +26,7 @@ def quantize(w, scale, zero_point, bits=8):
 
 def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):
     if scale is None:
-        obs = torch.ao.quantization.observer.HistogramObserver()
+        obs = quantization.observer.HistogramObserver()
         obs.to(device=w.device)
         _ = obs(w.float())
         scale, zero_point = obs.calculate_qparams()
@@ -32,7 +37,7 @@ def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):
 
 def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):
     if scale is None:
-        obs = torch.ao.quantization.observer.PerChannelMinMaxObserver(
+        obs = quantization.observer.PerChannelMinMaxObserver(
             ch_axis=-1, qscheme=torch.per_channel_symmetric
         )
         obs.to(device=w.device)
@@ -45,7 +50,7 @@ def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):
 
 def emulate_int8_tensor(w, scale=None, zero_point=None, bits=8):
     if scale is None:
-        obs = torch.ao.quantization.observer.MinMaxObserver()
+        obs = quantization.observer.MinMaxObserver()
         obs.to(device=w.device)
         _ = obs(w)
         scale, zero_point = obs.calculate_qparams()
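
All three emulate_int8_* paths above share the same mechanics: an observer watches a tensor, derives (scale, zero_point), and the weights are round-tripped through the int8 grid. A self-contained sketch of that flow using the same quantization alias — the round-trip lines paraphrase ops.py's quantize() helper rather than copying it:

import torch

try:
    import torch.ao.quantization as quantization
except ImportError:
    import torch.quantization as quantization

w = torch.randn(64, 64)

# MinMaxObserver tracks the min/max of observed tensors and derives an
# affine uint8 mapping (quant range 0..255 by default).
obs = quantization.observer.MinMaxObserver()
_ = obs(w)
scale, zero_point = obs.calculate_qparams()

# Emulate the int8 round trip: quantize, then dequantize.
q = torch.clamp(torch.round(w / scale) + zero_point, 0, 255)
w_hat = (q - zero_point) * scale
print((w - w_hat).abs().max())  # roughly bounded by scale / 2
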
diff --git a/fairseq/optim/fused_adam.py b/fairseq/optim/fused_adam.py
index da872033d1..1290ecfdbf 100644
--- a/fairseq/optim/fused_adam.py
+++ b/fairseq/optim/fused_adam.py
@@ -27,8 +27,8 @@ def get_fused_adam_class():
     except ImportError:
         try:
             # fallback to the newer interface
-            from apex.optimizers import FusedAdam as _FusedAdam  # noqa
             from apex.multi_tensor_apply import multi_tensor_applier
+            from apex.optimizers import FusedAdam as _FusedAdam  # noqa
 
             if multi_tensor_applier.available:
                 return FusedAdamV2
@@ -252,8 +252,8 @@ def inf_norm(t):
 
 
 try:
-    from apex.optimizers import FusedAdam
     from apex.multi_tensor_apply import multi_tensor_applier
+    from apex.optimizers import FusedAdam
 
     class FusedAdamV2(FusedAdam):
         """
@@ -382,6 +382,5 @@ def step(
 
         return loss
 
-
 except ImportError:
     pass
diff --git a/scripts/average_checkpoints.py b/scripts/average_checkpoints.py
index c512f802bc..a4711e4840 100644
--- a/scripts/average_checkpoints.py
+++ b/scripts/average_checkpoints.py
@@ -108,16 +108,18 @@ def main():
                         help='Write the new checkpoint containing the averaged weights to this path.')
     num_group = parser.add_mutually_exclusive_group()
     num_group.add_argument('--num-epoch-checkpoints', type=int,
-                           help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
-                           'and average last this many of them.')
+                           help='if set, will try to find checkpoints with names checkpoint_xx.pt in the '
+                           'path specified by input, and average last this many of them.')
     num_group.add_argument('--num-update-checkpoints', type=int,
-                           help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
-                           'and average last this many of them.')
+                           help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by'
+                           ' input, and average last this many of them.')
     parser.add_argument('--checkpoint-upper-bound', type=int,
                         help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
                         'when using --num-update-checkpoints, this will set an upper bound on which update to use'
-                        'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
-                        'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500'
+                        'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be'
+                        ' averaged.'
+                        'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would'
+                        ' be averaged assuming --save-interval-updates 500'
                         )
     # fmt: on
     args = parser.parse_args()
diff --git a/scripts/constraints/extract.py b/scripts/constraints/extract.py
index f6155d0a05..437b373856 100755
--- a/scripts/constraints/extract.py
+++ b/scripts/constraints/extract.py
@@ -11,8 +11,6 @@
 import random
 import sys
 
-from sacrebleu import extract_ngrams
-
 
 def get_phrase(words, index, length):
     assert index < len(words) - length + 1
diff --git a/scripts/spm_decode.py b/scripts/spm_decode.py
index 1c18b1d2a7..7d7b68b240 100644
--- a/scripts/spm_decode.py
+++ b/scripts/spm_decode.py
@@ -26,13 +26,13 @@ def main():
 
     if args.input_format == "piece":
 
-        def decode(l):
-            return "".join(sp.DecodePieces(l))
+        def decode(input):
+            return "".join(sp.DecodePieces(input))
 
     elif args.input_format == "id":
 
-        def decode(l):
-            return "".join(sp.DecodeIds(l))
+        def decode(input):
+            return "".join(sp.DecodeIds(input))
 
     else:
         raise NotImplementedError
diff --git a/scripts/spm_encode.py b/scripts/spm_encode.py
index 83facfb3b1..f91e0bb728 100644
--- a/scripts/spm_encode.py
+++ b/scripts/spm_encode.py
@@ -49,13 +49,13 @@ def main():
 
     if args.output_format == "piece":
 
-        def encode(l):
-            return sp.EncodeAsPieces(l)
+        def encode(input):
+            return sp.EncodeAsPieces(input)
 
     elif args.output_format == "id":
 
-        def encode(l):
-            return list(map(str, sp.EncodeAsIds(l)))
+        def encode(input):
+            return list(map(str, sp.EncodeAsIds(input)))
 
     else:
         raise NotImplementedError
diff --git a/setup.cfg b/setup.cfg
index 3ea6243324..3fa679ddf1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,4 @@
 [flake8]
 max-line-length = 127
 extend-ignore = E203, W503
+extend-exclude = fairseq/model_parallel/megatron
diff --git a/tests/distributed/test_bmuf.py b/tests/distributed/test_bmuf.py
index 2a0f20d0be..995d0db180 100644
--- a/tests/distributed/test_bmuf.py
+++ b/tests/distributed/test_bmuf.py
@@ -140,7 +140,6 @@ def setup_args():
 @unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs")
 class TestBMUF(unittest.TestCase):
     def bmuf_process(self, cfg, args, iterations):
-        processes = []
         results = Manager().dict()
         torch.multiprocessing.spawn(
             fn=functools.partial(single_gpu_training, cfg, args),
diff --git a/tests/gpu/test_binaries_gpu.py b/tests/gpu/test_binaries_gpu.py
index 550e751b1f..1d5b6e62bc 100644
--- a/tests/gpu/test_binaries_gpu.py
+++ b/tests/gpu/test_binaries_gpu.py
@@ -399,6 +399,9 @@ def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=Fa
     train.main(quantize_args)
 
 
+@unittest.skipIf(
+    int(torch.__version__.split(".")[1]) < 10, reason="quantized kernels are only supported on CPU"
+)
 @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
 class TestQuantization(unittest.TestCase):
     def setUp(self):
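
The skipIf guard added above gates TestQuantization on the torch minor version. Note that character-indexing the version string (e.g. int(torch.__version__[2])) can never represent a two-digit minor such as "1.10", which is why the guard parses the string instead. A reusable sketch of that parsing — the helper name is ours, not fairseq API:

import re

import torch


def torch_version_tuple():
    """(major, minor) parsed from torch.__version__, tolerating suffixes like '1.10.0+cu111'."""
    major, minor = re.findall(r"\d+", torch.__version__)[:2]
    return int(major), int(minor)


# Equivalent to the guard above, but robust to "1.9" vs "1.10":
skip_quantization_tests = torch_version_tuple() < (1, 10)
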
diff --git a/tests/test_constraints.py b/tests/test_constraints.py
index 1c37f7e1fb..d766d5130f 100755
--- a/tests/test_constraints.py
+++ b/tests/test_constraints.py
@@ -3,11 +3,17 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-import sys
 import unittest
+from typing import List
 
 import torch
-from fairseq.token_generation_constraints import *
+
+from fairseq.token_generation_constraints import (
+    ConstraintNode,
+    OrderedConstraintState,
+    UnorderedConstraintState,
+    pack_constraints,
+)
 
 
 def tensorize(constraints: List[List[int]]) -> torch.Tensor:
@@ -53,7 +59,7 @@ def setUp(self):
         self.examples = [
             (
                 tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]),
-                "([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))",
+                "([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))",  # noqa
                 {1: 4, 2: 1, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1},
             ),
             ([], "[None].False#0", {}),
diff --git a/tests/test_file_io.py b/tests/test_file_io.py
index d1f33dadd1..af7c4cedb8 100644
--- a/tests/test_file_io.py
+++ b/tests/test_file_io.py
@@ -49,7 +49,7 @@ def test_file_io_oss(self):
     def test_file_io_async(self):
         # ioPath `PathManager` is initialized after the first `opena` call.
         try:
-            from fairseq.file_io import IOPathManager, PathManager
+            from fairseq.file_io import PathManager
 
             _asyncfile = os.path.join(self._tmpdir, "async.txt")
             f = PathManager.opena(_asyncfile, "wb")
diff --git a/tests/test_fp16_optimizer.py b/tests/test_fp16_optimizer.py
index ce4f1c055c..27085a12da 100644
--- a/tests/test_fp16_optimizer.py
+++ b/tests/test_fp16_optimizer.py
@@ -3,7 +3,6 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-import argparse
 import copy
 import logging
 import unittest
diff --git a/tests/test_multi_corpus_sampled_dataset.py b/tests/test_multi_corpus_sampled_dataset.py
index 05b20328c5..88f0817a54 100644
--- a/tests/test_multi_corpus_sampled_dataset.py
+++ b/tests/test_multi_corpus_sampled_dataset.py
@@ -79,7 +79,7 @@ def test_multi_corpus_sampled_dataset_uniform_sample(self):
 
     def test_multi_corpus_sampled_dataset_weighted_sample(self):
         def naive_weighted_sample(weights):
-            def f(l):
+            def f(input):
                 v = np.random.random()
                 agg = 0
                 for i, weight in enumerate(weights):
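
For context, the naive_weighted_sample closure renamed above draws a corpus index by inverse-CDF sampling over the corpus weights. The hunk only shows the first lines of the closure, so the following standalone version is a sketch of the idea rather than fairseq's exact code:

import numpy as np


def naive_weighted_sample(weights):
    def f(input):  # argument kept to match the callback shape; unused here
        v = np.random.random()
        agg = 0.0
        for i, weight in enumerate(weights):
            agg += weight
            if v < agg:  # first index whose cumulative weight passes the draw
                return i
        return len(weights) - 1  # guard against floating-point shortfall

    return f


sample = naive_weighted_sample([0.5, 0.2, 0.3])
counts = np.bincount([sample(None) for _ in range(10000)], minlength=3)
print(counts / counts.sum())  # approximately [0.5, 0.2, 0.3]
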
diff --git a/tests/test_reproducibility.py b/tests/test_reproducibility.py
index 94931b2a07..b285593272 100644
--- a/tests/test_reproducibility.py
+++ b/tests/test_reproducibility.py
@@ -3,12 +3,10 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-import contextlib
 import json
 import os
 import tempfile
 import unittest
-from io import StringIO
 
 import torch
 
diff --git a/tests/test_roberta.py b/tests/test_roberta.py
index b0b9cfd31e..0030d3e5a9 100644
--- a/tests/test_roberta.py
+++ b/tests/test_roberta.py
@@ -292,18 +292,18 @@ def test_roberta_incremental_decoder(self, device: str):
 
         # Decode with incremental state
         inc_state = {}
         ro_dec_inc = []
-        for l in range(tgt_len):
+        for i in range(tgt_len):
             ro, _ = model.decoder.forward(
-                ro_tokens[:, : l + 1], encoder_out=en_enc, incremental_state=inc_state
+                ro_tokens[:, : i + 1], encoder_out=en_enc, incremental_state=inc_state
             )
             self.assertEqual(ro.shape, (bs, 1, VOCAB_SIZE))
             ro_dec_inc.append(ro)
 
-        for l in range(tgt_len):
+        for i in range(tgt_len):
             # Intra-batch
-            self.assertTensorEqual(ro_dec_inc[l][0], ro_dec_inc[l][1])
+            self.assertTensorEqual(ro_dec_inc[i][0], ro_dec_inc[i][1])
             # Incremental vs non-incremental
-            self.assertTensorEqual(ro_dec_inc[l][:, 0], ro_dec[:, l])
+            self.assertTensorEqual(ro_dec_inc[i][:, 0], ro_dec[:, i])
 
 
 def params(model, name):
diff --git a/tests/test_sequence_generator.py b/tests/test_sequence_generator.py
index b9f91ffa76..a14d739898 100644
--- a/tests/test_sequence_generator.py
+++ b/tests/test_sequence_generator.py
@@ -320,7 +320,7 @@ def test_generation_with_additional_input(self):
         sample = self.sample.copy()
         sample["net_input"]["fancy_other_input"] = sample["net_input"]["src_tokens"]
         hypos = generator.forward(self.sample)
-        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
+        eos, w1 = self.tgt_dict.eos(), self.w1
         # sentence 1, beam 1
         self.assertHypoTokens(hypos[0][0], [w1, eos])
         self.assertHypoScore(hypos[0][0], [0.9, 1.0])
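
The renamed loop in test_roberta.py verifies the core invariant of incremental decoding: feeding one token at a time with cached state must reproduce the matching step of a full forward pass. A toy, self-contained illustration of that invariant, with a GRU standing in for the fairseq decoder:

import torch
import torch.nn as nn

torch.manual_seed(0)
emb = nn.Embedding(100, 8)
rnn = nn.GRU(8, 8, batch_first=True)
tokens = torch.randint(0, 100, (2, 5))  # (batch, tgt_len)

# Non-incremental: one pass over the full sequence.
full, _ = rnn(emb(tokens))

# Incremental: one token per step, carrying the hidden state forward.
state = None
steps = []
for i in range(tokens.size(1)):
    out, state = rnn(emb(tokens[:, i : i + 1]), state)
    steps.append(out)

for i in range(tokens.size(1)):
    # Incremental vs non-incremental, as asserted in the test.
    assert torch.allclose(steps[i][:, 0], full[:, i], atol=1e-6)
print("incremental decoding matches full decoding")
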