diff --git a/.circleci/config.yml b/.circleci/config.yml
index 25f3a73881..de40a6e9c5 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -10,7 +10,7 @@ gpu: &gpu
   machine:
     image: ubuntu-1604-cuda-11.1:202012-01
   resource_class: gpu.nvidia.medium.multi
-
+
 
 # -------------------------------------------------------------------------------------
 # Re-usable commands
@@ -25,7 +25,7 @@ install_dep_common: &install_dep_common
         pip install --upgrade setuptools
         pip install bitarray boto3 deepspeed editdistance fastBPE iopath ipdb ipython pyarrow pytest sacremoses sentencepiece subword-nmt hydra-core==1.0.7 omegaconf==2.0.6
         pip install --progress-bar off pytest
-        pip install --progress-bar off fairscale==0.4.1
+        pip install --progress-bar off fairscale
        pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U
         python -c 'import torch; print("Torch version:", torch.__version__)'
         python -m torch.utils.collect_env
@@ -38,6 +38,7 @@ install_dep_fused_ops: &install_dep_fused_ops
         source activate fairseq
         git clone https://github.com/NVIDIA/apex
         cd apex
+        git checkout e2083df5eb96643c61613b9df48dd4eea6b07690
         pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
         cd ~/
         git clone --depth=1 --branch v2.4 https://github.com/NVIDIA/Megatron-LM.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f9efb2e96e..4817f6e876 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
       - id: end-of-file-fixer
 
 - repo: https://github.com/ambv/black
-  rev: 20.8b1
+  rev: 21.12b0
   hooks:
     - id: black
       language_version: python3.8
diff --git a/fairseq/logging/meters.py b/fairseq/logging/meters.py
index 2100b1fa0b..d5f7c775d9 100644
--- a/fairseq/logging/meters.py
+++ b/fairseq/logging/meters.py
@@ -8,7 +8,6 @@
 from collections import OrderedDict
 from typing import Dict, Optional
 
-
 try:
     import torch
 
@@ -18,7 +17,6 @@ def type_as(a, b):
         else:
             return a
 
-
 except ImportError:
     torch = None
 
diff --git a/fairseq/models/speech_to_text/modules/emformer.py b/fairseq/models/speech_to_text/modules/emformer.py
index 6233546ab8..70339788f7 100644
--- a/fairseq/models/speech_to_text/modules/emformer.py
+++ b/fairseq/models/speech_to_text/modules/emformer.py
@@ -14,23 +14,30 @@
 import torch
 import torch.nn as nn
-from fairseq.models import (
-    FairseqEncoder,
-)
+from torch import Tensor
+from torch import device as Device
+
+from fairseq.models import FairseqEncoder
 from fairseq.models.speech_to_text.utils import (
     NoOp,
-    lengths_to_padding_mask,
-    segments_to_sequence,
-)
-from fairseq.models.speech_to_text.utils import (
     attention_suppression,
     layer_norm_backward_hook,
+    lengths_to_padding_mask,
+    segments_to_sequence,
 )
-from torch import Tensor, device as Device
-from torch.ao.quantization.qconfig import (
-    default_dynamic_qconfig,
-    per_channel_dynamic_qconfig,
-)
+
+try:
+    import torch.ao.quantization as quantization
+    from torch.ao.quantization.qconfig import (
+        default_dynamic_qconfig,
+        per_channel_dynamic_qconfig,
+    )
+except ImportError:
+    import torch.quantization as quantization
+    from torch.quantization.qconfig import (
+        default_dynamic_qconfig,
+        per_channel_dynamic_qconfig,
+    )
 
 
 class RelativePositionEmbedding(nn.Module):
@@ -140,7 +147,7 @@ def quantize_(self, params=None):
             qconfig = per_channel_dynamic_qconfig
         else:
             qconfig = default_dynamic_qconfig
-        torch.ao.quantization.quantize_dynamic(
+        quantization.quantize_dynamic(
             self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
         )
         return self
@@ -728,7 +735,7 @@ def quantize_(self, params=None):
             qconfig = per_channel_dynamic_qconfig
         else:
             qconfig = default_dynamic_qconfig
-        torch.ao.quantization.quantize_dynamic(
+        quantization.quantize_dynamic(
             self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
         )
         return self
@@ -1771,7 +1778,7 @@ def quantize_(self, params=None):
             qconfig = per_channel_dynamic_qconfig
         else:
             qconfig = default_dynamic_qconfig
-        torch.ao.quantization.quantize_dynamic(
+        quantization.quantize_dynamic(
             self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
         )
         return self
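
PyTorch 1.10 renamed the torch.quantization namespace to torch.ao.quantization, so the try/except import above lets emformer.py run on either side of the rename. Below is a minimal, self-contained sketch of the same pattern applied to a toy module (TinyEncoder is hypothetical, not part of fairseq):

import torch
import torch.nn as nn

try:  # PyTorch >= 1.10
    import torch.ao.quantization as quantization
    from torch.ao.quantization.qconfig import default_dynamic_qconfig
except ImportError:  # PyTorch < 1.10
    import torch.quantization as quantization
    from torch.quantization.qconfig import default_dynamic_qconfig


class TinyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(16, 16)

    def forward(self, x):
        return self.proj(x).relu()


model = TinyEncoder()
# In-place dynamic quantization of every nn.Linear, mirroring the
# quantize_() methods changed in emformer.py above.
quantization.quantize_dynamic(
    model, {torch.nn.Linear: default_dynamic_qconfig}, dtype=torch.qint8, inplace=True
)
print(type(model.proj))  # a dynamically quantized Linear on either torch version
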
diff --git a/fairseq/modules/cross_entropy.py b/fairseq/modules/cross_entropy.py
index 6f33c24cb5..286c00eecc 100644
--- a/fairseq/modules/cross_entropy.py
+++ b/fairseq/modules/cross_entropy.py
@@ -8,7 +8,6 @@
 import torch
 import torch.nn.functional as F
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -54,7 +53,6 @@ def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
         else:
             raise NotImplementedError
 
-
 except ImportError:
 
     def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
diff --git a/fairseq/modules/layer_norm.py b/fairseq/modules/layer_norm.py
index 234609d9e2..78332d1749 100644
--- a/fairseq/modules/layer_norm.py
+++ b/fairseq/modules/layer_norm.py
@@ -7,7 +7,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-
 try:
     from apex.normalization import FusedLayerNorm as _FusedLayerNorm
 
@@ -22,7 +21,6 @@ def forward(self, x):
         with torch.cuda.device(x.device):
             return super().forward(x)
 
-
 except ImportError:
     has_fused_layernorm = False
diff --git a/fairseq/modules/quantization/scalar/ops.py b/fairseq/modules/quantization/scalar/ops.py
index 9144083ac7..ad1e14e051 100644
--- a/fairseq/modules/quantization/scalar/ops.py
+++ b/fairseq/modules/quantization/scalar/ops.py
@@ -5,6 +5,11 @@
 
 import torch
 
+try:
+    import torch.ao.quantization as quantization
+except ImportError:
+    import torch.quantization as quantization
+
 
 def emulate_int(w, bits, method, scale=None, zero_point=None):
     q = globals()[f"emulate_int8_{method}"]
@@ -21,7 +26,7 @@ def quantize(w, scale, zero_point, bits=8):
 
 def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):
     if scale is None:
-        obs = torch.ao.quantization.observer.HistogramObserver()
+        obs = quantization.observer.HistogramObserver()
         obs.to(device=w.device)
         _ = obs(w.float())
         scale, zero_point = obs.calculate_qparams()
@@ -32,7 +37,7 @@ def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):
 
 def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):
     if scale is None:
-        obs = torch.ao.quantization.observer.PerChannelMinMaxObserver(
+        obs = quantization.observer.PerChannelMinMaxObserver(
             ch_axis=-1, qscheme=torch.per_channel_symmetric
         )
         obs.to(device=w.device)
@@ -45,7 +50,7 @@ def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):
 
 def emulate_int8_tensor(w, scale=None, zero_point=None, bits=8):
     if scale is None:
-        obs = torch.ao.quantization.observer.MinMaxObserver()
+        obs = quantization.observer.MinMaxObserver()
         obs.to(device=w.device)
         _ = obs(w)
         scale, zero_point = obs.calculate_qparams()
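
All three emulate_int8_* paths above share the same mechanics: an observer watches a tensor, derives (scale, zero_point), and the weights are round-tripped through the int8 grid. A self-contained sketch of that flow using the same quantization alias — the round-trip lines paraphrase ops.py's quantize() helper rather than copying it:

import torch

try:
    import torch.ao.quantization as quantization
except ImportError:
    import torch.quantization as quantization

w = torch.randn(64, 64)

# MinMaxObserver tracks the min/max of observed tensors and derives an
# affine uint8 mapping (quant range 0..255 by default).
obs = quantization.observer.MinMaxObserver()
_ = obs(w)
scale, zero_point = obs.calculate_qparams()

# Emulate the int8 round trip: quantize, then dequantize.
q = torch.clamp(torch.round(w / scale) + zero_point, 0, 255)
w_hat = (q - zero_point) * scale
print((w - w_hat).abs().max())  # roughly bounded by scale / 2
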
diff --git a/fairseq/optim/fused_adam.py b/fairseq/optim/fused_adam.py
index da872033d1..1290ecfdbf 100644
--- a/fairseq/optim/fused_adam.py
+++ b/fairseq/optim/fused_adam.py
@@ -27,8 +27,8 @@ def get_fused_adam_class():
     except ImportError:
         try:
             # fallback to the newer interface
-            from apex.optimizers import FusedAdam as _FusedAdam  # noqa
             from apex.multi_tensor_apply import multi_tensor_applier
+            from apex.optimizers import FusedAdam as _FusedAdam  # noqa
 
             if multi_tensor_applier.available:
                 return FusedAdamV2
@@ -252,8 +252,8 @@ def inf_norm(t):
 
 
 try:
-    from apex.optimizers import FusedAdam
     from apex.multi_tensor_apply import multi_tensor_applier
+    from apex.optimizers import FusedAdam
 
     class FusedAdamV2(FusedAdam):
         """
@@ -382,6 +382,5 @@ def step(
 
         return loss
 
-
 except ImportError:
     pass
diff --git a/scripts/average_checkpoints.py b/scripts/average_checkpoints.py
index c512f802bc..a4711e4840 100644
--- a/scripts/average_checkpoints.py
+++ b/scripts/average_checkpoints.py
@@ -108,16 +108,18 @@ def main():
                         help='Write the new checkpoint containing the averaged weights to this path.')
     num_group = parser.add_mutually_exclusive_group()
     num_group.add_argument('--num-epoch-checkpoints', type=int,
-                           help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
-                           'and average last this many of them.')
+                           help='if set, will try to find checkpoints with names checkpoint_xx.pt in the '
+                           'path specified by input, and average last this many of them.')
     num_group.add_argument('--num-update-checkpoints', type=int,
-                           help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
-                           'and average last this many of them.')
+                           help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by'
+                           ' input, and average last this many of them.')
     parser.add_argument('--checkpoint-upper-bound', type=int,
                         help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
                         'when using --num-update-checkpoints, this will set an upper bound on which update to use'
-                        'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
-                        'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500'
+                        'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be'
+                        ' averaged.'
+                        'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would'
+                        ' be averaged assuming --save-interval-updates 500'
                         )
     # fmt: on
     args = parser.parse_args()
diff --git a/scripts/constraints/extract.py b/scripts/constraints/extract.py
index f6155d0a05..437b373856 100755
--- a/scripts/constraints/extract.py
+++ b/scripts/constraints/extract.py
@@ -11,8 +11,6 @@
 import random
 import sys
 
-from sacrebleu import extract_ngrams
-
 
 def get_phrase(words, index, length):
     assert index < len(words) - length + 1
diff --git a/scripts/spm_decode.py b/scripts/spm_decode.py
index 1c18b1d2a7..7d7b68b240 100644
--- a/scripts/spm_decode.py
+++ b/scripts/spm_decode.py
@@ -26,13 +26,13 @@ def main():
 
     if args.input_format == "piece":
 
-        def decode(l):
-            return "".join(sp.DecodePieces(l))
+        def decode(input):
+            return "".join(sp.DecodePieces(input))
 
     elif args.input_format == "id":
 
-        def decode(l):
-            return "".join(sp.DecodeIds(l))
+        def decode(input):
+            return "".join(sp.DecodeIds(input))
 
     else:
         raise NotImplementedError
diff --git a/scripts/spm_encode.py b/scripts/spm_encode.py
index 83facfb3b1..f91e0bb728 100644
--- a/scripts/spm_encode.py
+++ b/scripts/spm_encode.py
@@ -49,13 +49,13 @@ def main():
 
     if args.output_format == "piece":
 
-        def encode(l):
-            return sp.EncodeAsPieces(l)
+        def encode(input):
+            return sp.EncodeAsPieces(input)
 
     elif args.output_format == "id":
 
-        def encode(l):
-            return list(map(str, sp.EncodeAsIds(l)))
+        def encode(input):
+            return list(map(str, sp.EncodeAsIds(input)))
 
     else:
         raise NotImplementedError
diff --git a/setup.cfg b/setup.cfg
index 3ea6243324..3fa679ddf1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,4 @@
 [flake8]
 max-line-length = 127
 extend-ignore = E203, W503
+extend-exclude = fairseq/model_parallel/megatron
diff --git a/tests/distributed/test_bmuf.py b/tests/distributed/test_bmuf.py
index 2a0f20d0be..995d0db180 100644
--- a/tests/distributed/test_bmuf.py
+++ b/tests/distributed/test_bmuf.py
@@ -140,7 +140,6 @@ def setup_args():
 @unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs")
 class TestBMUF(unittest.TestCase):
     def bmuf_process(self, cfg, args, iterations):
-        processes = []
         results = Manager().dict()
         torch.multiprocessing.spawn(
             fn=functools.partial(single_gpu_training, cfg, args),
diff --git a/tests/gpu/test_binaries_gpu.py b/tests/gpu/test_binaries_gpu.py
index 550e751b1f..1d5b6e62bc 100644
--- a/tests/gpu/test_binaries_gpu.py
+++ b/tests/gpu/test_binaries_gpu.py
@@ -399,6 +399,9 @@ def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=Fa
     train.main(quantize_args)
 
 
+@unittest.skipIf(
+    int(torch.__version__.split(".")[1]) < 10, reason="quantized kernels are only supported on CPU"
+)
 @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
 class TestQuantization(unittest.TestCase):
     def setUp(self):
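
The skipIf guard added above gates TestQuantization on the torch minor version. Note that character-indexing the version string (e.g. int(torch.__version__[2])) can never represent a two-digit minor such as "1.10", which is why the guard parses the string instead. A reusable sketch of that parsing — the helper name is ours, not fairseq API:

import re

import torch


def torch_version_tuple():
    """(major, minor) parsed from torch.__version__, tolerating suffixes like '1.10.0+cu111'."""
    major, minor = re.findall(r"\d+", torch.__version__)[:2]
    return int(major), int(minor)


# Equivalent to the guard above, but robust to "1.9" vs "1.10":
skip_quantization_tests = torch_version_tuple() < (1, 10)
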
diff --git a/tests/test_constraints.py b/tests/test_constraints.py
index 1c37f7e1fb..d766d5130f 100755
--- a/tests/test_constraints.py
+++ b/tests/test_constraints.py
@@ -3,11 +3,17 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-import sys
 import unittest
+from typing import List
 
 import torch
-from fairseq.token_generation_constraints import *
+
+from fairseq.token_generation_constraints import (
+    ConstraintNode,
+    OrderedConstraintState,
+    UnorderedConstraintState,
+    pack_constraints,
+)
 
 
 def tensorize(constraints: List[List[int]]) -> torch.Tensor:
@@ -53,7 +59,7 @@ def setUp(self):
         self.examples = [
             (
                 tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]),
-                "([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))",
+                "([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))",  # noqa
                 {1: 4, 2: 1, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1},
             ),
             ([], "[None].False#0", {}),
diff --git a/tests/test_file_io.py b/tests/test_file_io.py
index d1f33dadd1..af7c4cedb8 100644
--- a/tests/test_file_io.py
+++ b/tests/test_file_io.py
@@ -49,7 +49,7 @@ def test_file_io_oss(self):
     def test_file_io_async(self):
         # ioPath `PathManager` is initialized after the first `opena` call.
         try:
-            from fairseq.file_io import IOPathManager, PathManager
+            from fairseq.file_io import PathManager
 
             _asyncfile = os.path.join(self._tmpdir, "async.txt")
             f = PathManager.opena(_asyncfile, "wb")
diff --git a/tests/test_fp16_optimizer.py b/tests/test_fp16_optimizer.py
index ce4f1c055c..27085a12da 100644
--- a/tests/test_fp16_optimizer.py
+++ b/tests/test_fp16_optimizer.py
@@ -3,7 +3,6 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-import argparse
 import copy
 import logging
 import unittest
diff --git a/tests/test_multi_corpus_sampled_dataset.py b/tests/test_multi_corpus_sampled_dataset.py
index 05b20328c5..88f0817a54 100644
--- a/tests/test_multi_corpus_sampled_dataset.py
+++ b/tests/test_multi_corpus_sampled_dataset.py
@@ -79,7 +79,7 @@ def test_multi_corpus_sampled_dataset_uniform_sample(self):
 
     def test_multi_corpus_sampled_dataset_weighted_sample(self):
         def naive_weighted_sample(weights):
-            def f(l):
+            def f(input):
                 v = np.random.random()
                 agg = 0
                 for i, weight in enumerate(weights):
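
For context, the naive_weighted_sample closure renamed above draws a corpus index by inverse-CDF sampling over the corpus weights. The hunk only shows the first lines of the closure, so the following standalone version is a sketch of the idea rather than fairseq's exact code:

import numpy as np


def naive_weighted_sample(weights):
    def f(input):  # argument kept to match the callback shape; unused here
        v = np.random.random()
        agg = 0.0
        for i, weight in enumerate(weights):
            agg += weight
            if v < agg:  # first index whose cumulative weight passes the draw
                return i
        return len(weights) - 1  # guard against floating-point shortfall

    return f


sample = naive_weighted_sample([0.5, 0.2, 0.3])
counts = np.bincount([sample(None) for _ in range(10000)], minlength=3)
print(counts / counts.sum())  # approximately [0.5, 0.2, 0.3]
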
diff --git a/tests/test_reproducibility.py b/tests/test_reproducibility.py
index 94931b2a07..b285593272 100644
--- a/tests/test_reproducibility.py
+++ b/tests/test_reproducibility.py
@@ -3,12 +3,10 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-import contextlib
 import json
 import os
 import tempfile
 import unittest
-from io import StringIO
 
 import torch
 
diff --git a/tests/test_roberta.py b/tests/test_roberta.py
index b0b9cfd31e..0030d3e5a9 100644
--- a/tests/test_roberta.py
+++ b/tests/test_roberta.py
@@ -292,18 +292,18 @@ def test_roberta_incremental_decoder(self, device: str):
 
         # Decode with incremental state
         inc_state = {}
         ro_dec_inc = []
-        for l in range(tgt_len):
+        for i in range(tgt_len):
             ro, _ = model.decoder.forward(
-                ro_tokens[:, : l + 1], encoder_out=en_enc, incremental_state=inc_state
+                ro_tokens[:, : i + 1], encoder_out=en_enc, incremental_state=inc_state
             )
             self.assertEqual(ro.shape, (bs, 1, VOCAB_SIZE))
             ro_dec_inc.append(ro)
 
-        for l in range(tgt_len):
+        for i in range(tgt_len):
             # Intra-batch
-            self.assertTensorEqual(ro_dec_inc[l][0], ro_dec_inc[l][1])
+            self.assertTensorEqual(ro_dec_inc[i][0], ro_dec_inc[i][1])
             # Incremental vs non-incremental
-            self.assertTensorEqual(ro_dec_inc[l][:, 0], ro_dec[:, l])
+            self.assertTensorEqual(ro_dec_inc[i][:, 0], ro_dec[:, i])
 
 
 def params(model, name):
diff --git a/tests/test_sequence_generator.py b/tests/test_sequence_generator.py
index b9f91ffa76..a14d739898 100644
--- a/tests/test_sequence_generator.py
+++ b/tests/test_sequence_generator.py
@@ -320,7 +320,7 @@ def test_generation_with_additional_input(self):
         sample = self.sample.copy()
         sample["net_input"]["fancy_other_input"] = sample["net_input"]["src_tokens"]
         hypos = generator.forward(self.sample)
-        eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
+        eos, w1 = self.tgt_dict.eos(), self.w1
         # sentence 1, beam 1
         self.assertHypoTokens(hypos[0][0], [w1, eos])
         self.assertHypoScore(hypos[0][0], [0.9, 1.0])
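
The renamed loop in test_roberta.py verifies the core invariant of incremental decoding: feeding one token at a time with cached state must reproduce the matching step of a full forward pass. A toy, self-contained illustration of that invariant, with a GRU standing in for the fairseq decoder:

import torch
import torch.nn as nn

torch.manual_seed(0)
emb = nn.Embedding(100, 8)
rnn = nn.GRU(8, 8, batch_first=True)
tokens = torch.randint(0, 100, (2, 5))  # (batch, tgt_len)

# Non-incremental: one pass over the full sequence.
full, _ = rnn(emb(tokens))

# Incremental: one token per step, carrying the hidden state forward.
state = None
steps = []
for i in range(tokens.size(1)):
    out, state = rnn(emb(tokens[:, i : i + 1]), state)
    steps.append(out)

for i in range(tokens.size(1)):
    # Incremental vs non-incremental, as asserted in the test.
    assert torch.allclose(steps[i][:, 0], full[:, i], atol=1e-6)
print("incremental decoding matches full decoding")
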