benchmarks/amoebanetd-speed/main.py

"""AmoebaNet-D (18, 256) Speed Benchmark"""
import platform
import time
from typing import Any, Callable, Dict, List, Optional, Tuple, cast

import click
import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import SGD
import torch.utils.data

from amoebanet import amoebanetd
import torchgpipe
from torchgpipe import GPipe

Stuffs = Tuple[nn.Module, int, List[torch.device]]  # (model, batch_size, devices)
Experiment = Callable[[nn.Module, List[int]], Stuffs]


def _gpipe(model: nn.Module,
           devices: List[int],
           batch_size: int,
           chunks: int,
           balance: List[int],
           checkpoint: str,
           ) -> Stuffs:
    model = cast(nn.Sequential, model)
    model = GPipe(model, balance, devices=devices, chunks=chunks, checkpoint=checkpoint)
    return model, batch_size, list(model.devices)


class Experiments:

    @staticmethod
    def n2m1(model: nn.Module, devices: List[int]) -> Stuffs:
        batch_size = 96
        chunks = 1
        balance = [7, 17]
        return _gpipe(model, devices, batch_size, chunks, balance, checkpoint='always')

    @staticmethod
    def n2m4(model: nn.Module, devices: List[int]) -> Stuffs:
        batch_size = 256
        chunks = 4
        balance = [9, 15]
        return _gpipe(model, devices, batch_size, chunks, balance, checkpoint='except_last')

    @staticmethod
    def n2m32(model: nn.Module, devices: List[int]) -> Stuffs:
        batch_size = 1280
        chunks = 32
        balance = [9, 15]
        return _gpipe(model, devices, batch_size, chunks, balance, checkpoint='except_last')

    @staticmethod
    def n4m1(model: nn.Module, devices: List[int]) -> Stuffs:
        batch_size = 160
        chunks = 1
        balance = [3, 4, 5, 12]
        return _gpipe(model, devices, batch_size, chunks, balance, checkpoint='always')

    @staticmethod
    def n4m4(model: nn.Module, devices: List[int]) -> Stuffs:
        batch_size = 360
        chunks = 4
        balance = [3, 6, 7, 8]
        return _gpipe(model, devices, batch_size, chunks, balance, checkpoint='except_last')

    @staticmethod
    def n4m32(model: nn.Module, devices: List[int]) -> Stuffs:
        batch_size = 1152
        chunks = 32
        balance = [3, 6, 7, 8]
        return _gpipe(model, devices, batch_size, chunks, balance, checkpoint='except_last')

    @staticmethod
    def n8m1(model: nn.Module, devices: List[int]) -> Stuffs:
        batch_size = 196
        chunks = 1
        balance = [2, 2, 2, 2, 2, 3, 4, 7]
        return _gpipe(model, devices, batch_size, chunks, balance, checkpoint='always')

    @staticmethod
    def n8m4(model: nn.Module, devices: List[int]) -> Stuffs:
        batch_size = 480
        chunks = 4
        balance = [2, 2, 2, 3, 3, 4, 4, 4]
        return _gpipe(model, devices, batch_size, chunks, balance, checkpoint='except_last')

    @staticmethod
    def n8m32(model: nn.Module, devices: List[int]) -> Stuffs:
        batch_size = 1280
        chunks = 32
        balance = [2, 2, 2, 3, 3, 4, 4, 4]
        return _gpipe(model, devices, batch_size, chunks, balance, checkpoint='except_last')


EXPERIMENTS: Dict[str, Experiment] = {
    'n2m1': Experiments.n2m1,
    'n2m4': Experiments.n2m4,
    'n2m32': Experiments.n2m32,
    'n4m1': Experiments.n4m1,
    'n4m4': Experiments.n4m4,
    'n4m32': Experiments.n4m32,
    'n8m1': Experiments.n8m1,
    'n8m4': Experiments.n8m4,
    'n8m32': Experiments.n8m32,
}


BASE_TIME: float = 0


def hr() -> None:
    """Prints a horizontal line."""
    width, _ = click.get_terminal_size()
    click.echo('-' * width)


def log(msg: str, clear: bool = False, nl: bool = True) -> None:
    """Prints a message with elapsed time."""
    if clear:
        # Clear the output line to overwrite.
        width, _ = click.get_terminal_size()
        click.echo('\b\r', nl=False)
        click.echo(' ' * width, nl=False)
        click.echo('\b\r', nl=False)

    t = time.time() - BASE_TIME
    h = t // 3600
    t %= 3600
    m = t // 60
    t %= 60
    s = t

    click.echo('%02d:%02d:%02d | ' % (h, m, s), nl=False)
    click.echo(msg, nl=nl)


def parse_devices(ctx: Any, param: Any, value: Optional[str]) -> List[int]:
    if value is None:
        return list(range(torch.cuda.device_count()))
    return [int(x) for x in value.split(',')]


@click.command()
@click.pass_context
@click.argument(
    'experiment',
    type=click.Choice(sorted(EXPERIMENTS.keys())),
)
@click.option(
    '--epochs', '-e',
    type=int,
    default=10,
    help='Number of epochs (default: 10)',
)
@click.option(
    '--skip-epochs', '-k',
    type=int,
    default=1,
    help='Number of epochs to skip in result (default: 1)',
)
@click.option(
    '--devices', '-d',
    metavar='0,1,2,3',
    callback=parse_devices,
    help='Device IDs to use (default: all CUDA devices)',
)
def cli(ctx: click.Context,
        experiment: str,
        epochs: int,
        skip_epochs: int,
        devices: List[int],
        ) -> None:
    """AmoebaNet-D (18, 256) Speed Benchmark"""
    if skip_epochs >= epochs:
        ctx.fail('--skip-epochs=%d must be less than --epochs=%d' % (skip_epochs, epochs))

    model: nn.Module = amoebanetd(num_classes=1000, num_layers=18, num_filters=256)

    f: Experiment = EXPERIMENTS[experiment]
    try:
        model, batch_size, _devices = f(model, devices)
    except ValueError as exc:
        # Examples:
        #   ValueError: too few devices to hold given partitions (devices: 1, paritions: 2)
        ctx.fail(str(exc))

    optimizer = SGD(model.parameters(), lr=0.1)

    in_device = _devices[0]
    out_device = _devices[-1]
    torch.cuda.set_device(in_device)

    # This experiment cares about only training speed, rather than accuracy.
    # To eliminate any overhead due to data loading, we use fake random 224x224
    # images over 1000 labels.
    dataset_size = 10000

    input = torch.rand(batch_size, 3, 224, 224, device=in_device)
    target = torch.randint(1000, (batch_size,), device=out_device)
    data = [(input, target)] * (dataset_size//batch_size)

    if dataset_size % batch_size != 0:
        last_input = input[:dataset_size % batch_size]
        last_target = target[:dataset_size % batch_size]
        data.append((last_input, last_target))

    # HEADER ======================================================================================

    title = f'{experiment}, {skip_epochs+1}-{epochs} epochs'
    click.echo(title)

    if isinstance(model, GPipe):
        click.echo(f'batch size: {batch_size}, chunks: {model.chunks}, '
                   f'balance: {model.balance}, checkpoint: {model.checkpoint}')
    else:
        click.echo(f'batch size: {batch_size}')

    click.echo('torchgpipe: %s, python: %s, torch: %s, cudnn: %s, cuda: %s, gpu: %s' % (
        torchgpipe.__version__,
        platform.python_version(),
        torch.__version__,
        torch.backends.cudnn.version(),
        torch.version.cuda,
        torch.cuda.get_device_name(in_device)))

    # TRAIN =======================================================================================

    global BASE_TIME
    BASE_TIME = time.time()

    def run_epoch(epoch: int) -> Tuple[float, float]:
        torch.cuda.synchronize(in_device)
        tick = time.time()

        data_trained = 0
        for i, (input, target) in enumerate(data):
            data_trained += input.size(0)

            output = model(input)
            loss = F.cross_entropy(output, target)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            # 00:01:02 | 1/20 epoch (42%) | 200.000 samples/sec (estimated)
            percent = (i+1) / len(data) * 100
            throughput = data_trained / (time.time()-tick)
            log('%d/%d epoch (%d%%) | %.3f samples/sec (estimated)'
                '' % (epoch+1, epochs, percent, throughput), clear=True, nl=False)

        torch.cuda.synchronize(in_device)
        tock = time.time()

        # 00:02:03 | 1/20 epoch | 200.000 samples/sec, 123.456 sec/epoch
        elapsed_time = tock - tick
        throughput = dataset_size / elapsed_time
        log('%d/%d epoch | %.3f samples/sec, %.3f sec/epoch'
            '' % (epoch+1, epochs, throughput, elapsed_time), clear=True)

        return throughput, elapsed_time

    throughputs = []
    elapsed_times = []

    hr()
    for epoch in range(epochs):
        throughput, elapsed_time = run_epoch(epoch)

        if epoch < skip_epochs:
            continue

        throughputs.append(throughput)
        elapsed_times.append(elapsed_time)
    hr()

    # RESULT ======================================================================================

    # pipeline-4, 2-10 epochs | 200.000 samples/sec, 123.456 sec/epoch (average)
    n = len(throughputs)
    throughput = sum(throughputs) / n
    elapsed_time = sum(elapsed_times) / n
    click.echo('%s | %.3f samples/sec, %.3f sec/epoch (average)'
               '' % (title, throughput, elapsed_time))


if __name__ == '__main__':
    cli()