Skip to content

Commit

Permalink
[testing] add beit model for unit testings (hpcaitech#2196)
Browse files Browse the repository at this point in the history
* [testing] add beit model

* [beit] fix bugs

* [beit] fix bugs

* [testing] fix bugs
  • Loading branch information
1SAA authored Dec 26, 2022
1 parent 5682e6d commit a3100bd
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 7 deletions.
3 changes: 2 additions & 1 deletion tests/components_to_test/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import (
beit,
bert,
gpt2,
hanging_param_model,
Expand All @@ -14,5 +15,5 @@

__all__ = [
'bert', 'gpt2', 'hanging_param_model', 'inline_op_model', 'nested_model', 'repeated_computed_layers', 'resnet',
'simple_net', 'run_fwd_bwd', 'albert'
'simple_net', 'run_fwd_bwd', 'albert', 'beit'
]
42 changes: 42 additions & 0 deletions tests/components_to_test/beit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import torch
from timm.models.beit import Beit

from colossalai.utils.cuda import get_current_device

from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator


class DummyDataLoader(DummyDataGenerator):
img_size = 64
num_channel = 3
num_class = 10
batch_size = 4

def generate(self):
data = torch.randn((DummyDataLoader.batch_size, DummyDataLoader.num_channel, DummyDataLoader.img_size,
DummyDataLoader.img_size),
device=get_current_device())
label = torch.randint(low=0,
high=DummyDataLoader.num_class,
size=(DummyDataLoader.batch_size,),
device=get_current_device())
return data, label


@non_distributed_component_funcs.register(name='beit')
def get_training_components():

def model_buider(checkpoint=False):
model = Beit(img_size=DummyDataLoader.img_size,
num_classes=DummyDataLoader.num_class,
embed_dim=32,
depth=2,
num_heads=4)
return model

trainloader = DummyDataLoader()
testloader = DummyDataLoader()

criterion = torch.nn.CrossEntropyLoss()
return model_buider, trainloader, testloader, torch.optim.Adam, criterion
4 changes: 2 additions & 2 deletions tests/test_gemini/update/test_optim.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# this model is large enough to slice to chunks
TEST_MODELS = ['gpt2']
# these models are too small, all parameters in these models are compacted into one chunk
EXAMPLE_MODELS = ['albert', 'hanging_param_model', 'bert', 'simple_net', 'nested_model', 'repeated_computed_layers']
EXAMPLE_MODELS = ['albert', 'beit', 'bert', 'hanging_param_model', 'nested_model', 'repeated_computed_layers']


def check_param(model: ZeroDDP, torch_model: torch.nn.Module):
Expand Down Expand Up @@ -142,7 +142,7 @@ def exam_tiny_example(placement_policy, model_name: str):

torch_loss = run_fwd_bwd(torch_model, input_ids, label, criterion, torch_optim)
loss = run_fwd_bwd(model, input_ids, label, criterion, zero_optim)
assert_close(torch_loss, loss)
assert_close(torch_loss, loss, rtol=1.5e-6, atol=2e-5) # atol should be 2e-5 for torch lower than 1.12

zero_optim.step()
torch_optim.step()
Expand Down
7 changes: 5 additions & 2 deletions tests/test_tensor/common_utils/_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import os
import random

import numpy as np
import torch
import torch.distributed as dist
from colossalai.core import global_context as gpc

from colossalai.context import ParallelMode
from colossalai.tensor import ShardSpec, ComputeSpec, ComputePattern
from colossalai.core import global_context as gpc
from colossalai.tensor import ComputePattern, ComputeSpec, ShardSpec


def set_seed(seed):
Expand All @@ -15,6 +17,7 @@ def set_seed(seed):
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


def check_equal(A, B):
Expand Down
9 changes: 7 additions & 2 deletions tests/test_zero/test_init_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@
def run_model_test(init_device_type, shard_strategy_class):
logger = get_dist_logger("test_zero_init")

for get_components_func in non_distributed_component_funcs:
for name, get_components_func in non_distributed_component_funcs._registry.items():
# because the ZeroInitContext automatically turns parameters to fp16
# and the beit model use tensor.erfinv_() function to initialize weights
# tensor.erfinv_() doesn't support Half in CPU, we omit the beit model
if name == 'beit':
continue
model_builder, _, _, _, _ = get_components_func()
if init_device_type == 'cuda':
init_device = get_current_device()
Expand Down Expand Up @@ -70,4 +75,4 @@ def test_zero_init_context(world_size):


if __name__ == '__main__':
test_zero_init_context(4)
test_zero_init_context(1)

0 comments on commit a3100bd

Please sign in to comment.