[setup] support pre-build and jit-build of cuda kernels (hpcaitech#2374)
* [setup] support pre-build and jit-build of cuda kernels

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
FrankLeeeee authored Jan 6, 2023
1 parent 12c8bf3 commit 40d376c
Showing 36 changed files with 418 additions and 394 deletions.
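Every touched module follows the same pattern: try to import the kernel that was pre-built into the `colossalai._C` package at install time, and fall back to a just-in-time build through the matching `op_builder` class only when that import fails. Condensed from the `_fp16_optimizer.py` hunk further down, the pattern looks like this:

```python
# Pre-build / JIT-build fallback for the fused optimizer kernel
# (names taken directly from the diff below).
try:
    from colossalai._C import fused_optim   # pre-built during installation
except ImportError:
    fused_optim = None


def load_fused_optim():
    # JIT-compile the kernel the first time it is actually needed
    global fused_optim
    if fused_optim is None:
        from colossalai.kernel.op_builder import FusedOptimBuilder
        fused_optim = FusedOptimBuilder().load()
```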
7 changes: 7 additions & 0 deletions .gitignore
@@ -144,3 +144,10 @@ docs/.build
 
 # ignore version.py generated by setup.py
 colossalai/version.py
+
+# ignore any kernel build files
+.o
+.so
+
+# ignore python interface defition file
+.pyi
Empty file added colossalai/_C/__init__.py
Empty file.
9 changes: 0 additions & 9 deletions colossalai/_C/__init__.pyi

This file was deleted.

8 changes: 0 additions & 8 deletions colossalai/_C/cpu_optim.pyi

This file was deleted.

23 changes: 0 additions & 23 deletions colossalai/_C/fused_optim.pyi

This file was deleted.

11 changes: 0 additions & 11 deletions colossalai/_C/layer_norm.pyi

This file was deleted.

20 changes: 0 additions & 20 deletions colossalai/_C/moe.pyi

This file was deleted.

55 changes: 0 additions & 55 deletions colossalai/_C/multihead_attention.pyi

This file was deleted.

12 changes: 0 additions & 12 deletions colossalai/_C/scaled_masked_softmax.pyi

This file was deleted.

8 changes: 0 additions & 8 deletions colossalai/_C/scaled_upper_triang_masked_softmax.pyi

This file was deleted.

16 changes: 15 additions & 1 deletion colossalai/amp/naive_amp/_fp16_optimizer.py
@@ -8,16 +8,28 @@
 
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.kernel import fused_optim
+from colossalai.kernel.op_builder import FusedOptimBuilder
 from colossalai.logging import get_dist_logger
 from colossalai.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes, multi_tensor_applier
 
 from ._utils import has_inf_or_nan, zero_gard_by_list
 from .grad_scaler import BaseGradScaler
 
+try:
+    from colossalai._C import fused_optim
+except:
+    fused_optim = None
+
 __all__ = ['FP16Optimizer']
 
 
+def load_fused_optim():
+    global fused_optim
+
+    if fused_optim is None:
+        fused_optim = FusedOptimBuilder().load()
+
+
 def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
     """
     adapted from Megatron-LM (https://github.com/NVIDIA/Megatron-LM)
@@ -30,6 +42,8 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
     if overflow_buf:
         overflow_buf.fill_(0)
         # Scaling with factor `1.0` is equivalent to copy.
+        global fused_optim
+        load_fused_optim()
         multi_tensor_applier(fused_optim.multi_tensor_scale, overflow_buf, [this, that], 1.0)
     else:
         for this_, that_ in zip(this, that):
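For context, a minimal usage sketch of what the deferred load enables at the call site: `multi_tensor_scale` casts and copies a whole list of tensors in a single fused kernel launch, and the kernel is only built on first use. The tensor shapes and values here are illustrative, and a CUDA device plus a working build toolchain are assumed.

```python
import torch

from colossalai.kernel.op_builder import FusedOptimBuilder
from colossalai.utils import multi_tensor_applier

# Loads the pre-built kernel if present, otherwise JIT-builds it.
fused_optim = FusedOptimBuilder().load()

overflow_buf = torch.cuda.IntTensor([0])
this = [torch.randn(1024, device='cuda') for _ in range(4)]                       # fp32 sources
that = [torch.empty(1024, device='cuda', dtype=torch.float16) for _ in range(4)]  # fp16 destinations

# Scaling with factor 1.0 is equivalent to a copy, as noted in the hunk above.
multi_tensor_applier(fused_optim.multi_tensor_scale, overflow_buf, [this, that], 1.0)
```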
35 changes: 0 additions & 35 deletions colossalai/kernel/__init__.py
@@ -1,42 +1,7 @@
 from .cuda_native import FusedScaleMaskSoftmax, LayerNorm, MultiHeadAttention
 
-try:
-    from colossalai._C import fused_optim
-except:
-    from colossalai.kernel.op_builder.fused_optim import FusedOptimBuilder
-    fused_optim = FusedOptimBuilder().load()
-
-try:
-    from colossalai._C import cpu_optim
-except ImportError:
-    from colossalai.kernel.op_builder import CPUAdamBuilder
-    cpu_optim = CPUAdamBuilder().load()
-
-try:
-    from colossalai._C import multihead_attention
-except ImportError:
-    from colossalai.kernel.op_builder import MultiHeadAttnBuilder
-    multihead_attention = MultiHeadAttnBuilder().load()
-
-try:
-    from colossalai._C import scaled_upper_triang_masked_softmax
-except ImportError:
-    from colossalai.kernel.op_builder import ScaledSoftmaxBuilder
-    scaled_upper_triang_masked_softmax = ScaledSoftmaxBuilder().load()
-
-try:
-    from colossalai._C import moe
-except ImportError:
-    from colossalai.kernel.op_builder import MOEBuilder
-    moe = MOEBuilder().load()
-
 __all__ = [
-    "fused_optim",
-    "cpu_optim",
-    "multihead_attention",
-    "moe",
     "LayerNorm",
     "FusedScaleMaskSoftmax",
     "MultiHeadAttention",
-    "scaled_upper_triang_masked_softmax",
 ]
3 changes: 2 additions & 1 deletion colossalai/kernel/cuda_native/multihead_attention.py
@@ -135,7 +135,8 @@ def __init__(self, hidden_size, nhead, batch_size, max_seq_len, dropout=0.0, nor
         # Load cuda modules if needed
         global colossal_multihead_attention
         if colossal_multihead_attention is None:
-            from colossalai.kernel import multihead_attention
+            from colossalai.kernel.op_builder import MultiHeadAttnBuilder
+            multihead_attention = MultiHeadAttnBuilder().load()
             colossal_multihead_attention = multihead_attention
 
         # create the layer in cuda kernels.
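A hedged usage sketch (not part of the diff): after this change, it is the first construction of the layer that resolves the kernel, preferring a pre-built `colossalai._C.multihead_attention` extension and otherwise JIT-building it. The constructor arguments mirror the signature in the hunk header; the concrete values and the availability of a CUDA device are assumptions.

```python
from colossalai.kernel.cuda_native import MultiHeadAttention

# Instantiation triggers MultiHeadAttnBuilder().load() when no pre-built
# extension is found; later instances reuse the cached module.
attn = MultiHeadAttention(hidden_size=1024, nhead=16, batch_size=8,
                          max_seq_len=256, dropout=0.1).cuda()
```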
29 changes: 28 additions & 1 deletion colossalai/nn/layer/moe/_operation.py
@@ -6,13 +6,32 @@
 from torch.distributed import ProcessGroup
 
 COL_MOE_KERNEL_FLAG = False
-from colossalai.kernel import moe
+
+try:
+    from colossalai._C import moe
+except:
+    moe = None
+
+
+def build_moe_if_not_prebuilt():
+    # load moe kernel during runtime if not pre-built
+    global moe
+    if moe is None:
+        from colossalai.kernel.op_builder import MOEBuilder
+        moe = MOEBuilder().load()
 
 
 class AllGather(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx: Any, inputs: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
+
+        global moe
+
+        if moe is None:
+            from colossalai.kernel.op_builder import MOEBuilder
+            moe = MOEBuilder().load()
+
         if ctx is not None:
             ctx.comm_grp = group
 
@@ -85,6 +104,9 @@ def forward(ctx, tokens, mask, dest_idx, ec):
         s = tokens.size(0)
         h = tokens.size(1)
 
+        # load moe kernel during runtime if not pre-built
+        build_moe_if_not_prebuilt()
+
         expert_input = moe.dispatch_forward(s, ec, h, tokens, mask, dest_idx)
 
         ctx.save_for_backward(mask, dest_idx)
@@ -112,6 +134,9 @@ def forward(ctx, expert_tokens, logits, mask, dest_idx, ec):
         c = ec // e
         h = expert_tokens.size(-1)
 
+        # load moe kernel during runtime if not pre-built
+        build_moe_if_not_prebuilt()
+
         fp16_flag = (expert_tokens.dtype == torch.float16)
         cb_input = expert_tokens.to(torch.float32) if fp16_flag else expert_tokens
         ctokens = moe.combine_forward(s, e, c, h, cb_input, logits, mask, dest_idx)
@@ -143,6 +168,8 @@ def moe_cumsum(inputs: Tensor):
     dim0 = inputs.size(0)
     flag = (dim0 <= 1024) or (dim0 <= 2048 and dim0 % 2 == 0) or (dim0 % 4 == 0)
     if flag and COL_MOE_KERNEL_FLAG:
+        # load moe kernel during runtime if not pre-built
+        build_moe_if_not_prebuilt()
         return moe.cumsum_sub_one(inputs)
     else:
         return torch.cumsum(inputs, dim=0) - 1
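Aside from the lazy build, the `moe_cumsum` hunk also documents what the kernel computes: the `else` branch is the pure-PyTorch fallback for `moe.cumsum_sub_one`. A tiny reference sketch of that equivalence, drawn only from the fallback branch above and requiring no kernel build:

```python
import torch


def cumsum_sub_one_reference(inputs: torch.Tensor) -> torch.Tensor:
    # Matches the fallback path of moe_cumsum above; the CUDA kernel
    # moe.cumsum_sub_one is expected to return the same values.
    return torch.cumsum(inputs, dim=0) - 1
```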
9 changes: 3 additions & 6 deletions colossalai/nn/optimizer/cpu_adam.py
@@ -3,6 +3,7 @@
 
 import torch
 
+from colossalai.kernel.op_builder import CPUAdamBuilder
 from colossalai.registry import OPTIMIZERS
 
 from .nvme_optimizer import NVMeOptimizer
@@ -76,12 +77,8 @@ def __init__(self,
         default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction)
         super(CPUAdam, self).__init__(model_params, default_args, nvme_offload_fraction, nvme_offload_dir)
         self.adamw_mode = adamw_mode
-        try:
-            import colossalai._C.cpu_optim
-        except ImportError:
-            raise ImportError('Please install colossalai from source code to use CPUAdam')
-        self.cpu_adam_op = colossalai._C.cpu_optim.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay,
-                                                                    adamw_mode)
+        cpu_adam = CPUAdamBuilder().load()
+        self.cpu_adam_op = cpu_adam.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay, adamw_mode)
 
     def torch_adam_update(self,
                           data,
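The practical effect of the second hunk: `CPUAdam` no longer raises `ImportError` when ColossalAI was installed without pre-built extensions; the CPU kernel is built on demand in the constructor. A hedged usage sketch follows, with parameter names taken from the diff and the import path and values assumed for illustration.

```python
import torch

from colossalai.nn.optimizer import CPUAdam  # assumed export path

model = torch.nn.Linear(128, 128)
# Constructing the optimizer loads the pre-built colossalai._C.cpu_optim
# extension if available, otherwise JIT-builds it via CPUAdamBuilder.
optimizer = CPUAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                    eps=1e-8, weight_decay=0.0, adamw_mode=True)
```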
3 changes: 2 additions & 1 deletion colossalai/nn/optimizer/fused_adam.py
@@ -65,7 +65,8 @@ def __init__(self,
         self.adamw_mode = 1 if adamw_mode else 0
         self.set_grad_none = set_grad_none
         if multi_tensor_applier.available:
-            from colossalai.kernel import fused_optim
+            from colossalai.kernel.op_builder import FusedOptimBuilder
+            fused_optim = FusedOptimBuilder().load()
 
             # Skip buffer
             self._dummy_overflow_buf = torch.cuda.IntTensor([0])
3 changes: 2 additions & 1 deletion colossalai/nn/optimizer/fused_lamb.py
@@ -76,7 +76,8 @@ def __init__(self,
                         max_grad_norm=max_grad_norm)
         super(FusedLAMB, self).__init__(params, defaults)
         if multi_tensor_applier.available:
-            from colossalai.kernel import fused_optim
+            from colossalai.kernel.op_builder import FusedOptimBuilder
+            fused_optim = FusedOptimBuilder().load()
 
             self.multi_tensor_l2norm = fused_optim.multi_tensor_l2norm
             # Skip buffer
3 changes: 2 additions & 1 deletion colossalai/nn/optimizer/fused_sgd.py
@@ -80,7 +80,8 @@ def __init__(self,
         self.wd_after_momentum = wd_after_momentum
 
         if multi_tensor_applier.available:
-            from colossalai.kernel import fused_optim
+            from colossalai.kernel.op_builder import FusedOptimBuilder
+            fused_optim = FusedOptimBuilder().load()
 
             # Skip buffer
             self._dummy_overflow_buf = torch.tensor([0],
5 changes: 4 additions & 1 deletion colossalai/nn/optimizer/hybrid_adam.py
@@ -2,6 +2,7 @@
 
 import torch
 
+from colossalai.kernel.op_builder import CPUAdamBuilder, FusedOptimBuilder
 from colossalai.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 
@@ -77,7 +78,9 @@ def __init__(self,
         super(HybridAdam, self).__init__(model_params, default_args, nvme_offload_fraction, nvme_offload_dir)
         self.adamw_mode = adamw_mode
 
-        from colossalai.kernel import cpu_optim, fused_optim
+        # build during runtime if not found
+        cpu_optim = CPUAdamBuilder().load()
+        fused_optim = FusedOptimBuilder().load()
         self.cpu_adam_op = cpu_optim.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay, adamw_mode)
 
         self.gpu_adam_op = fused_optim.multi_tensor_adam
(Diffs for the remaining changed files are not shown here.)
