DeepSpeed-FastGen (microsoft#4604)
Co-authored-by: Jeff Rasley <[email protected]>
Co-authored-by: Michael Wyatt <[email protected]>
Co-authored-by: Ammar Ahmad Awan <[email protected]>
Co-authored-by: Masahiro Tanaka <[email protected]>
Co-authored-by: Logan Adams <[email protected]>
6 people authored Nov 3, 2023
1 parent 737ef29 commit 38b41df
Showing 263 changed files with 19,167 additions and 33 deletions.
56 changes: 56 additions & 0 deletions .github/workflows/nv-a6000.yml
@@ -0,0 +1,56 @@
name: nv-a6000

on:
  pull_request:
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, a6000]
    container:
      image: nvcr.io/nvidia/pytorch:23.03-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"

    steps:
      - uses: actions/checkout@v3

      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          python -m pip install .
      - name: Install deepspeed
        run: |
          python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
          python -m pip install .[dev,1bit,autotuning]
          ds_report
      - name: Python environment
        run: |
          python -m pip list
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
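
For local debugging, the final step above can be reproduced outside of Actions. A minimal sketch, assuming DeepSpeed is installed with its dev extras, an NVIDIA GPU is visible, and the script runs from the repository's tests/ directory (the pytest flags, including the suite's custom --torch_ver/--cuda_ver options, mirror the workflow):

# Local repro of the "Unit tests" step; a sketch, not part of the CI itself.
import os
import pytest

# Match the workflow: drop TORCH_CUDA_ARCH_LIST so kernels JIT-compile only
# for the GPU architecture actually present.
os.environ.pop("TORCH_CUDA_ARCH_LIST", None)

pytest.main([
    "--color=yes", "--durations=0", "--verbose", "-rF",
    "-m", "inference_v2",  # swap in 'inference_v2_ops' for the second CI run
    "unit/",
    "--torch_ver=2.0", "--cuda_ver=12",
])
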
2 changes: 1 addition & 1 deletion .github/workflows/nv-pre-compile-ops.yml
@@ -33,7 +33,7 @@ jobs:
           #python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
       - name: Compile DeepSpeed Ops
         run: |
-          TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
+          TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
       - name: DS Report
         run: |
           ds_report
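
The DS_BUILD_* variables control which ops are compiled ahead of time during pip install. A hypothetical sketch of the toggle semantics implied by the command above (illustrative only, not DeepSpeed's actual setup.py logic):

import os

def op_enabled(name: str) -> bool:
    # DS_BUILD_OPS=1 opts every op in; DS_BUILD_<NAME>=0 opts one op back out.
    build_all = os.environ.get("DS_BUILD_OPS", "0")
    return os.environ.get(f"DS_BUILD_{name}", build_all) == "1"

for op in ("SPARSE_ATTN", "CUTLASS_OPS", "RAGGED_DEVICE_OPS", "EVOFORMER_ATTN"):
    print(op, "prebuild" if op_enabled(op) else "skip")
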
Empty file added .gitmodules
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -49,6 +49,7 @@ repos:
       entry: ./scripts/check-license.py
       language: python
       files: \.(py|c|cpp|cu|cc|h|hpp|cuh|hip|tr)$
+      exclude: ^(deepspeed/inference/v2/kernels/ragged_ops/blocked_flash|deepspeed/inference/v2/kernels/cutlass_ops/grouped_gemm)
 
 - repo: https://github.com/codespell-project/codespell
   rev: v2.1.0
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -1,4 +1,6 @@
 include *.txt README.md
+include deepspeed/inference/v2/kernels/ragged_ops/libs/*.so
+include deepspeed/inference/v2/kernels/cutlass_ops/libs/*.so
 recursive-include requirements *.txt
 recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
 recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
19 changes: 18 additions & 1 deletion accelerator/cuda_accelerator.py
@@ -153,9 +153,26 @@ def max_memory_reserved(self, device_index=None):
     def total_memory(self, device_index=None):
         return torch.cuda.get_device_properties(device_index).total_memory
 
+    def _get_nvml_gpu_id(self, torch_gpu_id):
+        """
+        credit: https://discuss.pytorch.org/t/making-pynvml-match-torch-device-ids-cuda-visible-devices/103020
+        Remap torch device id to nvml device id, respecting CUDA_VISIBLE_DEVICES.
+        If the latter isn't set, return the same id.
+        """
+        # if CUDA_VISIBLE_DEVICES is set, remap the id automatically, since pynvml ignores this env var
+        if "CUDA_VISIBLE_DEVICES" in os.environ:
+            ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")))
+            return ids[torch_gpu_id]  # remap
+        else:
+            return torch_gpu_id
+
     def available_memory(self, device_index=None):
         if pynvml:
-            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
+            if device_index is None:
+                device_index = self.current_device()
+            handle = pynvml.nvmlDeviceGetHandleByIndex(self._get_nvml_gpu_id(device_index))
             info = pynvml.nvmlDeviceGetMemoryInfo(handle)
             return info.free
         else:
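
A standalone sanity check of the remap logic above (a sketch that copies the function body out of the accelerator class; no GPU is needed to run it):

import os

def get_nvml_gpu_id(torch_gpu_id):
    # same remap as the accelerator's _get_nvml_gpu_id above
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        ids = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
        return ids[torch_gpu_id]
    return torch_gpu_id

os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
assert get_nvml_gpu_id(0) == 2  # torch device 0 -> physical/nvml GPU 2
assert get_nvml_gpu_id(1) == 3  # torch device 1 -> physical/nvml GPU 3
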
17 changes: 17 additions & 0 deletions csrc/includes/activation_type.h
@@ -0,0 +1,17 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0

// DeepSpeed Team

#pragma once

enum ActivationType {
    GELU = 0,
    RELU = 1,
    SILU = 2,
    GEGLU = 3,
    ReGLU = 4,
    SiGLU = 5,
    IDENTITY = 6,
    InvalidType = -1
};
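
Python code that hands one of these activation ids across the pybind boundary needs matching integer values. A hedged sketch of such a mirror (the class below is illustrative; DeepSpeed's actual Python-side definition may live elsewhere under a different name):

from enum import IntEnum

class ActivationType(IntEnum):
    # values must stay in sync with csrc/includes/activation_type.h above
    GELU = 0
    RELU = 1
    SILU = 2
    GEGLU = 3
    ReGLU = 4
    SiGLU = 5
    IDENTITY = 6
    InvalidType = -1
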
5 changes: 5 additions & 0 deletions csrc/includes/ds_kernel_utils.h
@@ -11,6 +11,11 @@ used throughout the codebase.
 #pragma once
 
 #include <cuda.h>
+#include <cuda_fp16.h>
+
+#ifdef BF16_AVAILABLE
+#include <cuda_bf16.h>
+#endif
 
 #define DS_HD_INLINE __host__ __device__ __forceinline__
 #define DS_D_INLINE __device__ __forceinline__