Skip to content

Commit

Permalink
added enroot+pyxis support
Browse files Browse the repository at this point in the history
  • Loading branch information
JingchaoZhang committed Sep 14, 2022
1 parent 5e71337 commit b95e4f0
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 39 deletions.
12 changes: 0 additions & 12 deletions experimental/fairseq_moe_docker_slurm/Dockerfile

This file was deleted.

6 changes: 6 additions & 0 deletions experimental/fairseq_moe_docker_slurm/image.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
apt-get -y install build-essential
pip install fairscale==0.4.0
pip install hydra-core==1.0.7 omegaconf==2.0.6
pip install boto3
./fairseq_moe.sh
./megatron-lm.sh
34 changes: 34 additions & 0 deletions experimental/fairseq_moe_docker_slurm/ndv4-topo.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<system version="1">
<cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
<cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
<cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
<cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
</system>
26 changes: 15 additions & 11 deletions experimental/fairseq_moe_docker_slurm/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,34 @@ Total number of paramters = Number of GPUS * 2B + 4.5B
- Compute node(s), ND96asr_v4 (Running Ubuntu-hpc 18.04)
- Azure container registry deployed

## Docker set-up on compute nodes and scheduler
## Enroot authentication

```
pdsh -w ^/path/to/hostfile sudo </path/to/docker_setup.sh
# Create enroot's per-user configuration directory (no-op if it already exists).
mkdir -p "$HOME/.config/enroot"
# Single quotes are required here: "$oauthtoken" is the literal login name that
# nvcr.io expects, not a shell variable. Inside double quotes the shell would
# expand it to an empty string and the resulting credentials line would be
# rejected. Replace YOUR_KEY with your NGC API key.
echo 'machine nvcr.io login $oauthtoken password YOUR_KEY' > "$HOME/.config/enroot/.credentials"
```
>Note: Make sure the scheduler and the compute nodes have the same GID for the docker group. Modify the script to update "USER".
>Note: You need to get your key from [Nvidia API Key Page](https://ngc.nvidia.com/setup/api-key)

## Build docker container
## Build enroot squashfs image

```
az acr build --registry <your_acr_name> --image docker/<container_name:version> .
enroot import docker://nvcr.io/nvidia/pytorch:21.10-py3
enroot create --name pytorch nvidia+pytorch+21.10-py3.sqsh
enroot start --root --rw --mount .:/workspace pytorch
```
Exit the container and export the updated version
```bash
enroot export --output fairseq.sqsh pytorch
enroot create --name fairseq fairseq.sqsh
enroot list
```
>Note: Make sure you are in the same directory as your Dockerfile. Change "your_acr_name", "container_name" and
"version" to appropriate values for your ACR, the name/version of your container respectively.


## Run fairseq_moe benchmark

```
sbatch -N <Number of nodes> run_fairseq_moe.slrm
```
>Note: Modify run_fairseq_moe.slrm, updating the appropriate values for "DOCKER_USERNAME", "DOCKER_PASSWD", "CONTAINER_NAME" and "EXECUTE_SCRIPT". If you are using a shared filesystem, you will only need to authenticate to docker once and can remove docker login from the run_fairseq_moe.slrm script. If you would like to do a restart of a job then comment out "rm $SAVE_DIR/*"
>Note: If you would like to do a restart of a job then comment out "rm $SAVE_DIR/*"
## Verify benchmark ran OK

Expand Down
19 changes: 3 additions & 16 deletions experimental/fairseq_moe_docker_slurm/run_fairseq_moe.slrm
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,9 @@
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=96

DOCKER_USERNAME="<DOCKER_USERNAME>"
DOCKER_PASSWD="<DOCKER_PASSWORD>"

CONTAINER_NAME="<CONTAINER_NAME>"
DOCKER_ARGS="--gpus all --rm"
CONTAINER_ENV="--env SLURM_NODEID --env SLURM_NNODES --env SLURM_SUBMIT_HOST"
CONTAINER_MOUNTS="-v $HOME:$HOME -v /opt:/workspace/opt:ro -v /mnt:/workspace/mnt"
CONTAINER_LIMITS="--shm-size=256m --ulimit memlock=-1"
CONTAINER_DEVICES="--privileged --ipc=host --net=host"
SLURM_PINNING="--cpu-bind=mask_cpu:ffffff000000,ffffff000000,ffffff,ffffff,ffffff000000000000000000,ffffff000000000000000000,ffffff000000000000,ffffff000000000000"

EXECUTE_SCRIPT="<PATH_TO>/launch.sh"

CONTAINER="$DOCKER_USERNAME.azurecr.io/docker/$CONTAINER_NAME"


srun docker login -u $DOCKER_USERNAME -p $DOCKER_PASSWD ${DOCKER_USERNAME}.azurecr.io
EXECUTE_SCRIPT="launch.sh"
CONTAINER_IMAGE="/shared/home/hpcadmin/fairseq.sqsh"

srun $SLURM_PINNING docker run $DOCKER_ARGS $CONTAINER_ENV $CONTAINER_MOUNTS $CONTAINER_LIMITS $CONTAINER_DEVICES $CONTAINER $EXECUTE_SCRIPT
srun $SLURM_PINNING --container-image $CONTAINER_IMAGE --container-mounts .:/workspace $EXECUTE_SCRIPT

0 comments on commit b95e4f0

Please sign in to comment.