added enroot+pyxis support

vinil-v · Sep 14, 2022 · b95e4f0 · b95e4f0
1 parent 5e71337
commit b95e4f0
Show file tree

Hide file tree

Showing 5 changed files with 58 additions and 39 deletions.
diff --git a/experimental/fairseq_moe_docker_slurm/Dockerfile b/experimental/fairseq_moe_docker_slurm/Dockerfile
diff --git a/experimental/fairseq_moe_docker_slurm/image.config b/experimental/fairseq_moe_docker_slurm/image.config
@@ -0,0 +1,6 @@
+apt-get -y install build-essential
+pip install fairscale==0.4.0
+pip install hydra-core==1.0.7 omegaconf==2.0.6
+pip install boto3
+./fairseq_moe.sh
+./megatron-lm.sh
diff --git a/experimental/fairseq_moe_docker_slurm/ndv4-topo.xml b/experimental/fairseq_moe_docker_slurm/ndv4-topo.xml
@@ -0,0 +1,34 @@
+<system version="1">
+  <cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
+    <pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+    </pci>
+  </cpu>
+  <cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
+    <pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+    </pci>
+  </cpu>
+  <cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
+      <pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+    </pci>
+  </cpu>
+  <cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
+    <pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
+      <pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
+    </pci>
+  </cpu>
+</system>
diff --git a/experimental/fairseq_moe_docker_slurm/readme.md b/experimental/fairseq_moe_docker_slurm/readme.md
@@ -15,30 +15,34 @@ Total number of paramters = Number of GPUS * 2B + 4.5B
 - Compute node(s), ND96asr_v4 (Running Ubuntu-hpc 18.04)
 - Azure container registry deployed
 
-## Docker set-up on compute nodes and scheduler
+## Enroot authentication
 
 ```
-pdsh -w ^/path/to/hostfile sudo </path/to/docker_setup.sh
+mkdir -p $HOME/.config/enroot/
+echo 'machine nvcr.io login $oauthtoken password YOUR_KEY' > $HOME/.config/enroot/.credentials
 ```
->Note: Make sure scheduler and the compute nodes have the name GID for the docker group. Modify script update "USER".
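+>Note: Replace YOUR_KEY with your own API key from the [NVIDIA API Key Page](https://ngc.nvidia.com/setup/api-key). The login name is the literal string `$oauthtoken`; it is not a shell variable and must be written to the credentials file unexpanded.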
 
-
-## Build docker container
+## Build enroot squashfs image
 
 ```
-az acr build --registry <your_acr_name> --image docker/<container_name:version> .
+enroot import docker://nvcr.io#nvidia/pytorch:21.10-py3
+enroot create --name pytorch nvidia+pytorch+21.10-py3.sqsh
+enroot start --root --rw --mount .:/workspace pytorch
+```
+Exit the container, then export the updated container as a new squashfs image:
+```bash
+enroot export --output fairseq.sqsh pytorch
+enroot create --name fairseq fairseq.sqsh
+enroot list
 ```
->Note: Make sure you are in the same directory as yur Dockerfile. Change "your_acr_name", "container_name" and 
-"version" to appropriate values for your ACR, the name/version of your container respectively.
-
 
 ## Run fairseq_moe benchmark
 
 ```
 sbatch -N <Number of nodes> run_fairseq_moe.slrm
 ```
->Note: Modify run_fairseq_moe.slrm, updating appropriate vlaues for "DOCKER_USERNAME", "DOCKER_PASSWD", "CONTAINER_NAME" and "EXECUTE_SCRIPT". If you are using a shared filesystem, you will only need to authenicate to docker once and can remove docker login from the run_fairseq_moe.slrm script. If you would like to do a restart of a job then comment out "rm $SAVE_DIR/*"
-
+>Note: If you would like to restart a job, comment out "rm $SAVE_DIR/*".
 
 ## Verify the benchmark ran OK
 

diff --git a/experimental/fairseq_moe_docker_slurm/run_fairseq_moe.slrm b/experimental/fairseq_moe_docker_slurm/run_fairseq_moe.slrm
@@ -8,22 +8,9 @@
 #SBATCH --gpus-per-node=8
 #SBATCH --cpus-per-task=96
 
-DOCKER_USERNAME="<DOCKER_USERNAME>"
-DOCKER_PASSWD="<DOCKER_PASSWORD>"
-
-CONTAINER_NAME="<CONTAINER_NAME>"
-DOCKER_ARGS="--gpus all --rm"
-CONTAINER_ENV="--env SLURM_NODEID --env SLURM_NNODES --env SLURM_SUBMIT_HOST"
-CONTAINER_MOUNTS="-v $HOME:$HOME -v /opt:/workspace/opt:ro -v /mnt:/workspace/mnt"
-CONTAINER_LIMITS="--shm-size=256m --ulimit memlock=-1"
-CONTAINER_DEVICES="--privileged --ipc=host --net=host"
 SLURM_PINNING="--cpu-bind=mask_cpu:ffffff000000,ffffff000000,ffffff,ffffff,ffffff000000000000000000,ffffff000000000000000000,ffffff000000000000,ffffff000000000000"
 
-EXECUTE_SCRIPT="<PATH_TO>/launch.sh"
-
-CONTAINER="$DOCKER_USERNAME.azurecr.io/docker/$CONTAINER_NAME"
-
-
-srun docker login -u $DOCKER_USERNAME -p $DOCKER_PASSWD ${DOCKER_USERNAME}.azurecr.io
+EXECUTE_SCRIPT="launch.sh"
+CONTAINER_IMAGE="/shared/home/hpcadmin/fairseq.sqsh"
 
-srun $SLURM_PINNING docker run $DOCKER_ARGS $CONTAINER_ENV $CONTAINER_MOUNTS $CONTAINER_LIMITS $CONTAINER_DEVICES $CONTAINER $EXECUTE_SCRIPT
+srun $SLURM_PINNING --container-image $CONTAINER_IMAGE --container-mounts .:/workspace $EXECUTE_SCRIPT