From ae734fc7e31633b3daa3e66e227e8abdda24f58c Mon Sep 17 00:00:00 2001 From: Rollin Thomas Date: Tue, 22 Sep 2020 08:24:42 -0700 Subject: [PATCH] Enable DGX access woohoo --- .../web-jupyterhub/jupyterhub_config.py | 24 ++++ .../web-jupyterhub/nerscslurmspawner.py | 103 ++++++++++++++++++ jupyter-nersc/web-jupyterhub/nerscspawner.py | 12 ++ 3 files changed, 139 insertions(+) diff --git a/jupyter-nersc/web-jupyterhub/jupyterhub_config.py b/jupyter-nersc/web-jupyterhub/jupyterhub_config.py index f21d092..30fe37b 100644 --- a/jupyter-nersc/web-jupyterhub/jupyterhub_config.py +++ b/jupyter-nersc/web-jupyterhub/jupyterhub_config.py @@ -1019,6 +1019,7 @@ def comma_split(string): { "name": "cori-exclusive-node-cpu" }, { "name": "cori-exclusive-node-largemem" }, { "name": "cori-configurable-gpu" }, + { "name": "cori-configurable-dgx" }, { "name": "spin-shared-node-cpu" }, ] @@ -1064,6 +1065,11 @@ def comma_split(string): "name": "gpu", "description": "Configurable GPU", "roles": ["gpu"], + }, + { + "name": "dgx", + "description": "Configurable DGX", + "roles": ["dgx"], } ], "resources": "Use multiple compute nodes with specialized settings.", @@ -1200,6 +1206,24 @@ def comma_split(string): ]) } ), + "cori-configurable-dgx": ( + "nerscslurmspawner.NERSCConfigurableDGXSlurmSpawner", { + "cmd": ["/global/common/cori_cle7/software/jupyter/cgpu/20-09/bin/jupyterhub-singleuser"], + "args": ["--transport=ipc"], + "exec_prefix": "/usr/bin/ssh -q -o StrictHostKeyChecking=no -o preferredauthentications=publickey -l {username} -i /certs/{username}.key {remote_host}", + "startup_poll_interval": 30.0, + "req_remote_host": "cori19-224.nersc.gov", + "req_homedir": "/tmp", + "req_ngpus": "1", + "req_runtime": "240", + "hub_api_url": f"https://{nersc_jupyterhub_subdomain}.nersc.gov/hub/api", + "path": "/global/common/cori_cle7/software/jupyter/cgpu/20-09/bin:/global/common/cori/das/jupyterhub:/usr/common/usg/bin:/usr/bin:/bin", + "batchspawner_singleuser_cmd" : " ".join([ + "/global/common/cori/das/jupyterhub/jupyter-launcher.sh", + "/global/common/cori_cle7/software/jupyter/cgpu/20-09/bin/batchspawner-singleuser", + ]) + } + ), "spin-shared-node-cpu": ( "sshspawner.sshspawner.SSHSpawner", { "cmd": ["/global/common/cori/das/jupyterhub/jupyter-launcher.sh", diff --git a/jupyter-nersc/web-jupyterhub/nerscslurmspawner.py b/jupyter-nersc/web-jupyterhub/nerscslurmspawner.py index 6b88afa..791a36b 100644 --- a/jupyter-nersc/web-jupyterhub/nerscslurmspawner.py +++ b/jupyter-nersc/web-jupyterhub/nerscslurmspawner.py @@ -411,6 +411,109 @@ def options_from_form(self, formdata): # continue # yield allocation +class NERSCConfigurableDGXSlurmSpawner(NERSCSlurmSpawner): + + batch_submit_cmd = Unicode("/bin/bash -l /global/common/cori/das/jupyterhub/dgx-wrapper.sh sbatch").tag(config=True) + batch_query_cmd = Unicode("/bin/bash -l /global/common/cori/das/jupyterhub/dgx-wrapper.sh squeue -h -j {job_id} -o '%T\ %B-144.nersc.gov'").tag(config=True) + batch_cancel_cmd = Unicode("/bin/bash -l /global/common/cori/das/jupyterhub/dgx-wrapper.sh scancel {job_id}").tag(config=True) + + batch_script = Unicode("""#!/bin/bash +#SBATCH --account={{ account }} +#SBATCH --constraint=dgx +#SBATCH --job-name=jupyter +#SBATCH --nodes={{ nodes }} +#SBATCH --ntasks-per-node={{ ntasks_per_node }} +#SBATCH --cpus-per-task={{ cpus_per_task }} +#SBATCH --gpus-per-task={{ gpus_per_task }} +#SBATCH --time={{ runtime }} +{{ env_text }} +unset XDG_RUNTIME_DIR +{{ cmd }}""").tag(config=True) + + async def options_form(self, spawner): + form = "" + + # Account + + form += dedent(""" + + + """) + +# # QOS, would be nice to constrain from qos + +# form += dedent(""" +# +# +# """) + +# # GPUs per node, should come from model + +# form += dedent(""" +# +# +# """) + + # Nodes, should come from model + + form += dedent(""" + + + """) + + # Number of tasks per node, should come from model + + form += dedent(""" + + + """) + + # Number of CPUs per task, should come from model + + form += dedent(""" + + + """) + + # Number of GPUs per task, should come from model + + form += dedent(""" + + + """) + + # Time, should come from model + + form += dedent(""" + + + """) + + return form + + def options_from_form(self, formdata): + options = dict() + options["account"] = formdata["account"][0] +# options["qos"] = formdata["qos"][0] +# options["ngpus"] = formdata["ngpus"][0] + options["ntasks_per_node"] = formdata["ntasks-per-node"][0] + options["cpus_per_task"] = formdata["cpus-per-task"][0] + options["gpus_per_task"] = formdata["gpus-per-task"][0] + options["runtime"] = formdata["runtime"][0] + return options class NERSCConfigurableSlurmSpawner(NERSCSlurmSpawner): diff --git a/jupyter-nersc/web-jupyterhub/nerscspawner.py b/jupyter-nersc/web-jupyterhub/nerscspawner.py index 8049baf..8f20e87 100644 --- a/jupyter-nersc/web-jupyterhub/nerscspawner.py +++ b/jupyter-nersc/web-jupyterhub/nerscspawner.py @@ -45,6 +45,8 @@ def check_role(self, auth_state, role): return self.check_role_cori_exclusive_node_cpu(auth_state) if role == "cmem": return self.check_role_cmem(auth_state) + if role == "dgx": + return self.check_role_dgx(auth_state) return False def check_role_cori_exclusive_node_cpu(self, auth_state): @@ -82,6 +84,16 @@ def default_gpu_repo(self, auth_state): return allocation["computeAllocation"]["repoName"] return None + def check_role_dgx(self, auth_state): + return self.default_dgx_repo(auth_state) is not None + + def default_dgx_repo(self, auth_state): + for allocation in self.user_allocations(auth_state): + for qos in allocation["userAllocationQos"]: + if qos["qos"]["qos"] in ["dgx"]: + return allocation["computeAllocation"]["repoName"] + return None + def user_allocations(self, auth_state, repos=[]): for allocation in auth_state["userdata"].get("userAllocations", []): if repos and allocation["computeAllocation"]["repoName"] not in repos: