Skip to content
This repository has been archived by the owner on Apr 29, 2021. It is now read-only.

Commit

Permalink
Enable DGX access woohoo
Browse files Browse the repository at this point in the history
  • Loading branch information
rcthomas committed Sep 22, 2020
1 parent 025c92e commit ae734fc
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 0 deletions.
24 changes: 24 additions & 0 deletions jupyter-nersc/web-jupyterhub/jupyterhub_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,7 @@ def comma_split(string):
{ "name": "cori-exclusive-node-cpu" },
{ "name": "cori-exclusive-node-largemem" },
{ "name": "cori-configurable-gpu" },
{ "name": "cori-configurable-dgx" },
{ "name": "spin-shared-node-cpu" },
]

Expand Down Expand Up @@ -1064,6 +1065,11 @@ def comma_split(string):
"name": "gpu",
"description": "Configurable GPU",
"roles": ["gpu"],
},
{
"name": "dgx",
"description": "Configurable DGX",
"roles": ["dgx"],
}
],
"resources": "Use multiple compute nodes with specialized settings.",
Expand Down Expand Up @@ -1200,6 +1206,24 @@ def comma_split(string):
])
}
),
"cori-configurable-dgx": (
"nerscslurmspawner.NERSCConfigurableDGXSlurmSpawner", {
"cmd": ["/global/common/cori_cle7/software/jupyter/cgpu/20-09/bin/jupyterhub-singleuser"],
"args": ["--transport=ipc"],
"exec_prefix": "/usr/bin/ssh -q -o StrictHostKeyChecking=no -o preferredauthentications=publickey -l {username} -i /certs/{username}.key {remote_host}",
"startup_poll_interval": 30.0,
"req_remote_host": "cori19-224.nersc.gov",
"req_homedir": "/tmp",
"req_ngpus": "1",
"req_runtime": "240",
"hub_api_url": f"https://{nersc_jupyterhub_subdomain}.nersc.gov/hub/api",
"path": "/global/common/cori_cle7/software/jupyter/cgpu/20-09/bin:/global/common/cori/das/jupyterhub:/usr/common/usg/bin:/usr/bin:/bin",
"batchspawner_singleuser_cmd" : " ".join([
"/global/common/cori/das/jupyterhub/jupyter-launcher.sh",
"/global/common/cori_cle7/software/jupyter/cgpu/20-09/bin/batchspawner-singleuser",
])
}
),
"spin-shared-node-cpu": (
"sshspawner.sshspawner.SSHSpawner", {
"cmd": ["/global/common/cori/das/jupyterhub/jupyter-launcher.sh",
Expand Down
103 changes: 103 additions & 0 deletions jupyter-nersc/web-jupyterhub/nerscslurmspawner.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,109 @@ def options_from_form(self, formdata):
# continue
# yield allocation

class NERSCConfigurableDGXSlurmSpawner(NERSCSlurmSpawner):
    """Slurm spawner that lets a user configure a job on the DGX nodes.

    Every Slurm command is routed through ``dgx-wrapper.sh``, and the batch
    script is rendered from the values collected by :meth:`options_form` and
    parsed by :meth:`options_from_form`.
    """

    # Slurm commands, each invoked through the DGX wrapper in a login shell.
    batch_submit_cmd = Unicode("/bin/bash -l /global/common/cori/das/jupyterhub/dgx-wrapper.sh sbatch").tag(config=True)
    # Raw string: the backslash before the space is part of the squeue output
    # format and must reach the shell unchanged ("\ " is not a valid Python
    # escape and warns in a normal string literal).
    batch_query_cmd = Unicode(r"/bin/bash -l /global/common/cori/das/jupyterhub/dgx-wrapper.sh squeue -h -j {job_id} -o '%T\ %B-144.nersc.gov'").tag(config=True)
    batch_cancel_cmd = Unicode("/bin/bash -l /global/common/cori/das/jupyterhub/dgx-wrapper.sh scancel {job_id}").tag(config=True)

    # Template placeholders are filled from the spawner's request variables
    # and the options returned by options_from_form().
    batch_script = Unicode("""#!/bin/bash
#SBATCH --account={{ account }}
#SBATCH --constraint=dgx
#SBATCH --job-name=jupyter
#SBATCH --nodes={{ nodes }}
#SBATCH --ntasks-per-node={{ ntasks_per_node }}
#SBATCH --cpus-per-task={{ cpus_per_task }}
#SBATCH --gpus-per-task={{ gpus_per_task }}
#SBATCH --time={{ runtime }}
{{ env_text }}
unset XDG_RUNTIME_DIR
{{ cmd }}""").tag(config=True)

    async def options_form(self, spawner):
        """Build the HTML spawn-options form for a DGX job.

        Args:
            spawner: Object whose ``userdata["userAllocations"]`` lists the
                user's allocations; each allocation that carries a ``dgx``
                QOS contributes one entry to the account drop-down.

        Returns:
            str: HTML fragment with inputs named ``account``, ``nodes``,
            ``ntasks-per-node``, ``cpus-per-task``, ``gpus-per-task`` and
            ``runtime``.
        """
        parts = []

        # Account: only allocations holding a "dgx" QOS are selectable.
        parts.append(dedent("""
        <label for="account">Account:</label>
        <select class="form-control" name="account" required autofocus>
        """))

        for allocation in spawner.userdata["userAllocations"]:
            account = allocation["computeAllocation"]["repoName"]
            for qos in allocation["userAllocationQos"]:
                if qos["qos"]["qos"] in ["dgx"]:
                    parts.append("""<option value="{}">{}</option>""".format(account, account))

        parts.append(dedent("""
        </select>
        """))

        # NOTE: QOS and GPUs-per-node selectors are not offered by this form.
        # The limits below are hard-coded; they should eventually come from
        # the model.

        # Nodes
        parts.append(dedent("""
        <label for="nodes">nodes:</label>
        <input class="form-control" type="number" name="nodes" min="1" max="2" value="1" required autofocus>
        """))

        # Number of tasks per node
        parts.append(dedent("""
        <label for="ntasks-per-node">ntasks-per-node (up to 8 tasks):</label>
        <input class="form-control" type="number" name="ntasks-per-node" min="1" max="8" value="1" required autofocus>
        """))

        # Number of CPUs per task
        parts.append(dedent("""
        <label for="cpus-per-task">cpus-per-task (node has 128 cores):</label>
        <input class="form-control" type="number" name="cpus-per-task" min="1" max="128" value="16" required autofocus>
        """))

        # Number of GPUs per task
        parts.append(dedent("""
        <label for="gpus-per-task">gpus-per-task (node has 8 GPUs):</label>
        <input class="form-control" type="number" name="gpus-per-task" min="1" max="8" value="1" required autofocus>
        """))

        # Time limit in minutes
        parts.append(dedent("""
        <label for="runtime">time (time limit in minutes):</label>
        <input class="form-control" type="number" name="runtime" min="10" max="240" value="240" step="10" required autofocus>
        """))

        return "".join(parts)

    def options_from_form(self, formdata):
        """Convert submitted form data into batch-script template options.

        Args:
            formdata: Mapping of form field name to a list of submitted
                values; the first value of each field is used.

        Returns:
            dict: Options keyed by the placeholder names used in
            ``batch_script`` (``account``, ``nodes``, ``ntasks_per_node``,
            ``cpus_per_task``, ``gpus_per_task``, ``runtime``).

        Raises:
            KeyError: If ``account``, ``ntasks-per-node``, ``cpus-per-task``,
                ``gpus-per-task`` or ``runtime`` is missing from ``formdata``.
        """
        options = dict()
        options["account"] = formdata["account"][0]
        # The form collects "nodes" and the batch script renders it, so it
        # must be forwarded. It is copied only when present so that callers
        # which never sent the field keep working as before.
        if "nodes" in formdata:
            options["nodes"] = formdata["nodes"][0]
        options["ntasks_per_node"] = formdata["ntasks-per-node"][0]
        options["cpus_per_task"] = formdata["cpus-per-task"][0]
        options["gpus_per_task"] = formdata["gpus-per-task"][0]
        options["runtime"] = formdata["runtime"][0]
        return options


class NERSCConfigurableSlurmSpawner(NERSCSlurmSpawner):
Expand Down
12 changes: 12 additions & 0 deletions jupyter-nersc/web-jupyterhub/nerscspawner.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def check_role(self, auth_state, role):
return self.check_role_cori_exclusive_node_cpu(auth_state)
if role == "cmem":
return self.check_role_cmem(auth_state)
if role == "dgx":
return self.check_role_dgx(auth_state)
return False

def check_role_cori_exclusive_node_cpu(self, auth_state):
Expand Down Expand Up @@ -82,6 +84,16 @@ def default_gpu_repo(self, auth_state):
return allocation["computeAllocation"]["repoName"]
return None

def check_role_dgx(self, auth_state):
    """Return True when a default DGX repo can be found for the user.

    Delegates to ``default_dgx_repo``; any non-``None`` result grants the
    role.
    """
    repo = self.default_dgx_repo(auth_state)
    return repo is not None

def default_dgx_repo(self, auth_state):
    """Return the repo name of the first allocation that has a "dgx" QOS.

    Allocations are taken in the order produced by ``user_allocations``;
    ``None`` is returned when no allocation carries the QOS.
    """
    wanted = ["dgx"]
    for entry in self.user_allocations(auth_state):
        has_dgx = any(
            item["qos"]["qos"] in wanted
            for item in entry["userAllocationQos"]
        )
        if has_dgx:
            return entry["computeAllocation"]["repoName"]
    return None

def user_allocations(self, auth_state, repos=[]):
for allocation in auth_state["userdata"].get("userAllocations", []):
if repos and allocation["computeAllocation"]["repoName"] not in repos:
Expand Down

0 comments on commit ae734fc

Please sign in to comment.