Skip to content

Commit

Permalink
Set gpuNum as optional (microsoft#1389)
Browse files Browse the repository at this point in the history
  • Loading branch information
SparkSnail authored Aug 2, 2019
1 parent 555334d commit 204b1eb
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 18 deletions.
2 changes: 1 addition & 1 deletion docs/en_US/Tutorial/ExperimentConfig.md
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ machineList:

* __gpuNum__

__gpuNum__ specifies the gpu number to run the tuner process. The value of this field should be a positive number.
__gpuNum__ specifies the number of GPUs used to run the tuner process. The value of this field should be a positive number. If the field is not set, NNI will not set `CUDA_VISIBLE_DEVICES` in the script (that is, it will not control which GPUs are visible to the trial command through `CUDA_VISIBLE_DEVICES`) and will not manage GPU resources.

Note: users can specify only one way to configure the tuner, for example, either {tunerName, optimizationMode} or {tunerCommand, tunerCwd}; the two cannot be set together.

Expand Down
28 changes: 18 additions & 10 deletions src/nni_manager/training_service/local/localTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -307,9 +307,11 @@ class LocalTrainingService implements TrainingService {
if (this.localTrailConfig === undefined) {
throw new Error('trial config parsed failed');
}
this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`);
if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) {
this.gpuScheduler = new GPUScheduler();
if (this.localTrailConfig.gpuNum !== undefined) {
this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`);
if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) {
this.gpuScheduler = new GPUScheduler();
}
}
break;
case TrialConfigMetadataKey.LOCAL_CONFIG:
Expand Down Expand Up @@ -399,7 +401,8 @@ class LocalTrainingService implements TrainingService {

private getEnvironmentVariables(
trialJobDetail: TrialJobDetail,
resource: { gpuIndices: number[] }): { key: string; value: string }[] {
resource: { gpuIndices: number[] },
gpuNum: number | undefined): { key: string; value: string }[] {
const envVariables: { key: string; value: string }[] = [
{ key: 'NNI_PLATFORM', value: 'local' },
{ key: 'NNI_EXP_ID', value: this.experimentId },
Expand All @@ -409,11 +412,12 @@ class LocalTrainingService implements TrainingService {
{ key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.sequenceId.toString() },
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
];

envVariables.push({
key: 'CUDA_VISIBLE_DEVICES',
value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
});
if (gpuNum !== undefined) {
envVariables.push({
key: 'CUDA_VISIBLE_DEVICES',
value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
});
}

return envVariables;
}
Expand Down Expand Up @@ -490,6 +494,7 @@ class LocalTrainingService implements TrainingService {
if (!success) {
break;
}

this.occupyResource(resource);
await this.runTrialJob(trialJobId, resource);
}
Expand Down Expand Up @@ -526,7 +531,10 @@ class LocalTrainingService implements TrainingService {

private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource);
if (this.localTrailConfig === undefined) {
throw new Error(`localTrialConfig not initialized!`);
}
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrailConfig.gpuNum);

if (this.localTrailConfig === undefined) {
throw new Error('trial config is not initialized');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,17 @@ export class GPUScheduler {
* Schedule a machine according to the constraints (requiredGPUNum)
* @param requiredGPUNum required GPU number
*/
public scheduleMachine(requiredGPUNum: number, trialJobDetail : RemoteMachineTrialJobDetail) : RemoteMachineScheduleResult {
public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail : RemoteMachineTrialJobDetail) : RemoteMachineScheduleResult {
if(requiredGPUNum === undefined) {
requiredGPUNum = 0;
}
assert(requiredGPUNum >= 0);
const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys());
assert(allRMs.length > 0);

// Step 1: Check that the required GPU number does not exceed the GPU count of every machine
const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta : RemoteMachineMeta) =>
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || rmMeta.gpuSummary.gpuCount >= requiredGPUNum);
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
if (eligibleRM.length === 0) {
// If the required GPU number exceeds the GPU count of every machine,
// return REQUIRE_EXCEED_TOTAL directly
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -601,12 +601,16 @@ class RemoteMachineTrainingService implements TrainingService {
let command: string;
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
if (typeof cuda_visible_device === 'string' && cuda_visible_device.length > 0) {
command = `CUDA_VISIBLE_DEVICES=${cuda_visible_device} ${this.trialConfig.command}`;
// If gpuNum is undefined, do not set CUDA_VISIBLE_DEVICES in the script
if (this.trialConfig.gpuNum === undefined) {
command = this.trialConfig.command;
} else {
command = `CUDA_VISIBLE_DEVICES=" " ${this.trialConfig.command}`;
if (typeof cuda_visible_device === 'string' && cuda_visible_device.length > 0) {
command = `CUDA_VISIBLE_DEVICES=${cuda_visible_device} ${this.trialConfig.command}`;
} else {
command = `CUDA_VISIBLE_DEVICES=" " ${this.trialConfig.command}`;
}
}

// tslint:disable-next-line: strict-boolean-expressions
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
if (this.remoteRestServerPort === undefined) {
Expand Down
2 changes: 1 addition & 1 deletion tools/nni_cmd/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def setPathCheck(key):
'trial':{
'command': setType('command', str),
'codeDir': setPathCheck('codeDir'),
'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('nasMode'): setChoice('classic_mode', 'enas_mode', 'oneshot_mode')
}
}
Expand Down

0 comments on commit 204b1eb

Please sign in to comment.