diff --git a/README.md b/README.md index 0d4057f7a2..caaefc8108 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ We encourage researchers and students leverage these projects to accelerate the ## **Install & Verify** -If you choose NNI Windows local mode and you use PowerShell to run script for the first time, you need to **run PowerShell as administrator** with this command first: +If you are using NNI on Windows and use PowerShell to run scripts for the first time, you need to **run PowerShell as administrator** with this command first: ```bash Set-ExecutionPolicy -ExecutionPolicy Unrestricted @@ -114,7 +114,7 @@ If you choose NNI Windows local mode and you use PowerShell to run script for th **Install through pip** -* We support Linux, MacOS and Windows(local mode) in current stage, Ubuntu 16.04 or higher, MacOS 10.14.1 along with Windows 10.1809 are tested and supported. Simply run the following `pip install` in an environment that has `python >= 3.5`. +* We support Linux, MacOS and Windows (local, remote and pai modes) at the current stage; Ubuntu 16.04 or higher, MacOS 10.14.1 and Windows 10.1809 are tested and supported. Simply run the following `pip install` in an environment that has `python >= 3.5`. Linux and MacOS @@ -131,12 +131,12 @@ python -m pip install --upgrade nni Note: * `--user` can be added if you want to install NNI in your home directory, which does not require any special privileges. -* Currently NNI on Windows only support local mode. Anaconda or Miniconda is highly recommended to install NNI on Windows. +* Currently NNI on Windows supports local, remote and pai modes. Anaconda or Miniconda is highly recommended to install NNI on Windows. * If there is any error like `Segmentation fault`, please refer to [FAQ](docs/en_US/FAQ.md) **Install through source code** -* We support Linux (Ubuntu 16.04 or higher), MacOS (10.14.1) and Windows local mode (10.1809) in our current stage. 
+* We support Linux (Ubuntu 16.04 or higher), MacOS (10.14.1) and Windows (10.1809) in our current stage. Linux and MacOS @@ -160,7 +160,7 @@ Windows For the system requirements of NNI, please refer to [Install NNI](docs/en_US/Installation.md) -For NNI Windows local mode, please refer to [NNI Windows local mode](docs/en_US/WindowsLocalMode.md) +For NNI on Windows, please refer to [NNI on Windows](docs/en_US/NniOnWindows.md) **Verify install** diff --git a/deployment/pypi/Makefile b/deployment/pypi/Makefile index b75cc3212c..4854f24ca1 100644 --- a/deployment/pypi/Makefile +++ b/deployment/pypi/Makefile @@ -20,22 +20,28 @@ ifeq ($(version_ts), true) NNI_VERSION_VALUE := $(NNI_VERSION_VALUE).$(TIME_STAMP) endif NNI_VERSION_TEMPLATE = 999.0.0-developing - +NNI_YARN_TARBALL ?= $(CWD)nni-yarn.tar.gz +NNI_YARN_FOLDER ?= $(CWD)nni-yarn +NNI_YARN := PATH=$(CWD)node-$(OS_SPEC)-x64/bin:$${PATH} $(NNI_YARN_FOLDER)/bin/yarn .PHONY: build build: python3 -m pip install --user --upgrade setuptools wheel - wget https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(CWD)node-$(OS_SPEC)-x64.tar.xz + wget -q https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(CWD)node-$(OS_SPEC)-x64.tar.xz rm -rf $(CWD)node-$(OS_SPEC)-x64 mkdir $(CWD)node-$(OS_SPEC)-x64 tar xf $(CWD)node-$(OS_SPEC)-x64.tar.xz -C node-$(OS_SPEC)-x64 --strip-components 1 - cd $(CWD)../../src/nni_manager && yarn && yarn build - cd $(CWD)../../src/webui && yarn && yarn build + wget -q https://aka.ms/yarn-download -O $(NNI_YARN_TARBALL) + rm -rf $(NNI_YARN_FOLDER) + mkdir $(NNI_YARN_FOLDER) + tar -xf $(NNI_YARN_TARBALL) -C $(NNI_YARN_FOLDER) --strip-components 1 + cd $(CWD)../../src/nni_manager && $(NNI_YARN) && $(NNI_YARN) build + cd $(CWD)../../src/webui && $(NNI_YARN) && $(NNI_YARN) build rm -rf $(CWD)nni cp -r $(CWD)../../src/nni_manager/dist $(CWD)nni cp -r $(CWD)../../src/webui/build $(CWD)nni/static cp $(CWD)../../src/nni_manager/package.json $(CWD)nni sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' 
$(CWD)nni/package.json - cd $(CWD)nni && yarn --prod + cd $(CWD)nni && $(NNI_YARN) --prod cd $(CWD) && sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' setup.py && python3 setup.py bdist_wheel -p $(WHEEL_SPEC) cd $(CWD) @@ -50,4 +56,4 @@ clean: rm -rf $(CWD)dist rm -rf $(CWD)nni rm -rf $(CWD)nni.egg-info - rm -rf $(CWD)node-$(OS_SPEC)-x64 \ No newline at end of file + rm -rf $(CWD)node-$(OS_SPEC)-x64 diff --git a/docs/en_US/FAQ.md b/docs/en_US/FAQ.md index 05756fd08b..1b1c9146bf 100644 --- a/docs/en_US/FAQ.md +++ b/docs/en_US/FAQ.md @@ -36,8 +36,8 @@ Unable to open the WebUI may have the following reasons: * If you still can't see the WebUI after you use the server IP, you can check the proxy and the firewall of your machine. Or use the browser on the machine where you start your NNI experiment. * Another reason may be your experiment is failed and NNI may fail to get the experiment infomation. You can check the log of NNImanager in the following directory: ~/nni/experiment/[your_experiment_id] /log/nnimanager.log -### Windows local mode problems -Please refer to [NNI Windows local mode](WindowsLocalMode.md) +### NNI on Windows problems +Please refer to [NNI on Windows](NniOnWindows.md) ### Help us improve Please inquiry the problem in https://github.com/Microsoft/nni/issues to see whether there are other people already reported the problem, create a new one if there are no existing issues been created. diff --git a/docs/en_US/Installation.md b/docs/en_US/Installation.md index 91156481b8..6d4cd9065b 100644 --- a/docs/en_US/Installation.md +++ b/docs/en_US/Installation.md @@ -1,6 +1,6 @@ # Installation of NNI -Currently we support installation on Linux, Mac and Windows(local mode). +Currently we support installation on Linux, Mac and Windows(local, remote and pai mode). 
## **Installation on Linux & Mac** diff --git a/docs/en_US/WindowsLocalMode.md b/docs/en_US/NniOnWindows.md similarity index 89% rename from docs/en_US/WindowsLocalMode.md rename to docs/en_US/NniOnWindows.md index d4a1e172e6..913b48a699 100644 --- a/docs/en_US/WindowsLocalMode.md +++ b/docs/en_US/NniOnWindows.md @@ -1,6 +1,6 @@ -# Windows Local Mode (experimental feature) +# NNI on Windows (experimental feature) -Currently we only support local mode on Windows. Windows 10.1809 is well tested and recommended. +Currently we support local, remote and pai mode on Windows. Windows 10.1809 is well tested and recommended. ## **Installation on Windows** @@ -25,15 +25,15 @@ Set-ExecutionPolicy -ExecutionPolicy Unrestricted Prerequisite: `python >=3.5`, `git`, `PowerShell` ```bash - git clone -b v0.7 https://github.com/Microsoft/nni.git + git clone -b v0.8 https://github.com/Microsoft/nni.git cd nni - powershell ./install.ps1 + powershell -file install.ps1 ``` When these things are done, use the **config_windows.yml** configuration to start an experiment for validation. ```bash -nnictl create --config nni/examples/trials/mnist/config_windows.yml +nnictl create --config nni\examples\trials\mnist\config_windows.yml ``` For other examples you need to change trial command `python3` into `python` in each example YAML. diff --git a/docs/en_US/QuickStart.md b/docs/en_US/QuickStart.md index 9f7e929ac7..9faef5529c 100644 --- a/docs/en_US/QuickStart.md +++ b/docs/en_US/QuickStart.md @@ -2,7 +2,7 @@ ## Installation -We support Linux MacOS and Windows(local mode) in current stage, Ubuntu 16.04 or higher, MacOS 10.14.1 and Windows 10.1809 are tested and supported. Simply run the following `pip install` in an environment that has `python >= 3.5`. +We support Linux MacOS and Windows in current stage, Ubuntu 16.04 or higher, MacOS 10.14.1 and Windows 10.1809 are tested and supported. Simply run the following `pip install` in an environment that has `python >= 3.5`. 
#### Linux and MacOS ```bash @@ -10,7 +10,7 @@ We support Linux MacOS and Windows(local mode) in current stage, Ubuntu 16.04 or ``` #### Windows -If you choose Windows local mode and use PowerShell to run script, you need run below PowerShell command as administrator at first time. +If you are using NNI on Windows, you need to run the PowerShell command below as administrator the first time. ```bash Set-ExecutionPolicy -ExecutionPolicy Unrestricted ``` @@ -151,7 +151,7 @@ Run the **config.yml** file from your command line to start MNIST experiment. #### Windows Run the **config_windows.yml** file from your command line to start MNIST experiment. -**Note**, if you're using windows local mode, it needs to change `python3` to `python` in the config.yml file, or use the config_windows.yml file to start the experiment. +**Note**: if you're using NNI on Windows, you need to change `python3` to `python` in the config.yml file, or use the config_windows.yml file to start the experiment. ```bash nnictl create --config nni/examples/trials/mnist/config_windows.yml diff --git a/docs/en_US/RemoteMachineMode.md b/docs/en_US/RemoteMachineMode.md index f5e0aa3859..8d7d1a3c34 100644 --- a/docs/en_US/RemoteMachineMode.md +++ b/docs/en_US/RemoteMachineMode.md @@ -55,7 +55,8 @@ machineList: username: bob passwd: bob123 ``` - +You can use different systems to run experiments on the remote machine. +#### Linux and MacOS Simply filling the `machineList` section and then run: ```bash @@ -64,5 +65,14 @@ nnictl create --config ~/nni/examples/trials/mnist-annotation/config_remote.yml to start the experiment. +#### Windows +Simply filling the `machineList` section and then run: + +```bash +nnictl create --config %userprofile%\nni\examples\trials\mnist-annotation\config_remote.yml +``` + +to start the experiment. 
+ ## version check NNI support version check feature in since version 0.6, [refer](PaiMode.md) \ No newline at end of file diff --git a/install.ps1 b/install.ps1 index 31d8ba2fe7..5de27a45a2 100644 --- a/install.ps1 +++ b/install.ps1 @@ -15,7 +15,7 @@ $yarnUrl = "https://yarnpkg.com/latest.tar.gz" $unzipNodeDir = "node-v*" $unzipYarnDir = "yarn-v*" -$NNI_DEPENDENCY_FOLDER = "C:\tmp\$env:USERNAME" +$NNI_DEPENDENCY_FOLDER = [System.IO.Path]::GetTempPath()+$env:USERNAME $WHICH_PYTHON = where.exe python if($WHICH_PYTHON -eq $null){ diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index b741b4e9a9..f3a57cf9e0 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -43,11 +43,11 @@ function getExperimentRootDir(): string { .getLogDir(); } -function getLogDir(): string{ +function getLogDir(): string { return path.join(getExperimentRootDir(), 'log'); } -function getLogLevel(): string{ +function getLogLevel(): string { return getExperimentStartupInfo() .getLogLevel(); } @@ -149,7 +149,7 @@ function parseArg(names: string[]): string { return ''; } -function encodeCmdLineArgs(args:any):any{ +function encodeCmdLineArgs(args: any): any { if(process.platform === 'win32'){ return JSON.stringify(args); } @@ -158,7 +158,7 @@ function encodeCmdLineArgs(args:any):any{ } } -function getCmdPy():string{ +function getCmdPy(): string { let cmd = 'python3'; if(process.platform === 'win32'){ cmd = 'python'; @@ -390,7 +390,7 @@ async function getVersion(): Promise { /** * run command as ChildProcess */ -function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess{ +function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess { let cmd: string = command; let arg: string[] = []; let newShell: boolean = true; @@ -411,7 +411,7 @@ function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newE /** * judge whether the process is alive */ 
-async function isAlive(pid:any): Promise{ +async function isAlive(pid:any): Promise { let deferred : Deferred = new Deferred(); let alive: boolean = false; if(process.platform ==='win32'){ @@ -439,7 +439,7 @@ async function isAlive(pid:any): Promise{ /** * kill process */ -async function killPid(pid:any): Promise{ +async function killPid(pid:any): Promise { let deferred : Deferred = new Deferred(); try { if (process.platform === "win32") { @@ -455,7 +455,7 @@ async function killPid(pid:any): Promise{ return deferred.promise; } -function getNewLine(): string{ +function getNewLine(): string { if (process.platform === "win32") { return "\r\n"; } diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 9eee97c91e..6997a57380 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -58,7 +58,8 @@ class NNIManager implements Manager { private status: NNIManagerStatus; private waitingTrials: string[]; private trialJobs: Map; - + private trialJobMetricListener: (metric: TrialJobMetric) => void; + constructor() { this.currSubmittedTrialNum = 0; this.trialConcurrencyChange = 0; @@ -76,6 +77,11 @@ class NNIManager implements Manager { status: 'INITIALIZED', errors: [] }; + this.trialJobMetricListener = (metric: TrialJobMetric) => { + this.onTrialJobMetrics(metric).catch((err: Error) => { + this.criticalError(NNIError.FromError(err, 'Job metrics error: ')); + }); + }; } public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise { @@ -342,6 +348,7 @@ class NNIManager implements Manager { if (this.dispatcher === undefined) { throw new Error('Error: tuner has not been setup'); } + this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener); this.dispatcher.sendCommand(TERMINATE); let tunerAlive: boolean = true; // gracefully terminate tuner and assessor here, wait at most 30 seconds. 
@@ -589,11 +596,7 @@ class NNIManager implements Manager { if (this.dispatcher === undefined) { throw new Error('Error: tuner or job maintainer have not been setup'); } - this.trainingService.addTrialJobMetricListener((metric: TrialJobMetric) => { - this.onTrialJobMetrics(metric).catch((err: Error) => { - this.criticalError(NNIError.FromError(err, 'Job metrics error: ')); - }); - }); + this.trainingService.addTrialJobMetricListener(this.trialJobMetricListener); this.dispatcher.onCommand((commandType: string, content: string) => { this.onTunerCommand(commandType, content).catch((err: Error) => { diff --git a/src/nni_manager/training_service/common/util.ts b/src/nni_manager/training_service/common/util.ts index 031d277fab..556dc79806 100644 --- a/src/nni_manager/training_service/common/util.ts +++ b/src/nni_manager/training_service/common/util.ts @@ -24,7 +24,10 @@ import { getLogger } from "common/log"; import { countFilesRecursively } from '../../common/utils' import * as cpp from 'child-process-promise'; import * as cp from 'child_process'; -import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData' +import * as os from 'os'; +import * as fs from 'fs'; +import { getNewLine } from '../../common/utils'; +import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData'; import * as path from 'path'; import { String } from 'typescript-string-operations'; import { file } from "../../node_modules/@types/tmp"; @@ -66,6 +69,20 @@ export async function execMkdir(directory: string): Promise { return Promise.resolve(); } +/** + * copy files to the directory + * @param source + * @param destination + */ +export async function execCopydir(source: string, destination: string): Promise { + if (process.platform === 'win32') { + await cpp.exec(`powershell.exe Copy-Item ${source} -Destination ${destination} -Recurse`); + } else { + await cpp.exec(`cp -r ${source} ${destination}`); + } + return Promise.resolve(); +} + 
/** * crete a new file * @param filename @@ -91,8 +108,6 @@ export function execScript(filePath: string): cp.ChildProcess { } } - - /** * output the last line of a file * @param filePath @@ -111,9 +126,9 @@ export async function execTail(filePath: string): Promise{ +export async function execRemove(directory: string): Promise { if (process.platform === 'win32') { - await cpp.exec(`powershell.exe Remove-Item ${directory}`); + await cpp.exec(`powershell.exe Remove-Item ${directory} -Recurse -Force`); } else { await cpp.exec(`rm -rf ${directory}`); } @@ -124,7 +139,7 @@ export async function execRemove(directory: string): Promise{ * kill a process * @param directory */ -export async function execKill(pid: string): Promise{ +export async function execKill(pid: string): Promise { if (process.platform === 'win32') { await cpp.exec(`cmd /c taskkill /PID ${pid} /T /F`); } else { @@ -138,7 +153,7 @@ export async function execKill(pid: string): Promise{ * @param variable * @returns command string */ -export function setEnvironmentVariable(variable: { key: string; value: string }): string{ +export function setEnvironmentVariable(variable: { key: string; value: string }): string { if (process.platform === 'win32') { return `$env:${variable.key}="${variable.value}"`; } @@ -147,6 +162,32 @@ export function setEnvironmentVariable(variable: { key: string; value: string }) } } +/** + * Compress files in directory to tar file + * @param source_path + * @param tar_path + */ +export async function tarAdd(tar_path: string, source_path: string): Promise { + if (process.platform === 'win32') { + tar_path = tar_path.split('\\').join('\\\\'); + source_path = source_path.split('\\').join('\\\\'); + let script: string[] = []; + script.push( + `import os`, + `import tarfile`, + String.Format(`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`, tar_path, source_path), + ` for file in files:`, + ` fullpath = os.path.join(root,file)`, + ` tar.add(fullpath, arcname=file)`, 
+ `tar.close()`); + await fs.promises.writeFile(path.join(os.tmpdir(), 'tar.py'), script.join(getNewLine()), { encoding: 'utf8', mode: 0o777 }); + const tarScript: string = path.join(os.tmpdir(), 'tar.py'); + await cpp.exec(`python ${tarScript}`); + } else { + await cpp.exec(`tar -czf ${tar_path} -C ${source_path} .`); + } + return Promise.resolve(); +} /** * generate script file name diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index d1a2a379a9..ae48251483 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -36,7 +36,7 @@ import { ObservableTimer } from '../../common/observableTimer'; import { HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, NNIManagerIpConfig } from '../../common/trainingService'; -import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus, getRemoteTmpDir,getIPV4Address } from '../../common/utils'; +import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus, getRemoteTmpDir,getIPV4Address, getVersion, unixPathJoin } from '../../common/utils'; import { GPUSummary } from '../common/gpuData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; @@ -48,10 +48,9 @@ import { } from './remoteMachineData'; import { GPU_INFO_COLLECTOR_FORMAT_LINUX } from '../common/gpuData'; import { SSHClientUtility } from './sshClientUtility'; -import { validateCodeDir } from '../common/util'; +import { validateCodeDir, execRemove, execMkdir, execCopydir } from '../common/util'; import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer'; import { 
CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; -import { mkDirP, getVersion } from '../../common/utils'; /** * Training Service implementation for Remote Machine (Linux) @@ -234,7 +233,7 @@ class RemoteMachineTrainingService implements TrainingService { } else if (form.jobType === 'TRIAL') { // Generate trial job id(random) const trialJobId: string = uniqueString(5); - const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); + const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId); const trialJobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail( trialJobId, @@ -354,7 +353,7 @@ class RemoteMachineTrainingService implements TrainingService { case TrialConfigMetadataKey.MACHINE_LIST: await this.setupConnections(value); //remove local temp files - await cpp.exec(`rm -rf ${this.getLocalGpuMetricCollectorDir()}`); + await execRemove(this.getLocalGpuMetricCollectorDir()); break; case TrialConfigMetadataKey.TRIAL_CONFIG: const remoteMachineTrailConfig: TrialConfig = JSON.parse(value); @@ -417,7 +416,7 @@ class RemoteMachineTrainingService implements TrainingService { private async cleanupConnections(): Promise { try{ for (const [rmMeta, sshClientManager] of this.machineSSHClientMap.entries()) { - let jobpidPath: string = path.join(this.getRemoteScriptsPath(rmMeta.username), 'pid'); + let jobpidPath: string = unixPathJoin(this.getRemoteScriptsPath(rmMeta.username), 'pid'); let client: Client | undefined = sshClientManager.getFirstSSHClient(); if(client) { await SSHClientUtility.remoteExeCommand(`pkill -P \`cat ${jobpidPath}\``, client); @@ -438,7 +437,7 @@ class RemoteMachineTrainingService implements TrainingService { */ private getLocalGpuMetricCollectorDir(): string { let userName: string = path.basename(os.homedir()); //get current user name of os - return `${os.tmpdir()}/${userName}/nni/scripts/`; + return path.join(os.tmpdir(), userName, 'nni', 
'scripts'); } /** @@ -447,14 +446,14 @@ class RemoteMachineTrainingService implements TrainingService { */ private async generateGpuMetricsCollectorScript(userName: string): Promise { let gpuMetricCollectorScriptFolder : string = this.getLocalGpuMetricCollectorDir(); - await cpp.exec(`mkdir -p ${path.join(gpuMetricCollectorScriptFolder, userName)}`); + await execMkdir(path.join(gpuMetricCollectorScriptFolder, userName)); //generate gpu_metrics_collector.sh let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh'); const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script const gpuMetricsCollectorScriptContent: string = String.Format( GPU_INFO_COLLECTOR_FORMAT_LINUX, remoteGPUScriptsDir, - path.join(remoteGPUScriptsDir, 'pid'), + unixPathJoin(remoteGPUScriptsDir, 'pid'), ); await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' }); } @@ -481,7 +480,7 @@ class RemoteMachineTrainingService implements TrainingService { private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, conn: Client): Promise { // Create root working directory after ssh connection is ready await this.generateGpuMetricsCollectorScript(rmMeta.username); //generate gpu script in local machine first, will copy to remote machine later - const nniRootDir: string = `${os.tmpdir()}/nni`; + const nniRootDir: string = unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni'); await SSHClientUtility.remoteExeCommand(`mkdir -p ${this.remoteExpRootDir}`, conn); // Copy NNI scripts to remote expeirment working directory @@ -490,15 +489,15 @@ class RemoteMachineTrainingService implements TrainingService { await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteGpuScriptCollectorDir}`, conn); await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, 
conn); //copy gpu_metrics_collector.sh to remote - await SSHClientUtility.copyFileToRemote(path.join(localGpuScriptCollectorDir, rmMeta.username, 'gpu_metrics_collector.sh'), path.join(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh'), conn); + await SSHClientUtility.copyFileToRemote(path.join(localGpuScriptCollectorDir, rmMeta.username, 'gpu_metrics_collector.sh'), unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh'), conn); //Begin to execute gpu_metrics_collection scripts - SSHClientUtility.remoteExeCommand(`bash ${path.join(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn); + SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn); this.timer.subscribe( async (tick: number) => { const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand( - `tail -n 1 ${path.join(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn); + `tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn); if (cmdresult && cmdresult.stdout) { rmMeta.gpuSummary = JSON.parse(cmdresult.stdout); } @@ -531,7 +530,7 @@ class RemoteMachineTrainingService implements TrainingService { } else if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED && rmScheduleResult.scheduleInfo !== undefined) { const rmScheduleInfo : RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo; - const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); + const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId); trialJobDetail.rmMeta = rmScheduleInfo.rmMeta; @@ -575,7 +574,7 @@ class RemoteMachineTrainingService implements TrainingService { const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); await SSHClientUtility.remoteExeCommand(`mkdir -p ${trialWorkingFolder}`, sshClient); - await SSHClientUtility.remoteExeCommand(`mkdir -p ${path.join(trialWorkingFolder, 
'.nni')}`, sshClient); + await SSHClientUtility.remoteExeCommand(`mkdir -p ${unixPathJoin(trialWorkingFolder, '.nni')}`, sshClient); // RemoteMachineRunShellFormat is the run shell format string, // See definition in remoteMachineData.ts @@ -603,20 +602,20 @@ class RemoteMachineTrainingService implements TrainingService { getExperimentId(), trialJobDetail.sequenceId.toString(), this.isMultiPhase, - path.join(trialWorkingFolder, '.nni', 'jobpid'), + unixPathJoin(trialWorkingFolder, '.nni', 'jobpid'), command, nniManagerIp, this.remoteRestServerPort, version, this.logCollection, - path.join(trialWorkingFolder, '.nni', 'code') + unixPathJoin(trialWorkingFolder, '.nni', 'code') ) //create tmp trial working folder locally. - await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`); + await execMkdir(path.join(trialLocalTempFolder, '.nni')); //create tmp trial working folder locally. - await cpp.exec(`cp -r ${this.trialConfig.codeDir}/* ${trialLocalTempFolder}`); + await execCopydir(path.join(this.trialConfig.codeDir, '*'), trialLocalTempFolder); const installScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; // Write NNI installation file to local tmp files await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' }); @@ -626,7 +625,7 @@ class RemoteMachineTrainingService implements TrainingService { // Copy files in codeDir to remote working directory await SSHClientUtility.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, sshClient, this.remoteOS); // Execute command in remote machine - SSHClientUtility.remoteExeCommand(`bash ${path.join(trialWorkingFolder, 'run.sh')}`, sshClient); + SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient); } private async runHostJob(form: HostJobApplicationForm): Promise { @@ -646,8 +645,8 @@ class RemoteMachineTrainingService implements TrainingService { ); await 
fs.promises.writeFile(path.join(localDir, 'run.sh'), runScriptContent, { encoding: 'utf8' }); await SSHClientUtility.copyFileToRemote( - path.join(localDir, 'run.sh'), path.join(remoteDir, 'run.sh'), sshClient); - SSHClientUtility.remoteExeCommand(`bash ${path.join(remoteDir, 'run.sh')}`, sshClient); + path.join(localDir, 'run.sh'), unixPathJoin(remoteDir, 'run.sh'), sshClient); + SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteDir, 'run.sh')}`, sshClient); const jobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail( jobId, 'RUNNING', Date.now(), remoteDir, form, this.generateSequenceId() @@ -672,7 +671,7 @@ class RemoteMachineTrainingService implements TrainingService { private async updateTrialJobStatus(trialJob: RemoteMachineTrialJobDetail, sshClient: Client): Promise { const deferred: Deferred = new Deferred(); const jobpidPath: string = this.getJobPidPath(trialJob.id); - const trialReturnCodeFilePath: string = path.join(this.remoteExpRootDir, 'trials', trialJob.id, '.nni', 'code'); + const trialReturnCodeFilePath: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJob.id, '.nni', 'code'); try { const killResult: number = (await SSHClientUtility.remoteExeCommand(`kill -0 \`cat ${jobpidPath}\``, sshClient)).exitCode; // if the process of jobpid is not alive any more @@ -712,15 +711,15 @@ class RemoteMachineTrainingService implements TrainingService { } private getRemoteScriptsPath(userName: string): string { - return path.join(getRemoteTmpDir(this.remoteOS), userName, 'nni', 'scripts'); + return unixPathJoin(getRemoteTmpDir(this.remoteOS), userName, 'nni', 'scripts'); } private getHostJobRemoteDir(jobId: string): string { - return path.join(this.remoteExpRootDir, 'hostjobs', jobId); + return unixPathJoin(this.remoteExpRootDir, 'hostjobs', jobId); } private getRemoteExperimentRootDir(): string{ - return path.join(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId()); + return 
unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId()); } public get MetricsEmitter() : EventEmitter { @@ -735,9 +734,9 @@ class RemoteMachineTrainingService implements TrainingService { let jobpidPath: string; if (trialJobDetail.form.jobType === 'TRIAL') { - jobpidPath = path.join(trialJobDetail.workingDirectory, '.nni', 'jobpid'); + jobpidPath = unixPathJoin(trialJobDetail.workingDirectory, '.nni', 'jobpid'); } else if (trialJobDetail.form.jobType === 'HOST') { - jobpidPath = path.join(this.getHostJobRemoteDir(jobId), 'jobpid'); + jobpidPath = unixPathJoin(this.getHostJobRemoteDir(jobId), 'jobpid'); } else { throw new Error(`Job type not supported: ${trialJobDetail.form.jobType}`); } @@ -751,14 +750,14 @@ class RemoteMachineTrainingService implements TrainingService { throw new Error('sshClient is undefined.'); } - const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); + const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId); const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); const fileName: string = generateParamFileName(hyperParameters); const localFilepath: string = path.join(trialLocalTempFolder, fileName); await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' }); - await SSHClientUtility.copyFileToRemote(localFilepath, path.join(trialWorkingFolder, fileName), sshClient); + await SSHClientUtility.copyFileToRemote(localFilepath, unixPathJoin(trialWorkingFolder, fileName), sshClient); } private generateSequenceId(): number { diff --git a/src/nni_manager/training_service/remote_machine/sshClientUtility.ts b/src/nni_manager/training_service/remote_machine/sshClientUtility.ts index f1f227ecc5..bd3aa0cf42 100644 --- a/src/nni_manager/training_service/remote_machine/sshClientUtility.ts +++ b/src/nni_manager/training_service/remote_machine/sshClientUtility.ts @@ -28,8 +28,9 @@ import * as 
stream from 'stream'; import { Deferred } from 'ts-deferred'; import { NNIError, NNIErrorNames } from '../../common/errors'; import { getLogger, Logger } from '../../common/log'; -import { uniqueString, getRemoteTmpDir } from '../../common/utils'; +import { uniqueString, getRemoteTmpDir, unixPathJoin } from '../../common/utils'; import { RemoteCommandResult } from './remoteMachineData'; +import { execRemove, tarAdd } from '../common/util'; /** * @@ -47,13 +48,13 @@ export namespace SSHClientUtility { const deferred: Deferred = new Deferred(); const tmpTarName: string = `${uniqueString(10)}.tar.gz`; const localTarPath: string = path.join(os.tmpdir(), tmpTarName); - const remoteTarPath: string = path.join(getRemoteTmpDir(remoteOS), tmpTarName); + const remoteTarPath: string = unixPathJoin(getRemoteTmpDir(remoteOS), tmpTarName); // Compress files in local directory to experiment root directory - await cpp.exec(`tar -czf ${localTarPath} -C ${localDirectory} .`); + await tarAdd(localTarPath, localDirectory); // Copy the compressed file to remoteDirectory and delete it await copyFileToRemote(localTarPath, remoteTarPath, sshClient); - await cpp.exec(`rm ${localTarPath}`); + await execRemove(localTarPath); // Decompress the remote compressed file in and delete it await remoteExeCommand(`tar -oxzf ${remoteTarPath} -C ${remoteDirectory}`, sshClient); await remoteExeCommand(`rm ${remoteTarPath}`, sshClient); diff --git a/test/pipelines-it-remote-windows.yml b/test/pipelines-it-remote-windows.yml new file mode 100644 index 0000000000..8eaff656c1 --- /dev/null +++ b/test/pipelines-it-remote-windows.yml @@ -0,0 +1,49 @@ +jobs: +- job: 'integration_test_remote_windows' + + steps: + - script: python -m pip install --upgrade pip setuptools + displayName: 'Install python tools' + - task: CopyFilesOverSSH@0 + inputs: + sshEndpoint: $(end_point) + targetFolder: /tmp/nnitest/$(Build.BuildId)/nni-remote + overwrite: true + displayName: 'Copy all files to remote machine' + - script: | + 
powershell.exe -file install.ps1 + displayName: 'Install nni toolkit via source code' + - script: | + python -m pip install scikit-learn==0.20.1 --user + displayName: 'Install dependencies for integration tests' + - task: SSH@0 + inputs: + sshEndpoint: $(end_point) + runOptions: inline + inline: cd /tmp/nnitest/$(Build.BuildId)/nni-remote/deployment/pypi;make build + continueOnError: true + displayName: 'build nni bdsit_wheel' + - task: SSH@0 + inputs: + sshEndpoint: $(end_point) + runOptions: commands + commands: python3 /tmp/nnitest/$(Build.BuildId)/nni-remote/test/remote_docker.py --mode start --name $(Build.BuildId) --image nni/nni --os windows + displayName: 'Start docker' + - powershell: | + Write-Host "Downloading Putty..." + (New-Object Net.WebClient).DownloadFile("https://the.earth.li/~sgtatham/putty/latest/w64/pscp.exe", "$(Agent.TempDirectory)\pscp.exe") + $(Agent.TempDirectory)\pscp.exe -hostkey $(hostkey) -pw $(pscp_pwd) $(remote_user)@$(remote_host):/tmp/nnitest/$(Build.BuildId)/port test\port + Get-Content test\port + displayName: 'Get docker port' + - powershell: | + cd test + python generate_ts_config.py --ts remote --remote_user $(docker_user) --remote_host $(remote_host) --remote_port $(Get-Content port) --remote_pwd $(docker_pwd) --nni_manager_ip $(nni_manager_ip) + Get-Content training_service.yml + python config_test.py --ts remote --exclude cifar10,smac,bohb + displayName: 'integration test' + - task: SSH@0 + inputs: + sshEndpoint: $(end_point) + runOptions: commands + commands: python3 /tmp/nnitest/$(Build.BuildId)/nni-remote/test/remote_docker.py --mode stop --name $(Build.BuildId) --os windows + displayName: 'Stop docker' diff --git a/test/remote_docker.py b/test/remote_docker.py index 98f37a1444..576f54ffce 100644 --- a/test/remote_docker.py +++ b/test/remote_docker.py @@ -30,18 +30,33 @@ def find_wheel_package(dir): return file_name return None -def start_container(image, name): +def start_container(image, name, nnimanager_os): '''Start 
docker container, generate a port in /tmp/nnitest/{name}/port file''' port = find_port() source_dir = '/tmp/nnitest/' + name run_cmds = ['docker', 'run', '-d', '-p', str(port) + ':22', '--name', name, '--mount', 'type=bind,source=' + source_dir + ',target=/tmp/nni', image] output = check_output(run_cmds) commit_id = output.decode('utf-8') - wheel_name = find_wheel_package(os.path.join(source_dir, 'dist')) + + if nnimanager_os == 'windows': + wheel_name = find_wheel_package(os.path.join(source_dir, 'nni-remote/deployment/pypi/dist')) + else: + wheel_name = find_wheel_package(os.path.join(source_dir, 'dist')) + if not wheel_name: print('Error: could not find wheel package in {0}'.format(source_dir)) exit(1) - sdk_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', '/tmp/nni/dist/{0}'.format(wheel_name)] + + def get_dist(wheel_name): + '''get the wheel package path''' + if nnimanager_os == 'windows': + return '/tmp/nni/nni-remote/deployment/pypi/dist/{0}'.format(wheel_name) + else: + return '/tmp/nni/dist/{0}'.format(wheel_name) + + pip_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', '--upgrade', 'pip'] + check_call(pip_cmds) + sdk_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', get_dist(wheel_name)] check_call(sdk_cmds) with open(source_dir + '/port', 'w') as file: file.write(str(port)) @@ -58,8 +73,9 @@ def stop_container(name): parser.add_argument('--mode', required=True, choices=['start', 'stop'], dest='mode', help='start or stop a container') parser.add_argument('--name', required=True, dest='name', help='the name of container to be used') parser.add_argument('--image', dest='image', help='the image to be used') + parser.add_argument('--os', dest='os', default='unix', choices=['unix', 'windows'], help='nniManager os version') args = parser.parse_args() if args.mode == 'start': - start_container(args.image, args.name) + start_container(args.image, args.name, args.os) else: stop_container(args.name) diff --git 
a/uninstall.ps1 b/uninstall.ps1 index 29446f3836..578a4f24b7 100644 --- a/uninstall.ps1 +++ b/uninstall.ps1 @@ -1,5 +1,4 @@ - -$NNI_DEPENDENCY_FOLDER = "C:\tmp\$env:USERNAME" +$NNI_DEPENDENCY_FOLDER = [System.IO.Path]::GetTempPath()+$env:USERNAME $env:PYTHONIOENCODING = "UTF-8" if($env:VIRTUAL_ENV){ @@ -27,4 +26,4 @@ Remove-Item "src/nni_manager/node_modules" -Recurse -Force Remove-Item "src/webui/build" -Recurse -Force Remove-Item "src/webui/node_modules" -Recurse -Force Remove-Item $NNI_YARN_FOLDER -Recurse -Force -Remove-Item $NNI_NODE_FOLDER -Recurse -Force \ No newline at end of file +Remove-Item $NNI_NODE_FOLDER -Recurse -Force