large_scale_boot benchmark that is capable of booting as many VMs as there is quota for.

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=301609210
jellyfishcake authored and yuyantingzero committed Mar 20, 2020
1 parent 4e76896 commit 2a6ed3d
Showing 4 changed files with 527 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGES.next.md
@@ -34,6 +34,7 @@
- Add replication cluster support to bigtable.
- Add Basic VPC peering support
- Add Snowflake Warehouse support
- Add large_scale_boot benchmark.

### Enhancements:

389 changes: 389 additions & 0 deletions perfkitbenchmarker/linux_benchmarks/large_scale_boot_benchmark.py
@@ -0,0 +1,389 @@
# Copyright 2020 PerfKitBenchmarker Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Run large scale boot benchmark for virtual machines.
This benchmark measures the boot time for virtual machines. It is different from
the cluster_boot benchmark because this one scales better and is capable of
measuring boot time for a large number of machines.
The way it works is as follows:
1) benchmark spins up a variable number of launcher server VM(s) (num_vms flag).
2) launcher server VM(s) start up a server that listens for curl requests.
3) launcher server VM(s) record the system time as start time
4) launcher server VM(s) run a script to create N VMs per server.
5) VMs curl the launcher server as soon as they start up.
6) once launcher server VM(s) get a curl request, it use separate process to
confirm connection.
7) launcher server VM(s) records the system time as end time for this VM.
8) launcher server VM(s) report the measurements
9) total provisioning time is that of the slowest VM.
10) VMs have startup scripts to shut themselves down after TIMEOUT seconds.
"""
import logging
import posixpath

from perfkitbenchmarker import configs
from perfkitbenchmarker import data
from perfkitbenchmarker import errors
from perfkitbenchmarker import flags
from perfkitbenchmarker import os_types
from perfkitbenchmarker import sample
from perfkitbenchmarker import vm_util

from perfkitbenchmarker.providers.gcp import util as gcp_util


BENCHMARK_NAME = 'large_scale_boot'
BENCHMARK_CONFIG = """
large_scale_boot:
  description: >
    Create a cluster of launcher servers,
    where each launcher server launches FLAGS.boots_per_launcher machines.
  vm_groups:
    servers:
      vm_spec:
        GCP:
          machine_type: n1-standard-2
          zone: us-central1-a
          boot_disk_type: pd-ssd
      vm_count: 1
      os_type: debian9
    clients:
      vm_spec:
        GCP:
          machine_type: n1-standard-2
          boot_disk_type: pd-ssd
      os_type: debian9
      vm_count: 1
"""

FLAGS = flags.FLAGS
flags.DEFINE_integer('boots_per_launcher', 1, 'Number of VMs to boot per '
                     'launcher server VM. Defaults to 1.')
flags.register_validator('boots_per_launcher',
                         lambda value: 1 <= value <= 1000,
                         message='The number of VMs booted by each launcher '
                         'should be between 1 and 1000.')
flags.DEFINE_string('boot_os_type', 'debian9', 'OS to boot on the VMs. '
                    'Defaults to debian9. The OS on the launcher server VM is '
                    'set using the os_type flag.')
flags.DEFINE_string('boot_machine_type', 'n1-standard-2', 'Machine type to '
                    'boot on the VMs. Defaults to n1-standard-2. Set the '
                    'machine type of the launcher server VM with the '
                    'launcher_machine_type flag.')
flags.DEFINE_string('launcher_machine_type', 'n1-standard-16', 'Machine type '
                    'of the launcher server VMs. Defaults to n1-standard-16. '
                    'Set the machine type of the boot VMs with the '
                    'boot_machine_type flag.')
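# Editor's note (not part of this commit): with the flags above, a typical
# invocation might look like the following (two launchers booting 100 VMs
# each); the flag values here are illustrative assumptions.
#   ./pkb.py --benchmarks=large_scale_boot --cloud=GCP --num_vms=2 \
#       --boots_per_launcher=100 --boot_machine_type=n1-standard-2 \
#       --launcher_machine_type=n1-standard-16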

# remote tmp directory used for this benchmark.
_REMOTE_DIR = vm_util.VM_TMP_DIR
# boot script to use on the launcher server vms.
_BOOT_SCRIPT = 'boot_script.sh'
# local boot template to build boot script.
_BOOT_TEMPLATE = 'large_scale_boot/boot_script.sh.jinja2'
# Remote boot script path
_BOOT_PATH = posixpath.join(_REMOTE_DIR, _BOOT_SCRIPT)
# python listener server to run on launcher server vms.
_LISTENER_SERVER = 'large_scale_boot/listener_server.py'
# log for python listener server.
_LISTENER_SERVER_LOG = 'http.log'
# clean up script to use on the launcher server vms.
_CLEAN_UP_SCRIPT = 'clean_up.sh'
# local clean up template to build the clean up script
_CLEAN_UP_TEMPLATE = 'large_scale_boot/clean_up_script.jinja2'
# Remote clean up script path
_CLEAN_UP_SCRIPT_PATH = posixpath.join(_REMOTE_DIR, _CLEAN_UP_SCRIPT)
# port where listener server listens for incoming booted vms.
_PORT = 8000
# file to record the start time of the boots using system time in nanoseconds.
_START_TIME_FILE = 'start_time'
# start time file path
_START_TIME_FILE_PATH = posixpath.join(_REMOTE_DIR, _START_TIME_FILE)
# file to record the end time of the boots using system time in nanoseconds.
_RESULTS_FILE = 'results'
# results file path
_RESULTS_FILE_PATH = posixpath.join(_REMOTE_DIR, _RESULTS_FILE)
# Seconds to wait for vms to boot.
_TIMEOUT_SECONDS = 60 * 10
# Seconds to delay between polls for launcher server task completion.
_POLLING_DELAY = 3
# command to start the listener server
_START_SERVER_COMMAND = (
    'python3 {server} {port} {results_path} > {server_log} 2>&1 &'.format(
        server=posixpath.join(_REMOTE_DIR, _LISTENER_SERVER.split('/')[-1]),
        port=_PORT,
        results_path=_RESULTS_FILE_PATH,
        server_log=_LISTENER_SERVER_LOG))
# sha256sum for preprovisioned service account credentials.
# If not using service account credentials from preprovisioned data bucket,
# use --gcp_service_account_key_file flag to specify the same credentials.
BENCHMARK_DATA = {
    'p3rf-scaling-a1828b03ba93.json':
        'c0cf08d79dd717e33e155164e35c8330bb26e9031eafab30064fff31afa86e99',
}


class InsufficientBootsError(Exception):
  """Error thrown if there are insufficient boots during wait."""


def CheckPrerequisites(_):
  """Verifies that the required resources are present.

  Raises:
    perfkitbenchmarker.data.ResourceNotFound: On missing resource.
  """
  data.ResourcePath(_BOOT_TEMPLATE)
  data.ResourcePath(_LISTENER_SERVER)
  data.ResourcePath(_CLEAN_UP_TEMPLATE)
  if FLAGS.cloud != 'GCP':
    raise errors.Benchmarks.PrepareException(
        'Booting VMs on non-GCP clouds is not yet supported.')
  if FLAGS.boot_os_type in os_types.WINDOWS_OS_TYPES:
    raise errors.Benchmarks.PrepareException(
        'Booting Windows VMs is not yet supported')


def GetConfig(user_config):
  """Loads and updates the benchmark config with user flags.

  Args:
    user_config: user supplied configuration (flags and config file)

  Returns:
    loaded benchmark configuration
  """
  config = configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
  launcher_config = config['vm_groups']['servers']
  launcher_config['vm_count'] = FLAGS.num_vms
  launcher_config['vm_spec'][FLAGS.cloud]['machine_type'] = (
      FLAGS.launcher_machine_type)
  booter_template = config['vm_groups']['clients']
  booter_template['os_type'] = FLAGS.boot_os_type
  booter_template['vm_spec'][FLAGS.cloud]['machine_type'] = (
      FLAGS.boot_machine_type)
  if FLAGS.machine_type:
    raise errors.Setup.InvalidConfigurationError(
        'Do not set the machine_type flag as it will override both launcher '
        'and booter machine types. Use the launcher_machine_type and '
        'boot_machine_type flags instead.')
  if booter_template['vm_count'] != 1:
    raise errors.Setup.InvalidConfigurationError(
        'Booter_template is a configuration template VM. '
        'Booter count should be set by number of launchers (FLAGS.num_vms) '
        'and booters per launcher (FLAGS.boots_per_launcher).')
  return config


def _BuildContext(launcher_vm, booter_template_vm):
  """Returns the context variables for Jinja2 template rendering."""
  return {
      'cloud': FLAGS.cloud,
      'start_time_file': _START_TIME_FILE_PATH,
      'vm_count': FLAGS.boots_per_launcher,
      'launcher_vm_name': launcher_vm.name,
      'project': FLAGS.project,
      'image_family': booter_template_vm.image_family,
      'image_project': booter_template_vm.image_project,
      'boot_machine_type': booter_template_vm.machine_type,
      'server_ip': launcher_vm.internal_ip,
      'server_port': _PORT,
      'timeout': _TIMEOUT_SECONDS,
      'zone': launcher_vm.zone,
      'gcloud_path': FLAGS.gcloud_path,
  }
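# Editor's illustrative sketch (not part of this commit): RenderTemplate
# substitutes the context above into large_scale_boot/boot_script.sh.jinja2,
# which is not shown in this diff. The hypothetical fragment below only
# demonstrates how such a template could consume the context variables.
import jinja2


def _SketchRenderBootCommand(context):
  """Renders a gcloud create command from a _BuildContext()-style dict."""
  template = jinja2.Template(
      '{{ gcloud_path }} compute instances create --project={{ project }} '
      '--zone={{ zone }} --machine-type={{ boot_machine_type }}')
  return template.render(context)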


def _Install(launcher_vm, booter_template_vm):
  """Installs benchmark scripts and packages on the launcher vm."""
  # Render the boot script on the launcher server VM(s).
  context = _BuildContext(launcher_vm, booter_template_vm)
  launcher_vm.RenderTemplate(data.ResourcePath(_BOOT_TEMPLATE), _BOOT_PATH,
                             context)

  # Install and start the listener server on the launcher VM(s).
  launcher_vm.InstallPackages('netcat')
  launcher_vm.PushDataFile(_LISTENER_SERVER, _REMOTE_DIR)
  launcher_vm.RemoteCommand(_START_SERVER_COMMAND)

  # Render the clean up script on the launcher server VM(s).
  launcher_vm.RenderTemplate(data.ResourcePath(_CLEAN_UP_TEMPLATE),
                             _CLEAN_UP_SCRIPT_PATH, context)


def Prepare(benchmark_spec):
  """Prepares the launcher server VM(s).

  Prepare the launcher server VM(s) by:
  1) Building the script that each launcher server will use to kick off boots.
  2) Starting a listening server to wait for booting VMs.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.
  """
  benchmark_spec.always_call_cleanup = True
  launcher_vms = benchmark_spec.vm_groups['servers']
  booter_template_vm = benchmark_spec.vm_groups['clients'][0]
  # Set up account/IAM credentials/permissions on the launcher servers.
  if FLAGS.cloud == 'GCP':
    for vm in launcher_vms:
      gcp_util.AuthenticateServiceAccount(vm, benchmark=BENCHMARK_NAME)

  # Fail early if a launcher would boot more than 50 VMs per vCPU.
  # High CPU usage can negatively impact measured boot times.
  if FLAGS.boots_per_launcher > (launcher_vms[0].num_cpus * 50):
    raise errors.Setup.InvalidConfigurationError(
        'Each launcher server VM is launching too many VMs. '
        'Increase launcher server VM size or decrease boots_per_launcher. '
        'For a VM with {} CPUs, launch at most {} VMs.'.format(
            launcher_vms[0].num_cpus, launcher_vms[0].num_cpus * 50))

  vm_util.RunThreaded(
      lambda vm: _Install(vm, booter_template_vm), launcher_vms)


def _GetExpectedBoots():
  """Return the number of expected boots."""
  return int(FLAGS.num_vms) * int(FLAGS.boots_per_launcher)


@vm_util.Retry(poll_interval=_POLLING_DELAY, timeout=_TIMEOUT_SECONDS,
               retryable_exceptions=(InsufficientBootsError,))
def _WaitForResponses(launcher_vms):
  """Wait for all results, server shutdown, or TIMEOUT_SECONDS."""

  # If any listener server exited, stop waiting.
  def _LauncherError(vm):
    error, _ = vm.RemoteCommand('grep ERROR ' + _LISTENER_SERVER_LOG,
                                ignore_failure=True)
    return error

  error_str = vm_util.RunThreaded(_LauncherError, launcher_vms)
  if any(error_str):
    raise errors.Benchmarks.RunError(
        'Some listening servers errored out: %s' % error_str)

  def _BootCountInLauncher(vm):
    stdout, _ = vm.RemoteCommand('wc -l ' + _RESULTS_FILE_PATH)
    return int(stdout.split()[0])

  boots = vm_util.RunThreaded(_BootCountInLauncher, launcher_vms)
  for vm, boot_count in zip(launcher_vms, boots):
    logging.info('Launcher %s reported %d/%d',
                 vm.internal_ip, boot_count, FLAGS.boots_per_launcher)
  reporting_vms_count = sum(boots)
  if reporting_vms_count != _GetExpectedBoots():
    raise InsufficientBootsError(
        'Launcher VMs reported %d total boots. Expecting %d.' %
        (reporting_vms_count, _GetExpectedBoots()))


def _ParseResult(launcher_vms):
  """Parses the results on the launcher VMs and returns samples.

  Boot time is the boot duration of the slowest machine.

  Args:
    launcher_vms: Launcher server VMs.

  Returns:
    A list of benchmark samples.
  """
  vm_count = 0
  slowest_time = -1
  get_starttime_cmd = 'cat {starttime}'.format(starttime=_START_TIME_FILE_PATH)
  get_results_cmd = 'cat {results}'.format(results=_RESULTS_FILE_PATH)
  samples = []
  common_metadata = {
      'cloud': FLAGS.cloud,
      'num_launchers': FLAGS.num_vms,
      'expected_boots_per_launcher': FLAGS.boots_per_launcher,
      'boot_os_type': FLAGS.boot_os_type,
      'boot_machine_type': FLAGS.boot_machine_type,
      'launcher_machine_type': FLAGS.launcher_machine_type,
  }
  for vm in launcher_vms:
    start_time_str, _ = vm.RemoteCommand(get_starttime_cmd)
    start_time = int(start_time_str)
    results, _ = vm.RemoteCommand(get_results_cmd)
    cur_launcher_success = 0
    cur_launcher_closed_incoming = 0
    durations = []
    for line in results.splitlines():
      state, _, end_time_str = line.split(':')
      end_time = int(end_time_str)
      if state == 'Pass':
        duration_in_ns = end_time - start_time
        durations.append(duration_in_ns)
        slowest_time = max(slowest_time, duration_in_ns)
        cur_launcher_success += 1
      elif state == 'Fail':
        # The outgoing port was open but the incoming port was closed.
        cur_launcher_closed_incoming += 1

    vm_count += cur_launcher_success
    current_metadata = {
        'zone': vm.zone,
        'launcher_successes': cur_launcher_success,
        'launcher_boot_durations_ns': durations,
        'launcher_closed_incoming': cur_launcher_closed_incoming,
    }
    current_metadata.update(common_metadata)
    samples.append(sample.Sample('Launcher Boot Details', -1,
                                 '', current_metadata))

  samples.append(sample.Sample('Cluster Max Boot Time', slowest_time,
                               'nanoseconds', common_metadata))
  samples.append(sample.Sample('Cluster Expected Boots', _GetExpectedBoots(),
                               '', common_metadata))
  samples.append(sample.Sample('Cluster Success Boots', vm_count,
                               '', common_metadata))
  return samples
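# Editor's illustrative sketch (not part of this commit): how one results line
# (format 'state:host:end_time_ns', per the parsing above) is turned into a
# boot duration. The values are hypothetical.
def _SketchParseLine(line, start_time_ns):
  state, _, end_time_str = line.split(':')
  return state, int(end_time_str) - start_time_ns

# _SketchParseLine('Pass:10.128.0.7:1584678000123456789', 1584678000000000000)
# returns ('Pass', 123456789).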


def Run(benchmark_spec):
  """Kicks off the boot script on the launcher server VMs.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of benchmark samples.
  """
  launcher_vms = benchmark_spec.vm_groups['servers']
  vm_util.RunThreaded(
      lambda vm: vm.RemoteCommand('bash {} 2>&1 | tee log'.format(_BOOT_PATH)),
      launcher_vms)
  try:
    _WaitForResponses(launcher_vms)
  except InsufficientBootsError:
    # On really large-scale boots, some failures are expected.
    logging.info('Some VMs failed to boot.')
  return _ParseResult(launcher_vms)


def Cleanup(benchmark_spec):
  """Cleans up after the benchmark run.

  Launcher VMs and the booter template VM are deleted by PKB resource
  management. Boot VMs are self-destructing, but we make a second attempt at
  destroying them anyway for good hygiene.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.
  """
  launcher_vms = benchmark_spec.vm_groups['servers']
  command = 'bash {} 2>&1 | tee clean_up_log'.format(_CLEAN_UP_SCRIPT_PATH)
  vm_util.RunThreaded(
      lambda vm: vm.RemoteCommand(command),
      launcher_vms)