Commit

Merge pull request #29 from justinjfu/dev
Re-enable EC2 support
justinjfu authored Mar 24, 2020
2 parents b751a10 + ee16821 commit b4cd75f
Showing 6 changed files with 53 additions and 66 deletions.
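For context, the updated testing/remote/test_ec2.py (the last file in this diff) exercises the re-enabled EC2 mode roughly as in the sketch below. The import paths and the keyword arguments to run_command are assumptions, since the test's tail is truncated in this diff; the mount and launcher arguments mirror the updated test.

    from doodad import mode, mount          # assumed import paths
    from doodad.launch import launch_api
    from testing.config import S3_BUCKET

    # S3-backed output mount, as in the updated test.
    ec2_mount = mount.MountS3(
        s3_path='secret_output',
        mount_point='/output',
    )

    # EC2Autoconfig pulls credentials, key names, and the IAM profile from
    # the doodad autoconfig file; these values mirror the updated test_ec2.py.
    launcher = mode.EC2Autoconfig(
        s3_bucket=S3_BUCKET,
        s3_log_path='test_doodad/gcp_test',
        instance_type='c4.large',
        spot_price=0.03,
        region='us-west-1',
        ami_name='ami-874378e7',
    )

    # Assumed call shape: a shell command, a launch mode, and a list of mounts.
    launch_api.run_command(
        command='echo hello > /output/result.txt',  # hypothetical payload
        mode=launcher,
        mounts=[ec2_mount],
    )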
94 changes: 40 additions & 54 deletions doodad/mode.py
@@ -88,7 +88,6 @@ def __init__(self,
ec2_credentials,
s3_bucket,
s3_log_path,
s3_log_prefix='ec2_experiment',
ami_name=None,
terminate_on_end=True,
region='auto',
@@ -99,14 +98,17 @@ def __init__(self,
aws_key_name=None,
iam_instance_profile_name='doodad',
swap_size=4096,
tag_exp_name='doodad_experiment',
**kwargs):
super(EC2Mode, self).__init__(**kwargs)
self.credentials = ec2_credentials
self.s3_bucket = s3_bucket
self.s3_log_path = s3_log_path
self.s3_log_prefix = s3_log_prefix
self.tag_exp_name = tag_exp_name
self.ami = ami_name
self.terminate_on_end = terminate_on_end
if region == 'auto':
region = 'us-west-1'
self.region = region
self.instance_type = instance_type
self.use_gpu = False
@@ -124,9 +126,8 @@ def dedent(self, s):
return '\n'.join(lines)

def run_script(self, script_name, dry=False, return_output=False, verbose=False):
if not dry:
raise NotImplementedError("EC2 is not implemented.")
assert not return_output
if return_output:
raise ValueError("Cannot return output for AWS scripts.")

default_config = dict(
image_id=self.image_id,
@@ -140,25 +141,21 @@ def run_script(self, script_name, dry=False, return_output=False, verbose=False)
)
aws_config = dict(default_config)
time_key = gcp_util.make_timekey()
exp_name = "{}-{}".format(self.s3_log_prefix, time_key)
exp_prefix = self.s3_log_prefix

s3_base_dir = os.path.join(self.s3_log_path, exp_prefix.replace("_", "-"), exp_name)
s3_log_dir = os.path.join(s3_base_dir, 'logs')
s3_base_dir = os.path.join('s3://'+self.s3_bucket, self.s3_log_path)
s3_log_dir = os.path.join(s3_base_dir, 'outputs')
stdout_log_s3_path = os.path.join(s3_base_dir, 'stdout_$EC2_INSTANCE_ID.log')

sio = six.StringIO()
sio.write("#!/bin/bash\n")
sio.write("truncate -s 0 /home/ubuntu/user_data.log\n")
sio.write("truncate -s 0 /tmp/user_data.log\n")
sio.write("{\n")
sio.write("echo hello!\n")
sio.write('die() { status=$1; shift; echo "FATAL: $*"; exit $status; }\n')
sio.write('EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id`"\n')
sio.write("""
aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_name} --region {aws_region}
""".format(exp_name=exp_name, aws_region=self.region))
sio.write("""
aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=exp_prefix,Value={exp_prefix} --region {aws_region}
""".format(exp_prefix=exp_prefix, aws_region=self.region))
""".format(exp_name=self.tag_exp_name, aws_region=self.region))

# Add swap file
if self.use_gpu:
@@ -182,12 +179,14 @@ def run_script(self, script_name, dry=False, return_output=False, verbose=False)
""")

# 1) Upload script and download it to remote
aws_util.s3_upload(script_name, self.s3_bucket, 'doodad/mount', dry=dry)
script_split = os.path.split(script_name)[-1]
aws_util.s3_upload(script_name, self.s3_bucket, os.path.join('doodad/mount', script_split), dry=dry)
script_s3_filename = 's3://{bucket_name}/doodad/mount/{script_name}'.format(
bucket_name=self.s3_bucket,
script_name=script_name
script_name=script_split
)
sio.write('aws s3 cp {script_s3_filename} /tmp/remote_script.sh\n'.format(
sio.write('aws s3 cp --region {region} {script_s3_filename} /tmp/remote_script.sh\n'.format(
region=self.region,
script_s3_filename=script_s3_filename
))

@@ -225,8 +224,8 @@ def run_script(self, script_name, dry=False, return_output=False, verbose=False)
if [ -z $(curl -Is http://169.254.169.254/latest/meta-data/spot/termination-time | head -1 | grep 404 | cut -d \ -f 2) ]
then
logger "Running shutdown hook."
aws s3 cp --recursive {log_dir} {s3_path}
aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path}
aws s3 cp --region {region} --recursive {log_dir} {s3_path}
aws s3 cp --region {region} /tmp/user_data.log {stdout_log_s3_path}
break
else
# Spot instance not yet marked for termination.
@@ -237,32 +236,32 @@ def run_script(self, script_name, dry=False, return_output=False, verbose=False)
fi
done & echo log sync initiated
""".format(
region=self.region,
log_dir=ec2_local_dir,
s3_path=s3_log_dir,
stdout_log_s3_path=stdout_log_s3_path,
))

sio.write("""
while /bin/true; do
aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path}
aws s3 cp --region {region} /tmp/user_data.log {stdout_log_s3_path}
sleep {periodic_sync_interval}
done & echo sync initiated
""".format(
region=self.region,
stdout_log_s3_path=stdout_log_s3_path,
periodic_sync_interval=self.sync_interval
))

if self.use_gpu:
sio.write("""
for i in {1..800}; do su -c "nvidia-modprobe -u -c=0" ubuntu && break || sleep 3; done
systemctl start nvidia-docker
""")
#sio.write("""
# for i in {1..800}; do su -c "nvidia-modprobe -u -c=0" ec2-user && break || sleep 3; done
# systemctl start nvidia-docker
#""")
sio.write("echo 'Testing nvidia-smi'\n")
sio.write("nvidia-smi\n")
sio.write("echo 'Testing nvidia-smi inside docker'\n")
#sio.write("nvidia-docker run --rm {docker_image} nvidia-smi\n".format(docker_image=self.docker_image))

#docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path, use_docker_generated_name=True)
sio.write("nvidia-docker run --rm {docker_image} nvidia-smi\n".format(docker_image=self.docker_image))

docker_cmd = '%s /tmp/remote_script.sh' % self.shell_interpreter
sio.write(docker_cmd+'\n')
@@ -271,20 +270,22 @@ def run_script(self, script_name, dry=False, return_output=False, verbose=False)
# Ideally the earlier while loop would be sufficient, but it might be
# the case that the earlier while loop isn't fast enough to catch a
# termination. So, we explicitly sync on termination.
sio.write("aws s3 cp --recursive {local_dir} {s3_dir}\n".format(
sio.write("aws s3 cp --region {region} --recursive {local_dir} {s3_dir}\n".format(
region=self.region,
local_dir=ec2_local_dir,
s3_dir=s3_log_dir
))
sio.write("aws s3 cp /home/ubuntu/user_data.log {}\n".format(
stdout_log_s3_path,
sio.write("aws s3 cp --region {region} /tmp/user_data.log {s3_dir}\n".format(
region=self.region,
s3_dir=stdout_log_s3_path,
))

if self.terminate_on_end:
sio.write("""
EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id || die \"wget instance-id has failed: $?\"`"
aws ec2 terminate-instances --instance-ids $EC2_INSTANCE_ID --region {aws_region}
""".format(aws_region=self.region))
sio.write("} >> /home/ubuntu/user_data.log 2>&1\n")
sio.write("} >> /tmp/user_data.log 2>&1\n")

full_script = self.dedent(sio.getvalue())
ec2 = boto3.client(
@@ -294,24 +295,7 @@ def run_script(self, script_name, dry=False, return_output=False, verbose=False)
aws_secret_access_key=self.credentials.aws_secret_key,
)

if len(full_script) > 10000 or len(base64.b64encode(full_script.encode()).decode("utf-8")) > 10000:
s3_path = aws_util.s3_upload(full_script, self.s3_bucket, 'doodad/mount', dry=dry)
sio = six.StringIO()
sio.write("#!/bin/bash\n")
sio.write("""
aws s3 cp {s3_path} /home/ubuntu/remote_script.sh --region {aws_region} && \\
chmod +x /home/ubuntu/remote_script.sh && \\
bash /home/ubuntu/remote_script.sh
""".format(s3_path=s3_path, aws_region=self.s3_bucket))
user_data = self.dedent(sio.getvalue())
else:
user_data = full_script

if verbose:
print(full_script)
with open("/tmp/full_ec2_script", "w") as f:
f.write(full_script)

user_data = full_script
instance_args = dict(
ImageId=aws_config["image_id"],
KeyName=aws_config["key_name"],
@@ -336,7 +320,7 @@ def run_script(self, script_name, dry=False, return_output=False, verbose=False)
DryRun=dry,
InstanceCount=1,
LaunchSpecification=instance_args,
SpotPrice=aws_config["spot_price"],
SpotPrice=str(aws_config["spot_price"]),
# ClientToken=params_list[0]["exp_name"],
)

@@ -354,7 +338,7 @@ def run_script(self, script_name, dry=False, return_output=False, verbose=False)
ec2.create_tags(
Resources=[spot_request_id],
Tags=[
{'Key': 'Name', 'Value': exp_name}
{'Key': 'Name', 'Value': self.tag_exp_name}
],
)
break
@@ -367,15 +351,15 @@ def __init__(self,
autoconfig_file=None,
region='us-west-1',
s3_bucket=None,
ami_image=None,
ami_name=None,
aws_key_name=None,
iam_instance_profile_name=None,
**kwargs
):
# find config file
autoconfig = Autoconfig(autoconfig_file)
s3_bucket = autoconfig.s3_bucket() if s3_bucket is None else s3_bucket
image_id = autoconfig.aws_image_id(region) if ami_image is None else ami_image
image_id = autoconfig.aws_image_id(region) if ami_name is None else ami_name
aws_key_name= autoconfig.aws_key_name(region) if aws_key_name is None else aws_key_name
iam_profile= autoconfig.iam_profile_name() if iam_instance_profile_name is None else iam_instance_profile_name
credentials=AWSCredentials(aws_key=autoconfig.aws_access_key(), aws_secret=autoconfig.aws_access_secret())
@@ -412,6 +396,8 @@ class GCPMode(LaunchMode):
preemptible (bool): Start a preemptible instance
zone (str): GCE compute zone.
instance_type (str): GCE instance type
gpu_model (str): GCP GPU model. See https://cloud.google.com/compute/docs/gpus.
data_sync_interval (int): Number of seconds before each sync on mounts.
"""
def __init__(self,
gcp_project,
@@ -457,7 +443,7 @@ def print_launch_message(self):

def run_script(self, script, dry=False, return_output=False, verbose=False):
if return_output:
raise NotImplementedError()
raise ValueError("Cannot return output for GCP scripts.")

# Upload script to GCS
cmd_split = shlex.split(script)
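The rewritten run_script above assembles the instance's user-data as a single bash script: truncate /tmp/user_data.log, wrap everything in a { ... } block redirected into that log, tag the instance, start background S3 sync loops, run the payload, do a final sync, and self-terminate. A minimal, self-contained sketch of that pattern follows; the command strings follow the hunks above, while the function itself and its parameters are hypothetical simplifications.

    import six  # the diff builds the script with six.StringIO

    def build_user_data(region, tag_exp_name, s3_log_dir, stdout_log_s3_path,
                        sync_interval, shell_interpreter='bash'):
        sio = six.StringIO()
        sio.write("#!/bin/bash\n")
        sio.write("truncate -s 0 /tmp/user_data.log\n")
        sio.write("{\n")  # everything below is captured into the log
        # Tag the instance so it is easy to identify in the EC2 console.
        sio.write('EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id`"\n')
        sio.write("aws ec2 create-tags --resources $EC2_INSTANCE_ID "
                  "--tags Key=Name,Value={} --region {}\n".format(tag_exp_name, region))
        # Background loop: periodically push stdout to S3 (note the explicit --region,
        # which this commit adds to every aws s3 cp call).
        sio.write("while /bin/true; do\n"
                  "  aws s3 cp --region {region} /tmp/user_data.log {stdout}\n"
                  "  sleep {interval}\n"
                  "done & echo sync initiated\n".format(
                      region=region, stdout=stdout_log_s3_path, interval=sync_interval))
        # Run the payload, then a final sync and self-termination.
        sio.write("{} /tmp/remote_script.sh\n".format(shell_interpreter))
        sio.write("aws s3 cp --region {} --recursive /doodad {}\n".format(region, s3_log_dir))
        sio.write("aws ec2 terminate-instances --instance-ids $EC2_INSTANCE_ID --region {}\n".format(region))
        sio.write("} >> /tmp/user_data.log 2>&1\n")
        return sio.getvalue()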
5 changes: 3 additions & 2 deletions doodad/mount.py
@@ -169,7 +169,7 @@ def __init__(self,
raise NotImplementedError('Local dir cannot be absolute')
else:
# We store everything into a fixed dir /doodad on the remote machine
# so GCPMode knows to simply sync /doodad
# so EC2Mode knows to simply sync /doodad
# (this is b/c we no longer pass in mounts to the launch mode)
self.sync_dir = os.path.join('/doodad', s3_path)
self.output = output
@@ -184,7 +184,7 @@ def dar_build_archive(self, deps_dir):
return

def dar_extract_command(self):
return 'echo helloMountEC2'
return 'echo helloMountS3'


class MountGCP(Mount):
Expand Down Expand Up @@ -225,3 +225,4 @@ def dar_build_archive(self, deps_dir):

def dar_extract_command(self):
return 'echo helloMountGCP'

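The corrected comment in this file describes a convention worth making explicit: every S3 output mount is staged under a fixed /doodad directory on the remote machine, so EC2Mode can sync the single /doodad tree to S3 without being told about individual mounts. A tiny illustration (the helper function is hypothetical; the join mirrors the sync_dir assignment in the hunk above):

    import os

    def remote_sync_dir(s3_path):
        # mirrors `self.sync_dir = os.path.join('/doodad', s3_path)` from the diff
        return os.path.join('/doodad', s3_path)

    assert remote_sync_dir('secret_output') == '/doodad/secret_output'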
2 changes: 1 addition & 1 deletion scripts/gcp/gcp_shutdown_script.sh
@@ -8,7 +8,7 @@ bucket_name=$(query_metadata bucket_name)
gcp_bucket_path=$(query_metadata gcp_bucket_path)
instance_name=$(curl http://metadata/computeMetadata/v1/instance/name -H "Metadata-Flavor: Google")

gsutil cp -r /doodad/* gs://$bucket_name/$gcp_bucket_path/logs
gsutil cp -r /doodad/* gs://$bucket_name/$gcp_bucket_path/outputs
# sync stdout
gcp_bucket_path=${gcp_bucket_path%/} # remove trailing slash if present
gsutil cp /home/ubuntu/user_data.log gs://$bucket_name/$gcp_bucket_path/${instance_name}_stdout.log
4 changes: 2 additions & 2 deletions scripts/gcp/gcp_startup_script.sh
@@ -48,9 +48,9 @@ query_metadata() {
# we just make the archive store everything into /doodad
mkdir -p /doodad
while /bin/true; do
gsutil -m rsync -r /doodad gs://$bucket_name/$gcp_bucket_path/logs
gsutil -m rsync -r /doodad gs://$bucket_name/$gcp_bucket_path/outputs
sleep $data_sync_interval
done & echo sync from /doodad to gs://$bucket_name/$gcp_bucket_path/logs initiated
done & echo sync from /doodad to gs://$bucket_name/$gcp_bucket_path/outputs initiated

# sync stdout
gcp_bucket_path=${gcp_bucket_path%/} # remove trailing slash if present
1 change: 1 addition & 0 deletions testing/config.py.example
@@ -1,3 +1,4 @@
GCP_BUCKET=your_bucket
GCP_PROJECT=your_project
GCP_IMAGE=your_image
S3_BUCKET=your_bucket
13 changes: 6 additions & 7 deletions testing/remote/test_ec2.py
@@ -5,8 +5,8 @@
from testing.config import S3_BUCKET

def run():
ec2_mount = mount.MountEC2(
gcp_path='secret_output',
ec2_mount = mount.MountS3(
s3_path='secret_output',
mount_point='/output'
)
local_mount = mount.MountLocal(
@@ -15,15 +15,14 @@ def run():
output=False
)
mounts = [local_mount, ec2_mount]
ec2_credentials

launcher = mode.EC2Autoconfig(
s3_bucket=S3_BUCKET,
s3_log_path='test_doodad/gcp_test',
instance_type='t3.nano',
spot_price=0.02,
region='us-west1',
ami_name='ami-089408c670f3e10c0',
instance_type='c4.large',
spot_price=0.03,
region='us-west-1',
ami_name='ami-874378e7',
)

launch_api.run_command(