Skip to content

Commit

Permalink
add a lock to prevent concurrent checkpointing of gem5
Browse files Browse the repository at this point in the history
  • Loading branch information
suchandler96 committed Jan 30, 2024
1 parent 085dc5b commit 622c120
Showing 1 changed file with 65 additions and 38 deletions.
103 changes: 65 additions & 38 deletions bsc-util/nvdla_utilities/sweep/sweeper.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,31 @@
}


class LockFile:
def __init__(self, path):
self.path = os.path.abspath(path)

def __enter__(self):
first_iteration = True
while os.path.exists(self.path):
creation_time = os.path.getmtime(self.path)
current_time = time.time()
if current_time - creation_time >= 3600: # if the "lock" file is older than an hour
os.remove(self.path)
print(f"Removed old 'lock' file: '{self.path}'")
break
if first_iteration:
print(f"'{self.path}' already exists, waiting for it to be deleted...")
first_iteration = False
time.sleep(10)
open(self.path, 'a').close() # This creates an empty file
print(f"Entered code block, created '{self.path}'.")

def __exit__(self, exc_type, exc_val, exc_tb):
os.remove(self.path)
print(f"Exited code block, removed '{self.path}'.")


class Sweeper:
def __init__(self, args):
self.home_path = os.popen("cd ~/ && pwd").readlines()[0].strip('\n')
Expand Down Expand Up @@ -298,40 +323,41 @@ def run_all(self, args):

assert 0 <= args.machine_id < args.num_machines
if args.machine_id == 0 and not args.skip_checkpoint:
print("Generating checkpoint...")
os.makedirs(os.path.join(self.gem5_nvdla_dir, "m5out"), exist_ok=True) # in case m5out doesn't exist
bin_path = "build/ARM/gem5.opt" if args.gem5_binary.endswith("opt") else "build/ARM/gem5.fast"
lg = os.popen("cd " + self.gem5_nvdla_dir + " && " + bin_path + " configs/example/arm/fs_bigLITTLE_RTL.py"
" --big-cpus 0 --little-cpus 1 --cpu-type atomic"
" --bootscript=configs/boot/hack_back_ckpt.rcS").readlines()
# get the exact directory of the checkpoint just generated
tick_match = re.search("at tick ([0-9]+)", lg[-4])
assert tick_match is not None
cpt_dir_name = "cpt." + tick_match.group(1)
self.cpt_dir = os.path.join(self.gem5_nvdla_dir, "m5out", cpt_dir_name)

# after generating the checkpoint, we can apply it to the scripts
# change the checkpoint directory of all simulation points, without the need to manually change them
for pt_dir in self.pt_dirs:
with open(os.path.join(pt_dir, "run.sh")) as fp:
run_sh_lines = fp.readlines()
for i, line in enumerate(run_sh_lines):
if "restore-from" in line:
if "cpt-dir" in line: # placeholder found
change_config_file(pt_dir, "run.sh", {"cpt-dir": self.cpt_dir})
else:
cpt_match = re.search(r"(cpt\.[0-9]+)", line)
assert cpt_match is not None
run_sh_lines[i] = run_sh_lines[i].replace(cpt_match.group(1), cpt_dir_name)
with open(os.path.join(pt_dir, "run.sh"), "w") as fp:
fp.writelines(run_sh_lines)
break

esc_home_path = self.home_path.replace('/', '\\/')
esc_new_home = self.new_home.replace('/', '\\/')
esc_out_dir = self.out_dir.replace('/', '\\/')
os.system('sed -i "s/' + esc_home_path + '/' + esc_new_home + '/g" `grep "' +
esc_home_path + '" -rl ' + esc_out_dir + '`')
with LockFile(os.path.join(self.gem5_nvdla_dir, "m5out/checkpointing_lock")):
print("Generating checkpoint...")
bin_path = "build/ARM/gem5.opt" if args.gem5_binary.endswith("opt") else "build/ARM/gem5.fast"
lg = os.popen("cd " + self.gem5_nvdla_dir + " && " + bin_path + " configs/example/arm/fs_bigLITTLE_RTL.py"
" --big-cpus 0 --little-cpus 1 --cpu-type atomic"
" --bootscript=configs/boot/hack_back_ckpt.rcS").readlines()
# get the exact directory of the checkpoint just generated
tick_match = re.search("at tick ([0-9]+)", lg[-4])
assert tick_match is not None
cpt_dir_name = "cpt." + tick_match.group(1)
self.cpt_dir = os.path.join(self.gem5_nvdla_dir, "m5out", cpt_dir_name)

# after generating the checkpoint, we can apply it to the scripts
# change the checkpoint directory of all simulation points, without the need to manually change them
for pt_dir in self.pt_dirs:
with open(os.path.join(pt_dir, "run.sh")) as fp:
run_sh_lines = fp.readlines()
for i, line in enumerate(run_sh_lines):
if "restore-from" in line:
if "cpt-dir" in line: # placeholder found
change_config_file(pt_dir, "run.sh", {"cpt-dir": self.cpt_dir})
else:
cpt_match = re.search(r"(cpt\.[0-9]+)", line)
assert cpt_match is not None
run_sh_lines[i] = run_sh_lines[i].replace(cpt_match.group(1), cpt_dir_name)
with open(os.path.join(pt_dir, "run.sh"), "w") as fp:
fp.writelines(run_sh_lines)
break

esc_home_path = self.home_path.replace('/', '\\/')
esc_new_home = self.new_home.replace('/', '\\/')
esc_out_dir = self.out_dir.replace('/', '\\/')
os.system('sed -i "s/' + esc_home_path + '/' + esc_new_home + '/g" `grep "' +
esc_home_path + '" -rl ' + esc_out_dir + '`')

print("machine_id = %d, running all data points..." % args.machine_id)
assert args.num_machines > 0
Expand All @@ -346,11 +372,12 @@ def run_all(self, args):
sims = []
pool = mp.Pool(
initializer=_init_counter, initargs=(counter, ), processes=args.num_threads)
for p in range(len(this_machine_pt_dirs)):
cmd = os.path.join(this_machine_pt_dirs[p], "run.sh")
sims.append(pool.apply_async(_run_simulation, args=(cmd, )))
time.sleep(0.5) # sleep for a while before launching next to avoid a gem5 bug (socket bind() failed)
# see https://gem5-users.gem5.narkive.com/tvnOFKtP/panic-listensocket-listen-listen-failed
with LockFile(os.path.join(self.gem5_nvdla_dir, "m5out/start_sim_lock")):
for p in range(len(this_machine_pt_dirs)):
cmd = os.path.join(this_machine_pt_dirs[p], "run.sh")
sims.append(pool.apply_async(_run_simulation, args=(cmd, )))
time.sleep(0.5) # sleep for a while before launching next to avoid a gem5 bug (socket bind() failed)
# see https://gem5-users.gem5.narkive.com/tvnOFKtP/panic-listensocket-listen-listen-failed
for sim in sims:
sim.get()
pool.close()
Expand Down

0 comments on commit 622c120

Please sign in to comment.