Use RTL simulation results for liveness analysis in remapping phase
suchandler96 committed Mar 2, 2024
1 parent 9faa89f commit 716d341
Showing 7 changed files with 982 additions and 882 deletions.
28 changes: 1 addition & 27 deletions bsc-util/nvdla_utilities/BUILD.md
@@ -248,36 +248,10 @@ Finally, remember to `source /root/.bashrc` after modification.
# (1) verilator -O3 to the verilog -> cpp code generation phase;
# (2) clang -O3 -Ofast to the cpp -> .o compilation;
# (3) profiling-guided optimization (PGO) to the cpp -> .o compilation.
# So the compilation done below is for generating the binary for profiling.
# We also need a second pass to utilize profiling data.
# If you don't want to do PGO, please uncomment the line in the Makefile annotated
# with "without profiling-guided optimization" and comment out the others.
# But please note in our case PGO can bring 40% performance improvement.
# Please note in our case PGO can bring 40% performance improvement.
# We have also fixed some of the UNOPTFLAT warnings in the original NVDLA
# vmod/, which is included in nvdla_hw.patch. If you don't want these
# modifications, simply do `git checkout` for files under nvdla/hw/vmod/.
(gem5)# ./tools/bin/tmake -build verilator
# If PGO is not enabled, compilation is already done here
# If PGO is enabled, continue as follows
(gem5)# cd verif/verilator
(gem5)# mkdir ../traceplayer/lenet
(gem5)# cp /home/gem5-nvdla/bsc-util/nvdla_utilities/example_usage/traces/lenet/input.txn ../traceplayer/lenet/
(gem5)# make run TEST=cc_alexnet_conv5_relu5_int16_dtest_cvsram
(gem5)# make run TEST=googlenet_conv2_3x3_int16
(gem5)# make run TEST=lenet
# Do 3 profiling tests: lenet, a CONV layer in alexnet, a CONV layer in googlenet.
# gather the profiling results
(gem5)# llvm-profdata-10 merge -output=../../LAG.profdata \
../../outdir/nv_full/verilator/test/lenet/*.profraw \
../../outdir/nv_full/verilator/test/cc_alexnet_conv5_relu5_int16_dtest_cvsram/*.profraw \
../../outdir/nv_full/verilator/test/googlenet_conv2_3x3_int16/*.profraw
(gem5)# vi Makefile
# uncomment the line annotated with 'after collecting data with "llvm-profdata merge xxx"'
# and comment out the original one.
(gem5)# ./tools/bin/tmake -build verilator
```

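If you script this workflow, the profile-merge step can also be wrapped in a few lines of Python. The helper below is only a convenience sketch: it assumes the `outdir/nv_full/verilator/test/<test>/*.profraw` layout and the `llvm-profdata-10` binary used above, and the hardware-tree path in the example call is hypothetical.

```python
# Sketch: merge the .profraw files from the three profiling runs into LAG.profdata,
# mirroring the llvm-profdata-10 command above. Test names and directory layout are
# taken from this section; the hw_root used in __main__ is a hypothetical example.
import glob
import os
import subprocess

TESTS = [
    "lenet",
    "cc_alexnet_conv5_relu5_int16_dtest_cvsram",
    "googlenet_conv2_3x3_int16",
]


def merge_profraw(hw_root, out_name="LAG.profdata"):
    profraw_files = []
    for test in TESTS:
        pattern = os.path.join(hw_root, "outdir/nv_full/verilator/test", test, "*.profraw")
        profraw_files.extend(glob.glob(pattern))
    if not profraw_files:
        raise FileNotFoundError("no .profraw files found; run the three profiling tests first")
    subprocess.run(
        ["llvm-profdata-10", "merge", "-output=" + os.path.join(hw_root, out_name)] + profraw_files,
        check=True,
    )


if __name__ == "__main__":
    merge_profraw("/path/to/nvdla/hw")  # hypothetical location of the nvdla/hw tree
```
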
13 changes: 5 additions & 8 deletions bsc-util/nvdla_utilities/caffe2trace.py
@@ -168,7 +168,7 @@ def parse_mixed_type_trace(rd_wr_trace_file):
ax.set_yticklabels(ylabels)
ax.legend()
plt.tight_layout()
fig.savefig(os.path.join(os.path.dirname(rd_wr_trace_file), "VP_mem_rd_wr.png"), dpi=240)
fig.savefig(os.path.join(os.path.dirname(rd_wr_trace_file), rd_wr_trace_file + ".png"), dpi=240)


def process_log(options):
@@ -177,18 +177,15 @@ def process_log(options):
os.system("cd /usr/local/nvdla && mv qemu_log " + options.out_dir)

nvdla_utilities_dir = os.path.dirname(os.path.abspath(__file__))
workload = Workload(options.out_dir, True, options.true_data, options.dump_results)
workload = Workload(options.out_dir, in_compilation=True, use_real_data=options.true_data,
dump_results=options.dump_results)
assert os.path.exists(os.path.join(options.out_dir, "VP_mem_rd_wr"))
# rtl_mem_rd_wr is generated during the remapping phase
parse_mixed_type_trace(os.path.join(options.out_dir, "VP_mem_rd_wr"))
os.system("cd " + nvdla_utilities_dir + " && python3.6 fix_txn_discontinuous.py --vp-out-dir " + options.out_dir +
" --name try_input")
os.system("cd " + options.out_dir + " && mv input.txn bkp_input.txn")
os.system("cd " + options.out_dir + " && mv try_input.txn input.txn")
workload.write_rd_only_var_log(os.path.join(options.out_dir, "rd_only_var_log"))

# the nvdla/vp docker image has perl v5.22.1 installed, ok
perl_script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "input_txn_to_verilator.pl")
os.system("perl " + perl_script_path + " " + os.path.join(options.out_dir, "input.txn") + " " +
os.path.join(options.out_dir, "trace.bin"))
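# Key artifacts left in options.out_dir after this step: qemu_log (moved from
# /usr/local/nvdla), the VP_mem_rd_wr trace and its plot, the fixed-up input.txn
# (the original is kept as bkp_input.txn), and trace.bin for the Verilator model.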


def main():
142 changes: 82 additions & 60 deletions bsc-util/nvdla_utilities/match_reg_trace_addr/parse_qemu_log.py
@@ -38,7 +38,6 @@ def __init__(self, tsd, tb, addr_id, offset, addr_base_map):
self.addr_id = addr_id
self.offset = offset

self.attr = None # weight, activation, unknown
self.addr = None # offset 0xc0000000 by default
self.size = None # will only be correct for input/output/weight tensor surfaces

@@ -82,7 +81,7 @@ def __init__(self):


class Workload:
def __init__(self, in_dir, to_convert=False, use_real_data=False, dump_results=False, axi_width=0x40):
def __init__(self, in_dir, in_compilation=False, use_real_data=False, dump_results=False, axi_width=0x40):
self.in_dir = in_dir # each workload corresponds to a directory of log files
self.tb = {} # tensor buffers = {tb_name: TensorBuffer}
self.ts = {} # tensor surfaces = {ts_name: TensorSurface}
@@ -104,7 +103,7 @@ def __init__(self, in_dir, to_convert=False, use_real_data=False, dump_results=F
self.raw_addr_log = None # = [['r' or 'w', addr], ...]

self.txn_lines = [] # for self.sclog2traces() to store translated contents of input.txn
self.to_convert = to_convert # whether to convert input.txn, mem traces from sc.log
self.in_compilation = in_compilation # constructor called in compilation or remapping phase
self.use_real_data = use_real_data # (does not contain dump_mem instructions)
self.dump_results = dump_results # whether to dump results for checking correctness

@@ -114,39 +113,58 @@ def __init__(self, in_dir, to_convert=False, use_real_data=False, dump_results=F
assert self.axi_width == 0x40 or self.axi_width == 0x20

with open(os.path.join(self.in_dir, "qemu_log")) as fp:
lines = fp.readlines()
self.addr_base_map = get_addr_mapping(lines)
qemu_log_lines = fp.readlines()
self.addr_base_map = get_addr_mapping(qemu_log_lines)

"""acquire self.ts and self.tb"""
self.read_compile_log(self.addr_base_map)
self.read_compile_log(self.addr_base_map, qemu_log_lines)

# remove redundant ts and tb
to_pop = []
for ts_name, ts in self.ts.items():
if ts.addr is None:
to_pop.append(ts_name)
for pop_item in to_pop:
self.ts.pop(pop_item)
to_pop = []
for tb_name, tb in self.tb.items():
if tb.addr is None:
to_pop.append(tb_name)
for pop_item in to_pop:
self.tb.pop(pop_item)

mem_trace_path = os.path.join(self.in_dir, "VP_mem_rd_wr")
if self.to_convert: # memory trace files and input.txn not yet generated
self.sclog2traces() # not yet prepended load_mem & dump instructions

"""acquire size info of tsd (only for input/out/weight are correct)"""
surfaces, data = construct_surfaces(lines)
for ts_name, ts in self.ts.items():
desc = (ts.addr_id, ts.offset)
ts.size = data[desc].size
if self.in_compilation: # memory trace files and input.txn not yet generated
self.sclog2traces() # not yet prepended load_mem & dump instructions

"""construct self.in_tb, self.out_tb, self.w_tb, self.act_tb, self.itm_act_tb"""
mem_trace_path = os.path.join(self.in_dir, "VP_mem_rd_wr" if self.in_compilation else "rtl_mem_rd_wr")
if not self.in_compilation:
# check validity of rtl_mem_rd_wr and nvdla_cpp.log
nvdla_cpp_log = os.path.join(self.in_dir, "nvdla_cpp.log")
legal = (os.path.exists(nvdla_cpp_log) and os.path.exists(mem_trace_path) and
os.stat(nvdla_cpp_log).st_size != 0 and os.stat(mem_trace_path).st_size != 0)
log_tail_lines = "".join(os.popen("tail -n 3 " + nvdla_cpp_log).readlines()) if legal else ""
legal = legal and ("done at" in log_tail_lines) and ("PASS" in log_tail_lines or "FAIL" in log_tail_lines)
if not legal:
assert os.getenv("VERILATOR_ROOT") is not None # must install verilator for simulation
bin_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)),
"../../../ext/rtl/model_nvdla"))
os.system("cd " + bin_dir + " && make VNV_nvdla && ./VNV_nvdla " +
os.path.join(self.in_dir, "trace.bin") + " > " + nvdla_cpp_log)

with open(os.path.join(self.in_dir, "nvdla_cpp.log")) as fp:
nvdla_cpp_log_lines = fp.readlines()
rd_log_lines, wr_log_lines, rw_log_lines = [], [], []
for line in nvdla_cpp_log_lines:
rw_m = re.search("([a-z]+) request from dla, addr ([0-9a-zA-Z]+)", line)
if rw_m is not None:
is_read = (rw_m.group(1) == "read")
addr = int(rw_m.group(2), 16)
if is_read:
burst_match = re.search("burst ([0-9]+)", line)
assert burst_match is not None
burst_len = int(burst_match.group(1)) + 1
for burst_id in range(burst_len):
this_addr = addr + burst_id * self.axi_width
rd_log_lines.append(hex(this_addr) + "\n")
rw_log_lines.append("r " + hex(this_addr) + "\n")
else:
wr_log_lines.append(hex(addr) + "\n")
rw_log_lines.append("w " + hex(addr) + "\n")
with open(os.path.join(self.in_dir, "rtl_mem_rd_wr"), "w") as fp:
fp.writelines(rw_log_lines)
with open(os.path.join(self.in_dir, "rtl_mem_rd"), "w") as fp:
fp.writelines(rd_log_lines)
with open(os.path.join(self.in_dir, "rtl_mem_wr"), "w") as fp:
fp.writelines(wr_log_lines)
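# The generated rtl_mem_* files mirror the VP_mem_* trace format: rtl_mem_rd_wr holds one
# "r 0x..."/"w 0x..." entry per AXI beat, while rtl_mem_rd and rtl_mem_wr hold bare addresses.
# A read reported with burst value N is expanded into N + 1 beats spaced axi_width apart,
# e.g. burst 3 at 0xc0000000 with axi_width 0x40 yields reads at 0xc0000000, 0xc0000040,
# 0xc0000080 and 0xc00000c0.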
self.addr_log, self.sorted_addr, self.raw_addr_log = parse_rd_wr_trace(mem_trace_path)
self.get_various_tensor_buffers()
self.get_various_tensor_buffers() # in VP_mem_rd_wr, only the life cycles are inaccurate
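# When the VP trace is used (compilation phase), classifying the tensor buffers is still
# reliable; per the note above, only the inferred life cycles are inaccurate, which is why
# the remapping phase parses rtl_mem_rd_wr for the liveness analysis below instead.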

if len(self.in_tb) > 1:
print("Critical situation: len(self.in_tb) > 1")
@@ -164,7 +182,7 @@ def __init__(self, in_dir, to_convert=False, use_real_data=False, dump_results=F
if 'r' in rw_and_addr[0] and rw_and_addr[1] in rd_only_addr2tb_name.keys():
self.rd_only_tbs.append(rd_only_addr2tb_name[rw_and_addr[1]])

if self.to_convert and self.use_real_data:
if self.in_compilation and self.use_real_data:
# care about self.w_tb and self.in_tb
memory = {} # {axi-aligned addr, [uint32_t]}
to_prepend_txn_lines = []
@@ -182,15 +200,6 @@ def __init__(self, in_dir, to_convert=False, use_real_data=False, dump_results=F
uint32_ts = info_match.group(3).replace("X", "0").replace("_", "").split()
contents = [int(uint32_t, 16) for uint32_t in uint32_ts]

'''
is_write_match = re.search("iswrite=([0-9]+)", line)
assert is_write_match is not None
is_write = int(is_write_match.group(1))
# only intermediate activations will be overwritten. We don't need their values in input.txn
if is_write == 0 and (addr in memory and memory[addr] != contents):
print("inconsistent memory access result!\nPrevious:\n", memory[addr], "\nNow:\n", contents)
'''
memory[addr] = contents

to_get_tb = self.w_tb + self.in_tb + self.out_tb if self.dump_results else self.w_tb + self.in_tb
@@ -236,19 +245,19 @@ def __init__(self, in_dir, to_convert=False, use_real_data=False, dump_results=F
" " + file_name + "\t#actual_len = " + hex(tb.size) + "\n")
self.txn_lines = to_prepend_txn_lines + self.txn_lines + to_append_txn_lines

if self.to_convert:
if self.in_compilation:
with open(os.path.join(self.in_dir, "input.txn"), "w") as fp:
fp.writelines(self.txn_lines)

"""do liveness analysis for each intermediate activation TensorBuffer object"""
for tb_name in self.itm_act_tb:
tb = self.tb[tb_name]
first = self.get_to_query_addr(tb)
last = last_aligned(tb.addr, tb.size, self.axi_width)
assert first in self.addr_log
assert last in self.addr_log
tb.liveness = (self.addr_log[first][0][0], self.addr_log[last][-1][0])
tb.num_access = len(self.addr_log[last])
else:
"""do liveness analysis for each intermediate activation TensorBuffer object"""
for tb_name in self.itm_act_tb:
tb = self.tb[tb_name]
first = self.get_to_query_addr(tb)
last = last_aligned(tb.addr, tb.size, self.axi_width)
assert first in self.addr_log
assert last in self.addr_log
tb.liveness = (self.addr_log[first][0][0], self.addr_log[last][-1][0])
tb.num_access = len(self.addr_log[last])

def get_to_query_addr(self, ts_or_tb): # it accepts either tensor surface or tensor buffer
if (ts_or_tb.addr // self.axi_width) * self.axi_width == ts_or_tb.addr:
@@ -260,15 +269,8 @@ def get_to_query_addr(self, ts_or_tb):  # it accepts either tensor surface o
to_query_addr = (ts_or_tb.addr // self.axi_width) * self.axi_width
return to_query_addr

def write_rd_only_var_log(self, rd_var_log_path):
with open(rd_var_log_path, "wb") as fp:
for rd_only_var in self.rd_only_tbs:
tb = self.tb[rd_only_var]
fp.write(tb.addr.to_bytes(4, byteorder="little", signed=False))
fp.write(tb.size.to_bytes(4, byteorder="little", signed=False))

# read compile_log
def read_compile_log(self, addr_base_map):
def read_compile_log(self, addr_base_map, qemu_log_lines):
with open(os.path.join(self.in_dir, "compile_log")) as fp:
lines = fp.readlines()

@@ -294,6 +296,26 @@ def read_compile_log(self, addr_base_map):
assert prev_tb.size == tb_size
prev_tb.add_tensor_surface(tsd, new_ts, addr_base_map)

# remove redundant ts and tb
to_pop = []
for ts_name, ts in self.ts.items():
if ts.addr is None:
to_pop.append(ts_name)
for pop_item in to_pop:
self.ts.pop(pop_item)
to_pop = []
for tb_name, tb in self.tb.items():
if tb.addr is None:
to_pop.append(tb_name)
for pop_item in to_pop:
self.tb.pop(pop_item)

"""acquire size info of tsd (only for input/out/weight are correct)"""
surfaces, data = construct_surfaces(qemu_log_lines)
for ts_name, ts in self.ts.items():
desc = (ts.addr_id, ts.offset)
ts.size = data[desc].size

def sclog2traces(self):
txn_lines = []
rd_lines = []
@@ -370,7 +392,7 @@ def sclog2traces(self):
rd_lines.append(hex(axi_addr) + "\n")
rw_lines.append("r " + hex(axi_addr) + "\n")

self.txn_lines = txn_lines
self.txn_lines = txn_lines # don't write to files at this point. Write after getting load|dump_mem insts.
with open(os.path.join(self.in_dir, "VP_mem_rd_wr"), "w") as fp:
fp.writelines(rw_lines)
with open(os.path.join(self.in_dir, "VP_mem_rd"), "w") as fp:
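
The liveness analysis in `Workload.__init__` above boils down to: parse the memory trace into a per-address access log, then take the sequence index of the first access to a buffer's first AXI-aligned beat and of the last access to its last aligned beat as that buffer's live range. The toy sketch below illustrates the idea on a synthetic trace; the helper names and record layout are simplifications for illustration, not the repository's `parse_rd_wr_trace` / `TensorBuffer` API.

```python
# Toy illustration of the liveness computation above. Each trace line is one AXI
# beat ("r 0x..." or "w 0x..."); a buffer lives from the first access to its first
# aligned beat until the last access to its last aligned beat. Helper names and the
# record layout are assumptions for this sketch, not the actual repository API.
from collections import defaultdict

AXI_WIDTH = 0x40


def parse_trace(lines):
    """Map each AXI-aligned address to a list of (sequence_index, 'r' or 'w') records."""
    addr_log = defaultdict(list)
    for seq, line in enumerate(lines):
        rw, addr = line.split()
        addr_log[int(addr, 16)].append((seq, rw))
    return addr_log


def liveness(addr_log, buf_addr, buf_size):
    """Return (first access index of the first beat, last access index of the last beat)."""
    first_beat = (buf_addr // AXI_WIDTH) * AXI_WIDTH
    last_beat = ((buf_addr + buf_size - 1) // AXI_WIDTH) * AXI_WIDTH
    return addr_log[first_beat][0][0], addr_log[last_beat][-1][0]


if __name__ == "__main__":
    trace = [
        "w 0xc0000000", "w 0xc0000040",  # buffer produced
        "r 0xc0001000",                  # unrelated access
        "r 0xc0000000", "r 0xc0000040",  # buffer consumed
    ]
    log = parse_trace(trace)
    print(liveness(log, 0xc0000000, 0x80))  # -> (0, 4)
```
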
1 change: 1 addition & 0 deletions bsc-util/nvdla_utilities/match_reg_trace_addr/remap.py
@@ -701,6 +701,7 @@ def collect_gurobi_results(workload, cvsram_size, gurobi_out_path, gurobi_in_pat
plt.title("Buffer Allocation Result on CVSRAM size = 0x%x Bytes" % cvsram_size)
plt.xlabel("Logical Order")
plt.ylabel("CVSRAM Address")
plt.rcParams.update({'font.size': 22})
plt.tight_layout()
occ_fig.savefig(gurobi_out_path + "_vis.png", dpi=400)

(Diffs for the remaining changed files were not loaded in this view.)
