Skip to content

Commit

Permalink
top-down: align top-down with Gem5 (OpenXiangShan#2085)
Browse files Browse the repository at this point in the history
* topdown: add defines of topdown counters enum

* redirect: add redirect type for perf

* top-down: add stallReason IOs

frontend -> ctrlBlock -> decode -> rename -> dispatch

* top-down: add dummy connections

* top-down: update TopdownCounters

* top-down: imp backend analysis and counter dump

* top-down: add HartId in `addSource`

* top-down: broadcast lqIdx of ROB head

* top-down: frontend signal done

* top-down: add memblock topdown interface

* Bump HuanCun: add TopDownMonitor

* top-down: receive and handle reasons in dispatch

* top-down: remove previous top-down code

* TopDown: add MemReqSource enum

* TopDown: extend mshr_latency range

* TopDown: add basic Req Source

TODO: distinguish prefetch

* dcache: distinguish L1DataPrefetch and CPUData

* top-down: comment out debugging perf counters in ibuffer

* TopDown: add path to pass MemReqSource to HuanCun

* TopDown: use simpler logic to count reqSource and update Probe count

* frontend: update topdown counters

* Update HuanCun Topdown for MemReqSource

* top-down: fix load stalls

* top-down: Change the priority of different stall reasons

* top-down: breakdown OtherCoreStall

* sbuffer: fix eviction

* when valid count reaches StoreBufferSize, do eviction

* sbuffer: fix replaceIdx

* If the way selected by the replacement algorithm cannot be written into dcache, its result is not used.

* dcache, ldu: fix vaddr in missqueue

This commit prevents the high bits of the virtual address from being truncated

* fix-ldst_pri-230506

* mainpipe: fix loadsAreComing

* top-down: disable dedup

* top-down: remove old top-down config

* top-down: split lq addr from ls_debug

* top-down: purge previous top-down code

* top-down: add debug_vaddr in LoadQueueReplay

* add source rob_head_other_replay

* remove load_l1_cache_stall_with/without_bank_conflict

* dcache: split CPUData & refill latency

* split CPUData to CPUStoreData & CPULoadData & CPUAtomicData
* monitor refill latency for all types of requests

* dcache: fix perfcounter in mq

* io.req.bits.cancel should be applied when counting req.fire

* TopDown: add TopDown for CPL2 in XiangShan

* top-down: add hartid params to L2Cache

* top-down: fix dispatch queue bound

* top-down: no DqStall when robFull

* topdown: buspmu support latency statistic (OpenXiangShan#2106)

* perf: add buspmu between L2 and L3, support name argument

* bump difftest

* perf: busmonitor supports latency stat

* config: fix cpl2 compatible problem

* bump utility

* bump coupledL2

* bump huancun

* misc: adapt to utility key&field

* config: fix key&field source, remove deprecated argument

* buspmu: remove debug print

* bump coupledl2&huancun

* top-down: fix sq full condition

* top-down: classify "lq full" load bound

* top-down: bump submodules

* bump coupledL2: fix reqSource in data path

* bump coupledL2

---------

Co-authored-by: tastynoob <[email protected]>
Co-authored-by: Guokai Chen <[email protected]>
Co-authored-by: lixin <[email protected]>
Co-authored-by: XiChen <[email protected]>
Co-authored-by: Zhou Yaoyang <[email protected]>
Co-authored-by: Lyn <[email protected]>
Co-authored-by: wakafa <[email protected]>
  • Loading branch information
8 people authored Jun 2, 2023
1 parent b9e121d commit d2b20d1
Show file tree
Hide file tree
Showing 49 changed files with 1,195 additions and 245 deletions.
11 changes: 0 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,6 @@ else
override SIM_ARGS += --with-constantin
endif

# top-down
ifeq ($(CONFIG),DefaultConfig)
ENABLE_TOPDOWN ?= 1
endif
ifneq ($(NUM_CORES),1)
ENABLE_TOPDOWN = 0
endif
ifeq ($(ENABLE_TOPDOWN),1)
override SIM_ARGS += --enable-topdown
endif

# emu for the release version
RELEASE_ARGS = --disable-all --remove-assert --fpga-platform
DEBUG_ARGS = --enable-difftest
Expand Down
2 changes: 1 addition & 1 deletion coupledL2
Submodule coupledL2 updated 0 files
14 changes: 8 additions & 6 deletions src/main/scala/system/SoC.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import freechips.rocketchip.devices.tilelink.{CLINT, CLINTParams, DevNullParams,
import freechips.rocketchip.diplomacy.{AddressSet, IdRange, InModuleBody, LazyModule, LazyModuleImp, MemoryDevice, RegionType, SimpleDevice, TransferSizes}
import freechips.rocketchip.interrupts.{IntSourceNode, IntSourcePortSimple}
import freechips.rocketchip.regmapper.{RegField, RegFieldAccessType, RegFieldDesc, RegFieldGroup}
import utility.{BinaryArbiter, TLEdgeBuffer}
import utility.{BinaryArbiter, TLClientsMerger, TLEdgeBuffer}
import xiangshan.{DebugOptionsKey, HasXSParameter, XSBundle, XSCore, XSCoreParameters, XSTileKey}
import freechips.rocketchip.amba.axi4._
import freechips.rocketchip.tilelink._
Expand All @@ -42,7 +42,7 @@ case class SoCParameters
extIntrs: Int = 64,
L3NBanks: Int = 4,
L3CacheParamsOpt: Option[HCCacheParameters] = Some(HCCacheParameters(
name = "l3",
name = "L3",
level = 3,
ways = 8,
sets = 2048 // 1MB per bank
Expand Down Expand Up @@ -148,10 +148,13 @@ trait HaveAXI4MemPort {
))

val mem_xbar = TLXbar()
val l3_mem_pmu = BusPerfMonitor(name = "L3_Mem", enable = !debugOpts.FPGAPlatform, stat_latency = true, add_reqkey = true)
mem_xbar :=*
TLBuffer.chainNode(2) :=
TLCacheCork() :=
l3_mem_pmu :=
TLClientsMerger() :=
TLXbar() :=*
TLBuffer.chainNode(2) :=*
TLCacheCork() :=*
bankedNode

mem_xbar :=
Expand Down Expand Up @@ -232,10 +235,9 @@ class SoCMisc()(implicit p: Parameters) extends BaseSoC

val l3_in = TLTempNode()
val l3_out = TLTempNode()
val l3_mem_pmu = BusPerfMonitor(enable = !debugOpts.FPGAPlatform)

l3_in :*= TLEdgeBuffer(_ => true, Some("L3_in_buffer")) :*= l3_banked_xbar
bankedNode :*= TLLogger("MEM_L3", !debugOpts.FPGAPlatform) :*= l3_mem_pmu :*= l3_out
bankedNode :*= TLLogger("MEM_L3", !debugOpts.FPGAPlatform) :*= l3_out

if(soc.L3CacheParamsOpt.isEmpty){
l3_out :*= l3_in
Expand Down
4 changes: 0 additions & 4 deletions src/main/scala/top/ArgParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,6 @@ object ArgParser {
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(EnablePerfDebug = false)
}), tail)
case "--enable-topdown" :: tail =>
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(EnableTopDown = true)
}), tail)
case "--mfc" :: tail =>
firrtlCompiler = MFC
nextOption(config, tail)
Expand Down
123 changes: 109 additions & 14 deletions src/main/scala/top/BusPerfMonitor.scala
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package top

import chipsalliance.rocketchip.config.Parameters
Expand All @@ -6,13 +22,28 @@ import freechips.rocketchip.tilelink._
import chisel3._
import chisel3.util._
import utils.{XSPerfAccumulate, XSPerfPrint}
import freechips.rocketchip.tilelink.TLMessages._
import freechips.rocketchip.tilelink.TLPermissions._
import utility.{ReqSourceField, ReqSourceKey, GTimer}
import xiangshan.MemReqSource

class BusPerfMonitor()(implicit p: Parameters) extends LazyModule {
val node = TLAdapterNode()
lazy val module = new BusPerfMonitorImp(this)
class BusPerfMonitor(name: String, stat_latency: Boolean, add_reqkey: Boolean)(implicit p: Parameters) extends LazyModule {
val node = if (add_reqkey) TLAdapterNode(managerFn = { m =>
TLSlavePortParameters.v1(
m.managers.map { m =>
m.v2copy()
},
requestKeys = Seq(ReqSourceKey),
beatBytes = 32,
endSinkId = m.endSinkId
)
}) else {
TLAdapterNode()
}
lazy val module = new BusPerfMonitorImp(this, name, stat_latency)
}

class BusPerfMonitorImp(outer: BusPerfMonitor)
class BusPerfMonitorImp(outer: BusPerfMonitor, name: String, stat_latency: Boolean)
extends LazyModuleImp(outer)
{

Expand All @@ -24,7 +55,7 @@ class BusPerfMonitorImp(outer: BusPerfMonitor)
def PERF_CHN[T <: TLChannel](clientName: String, chn: DecoupledIO[T]) = {

val channelName = chn.bits.channelName.replaceAll(" ", "_").replaceAll("'", "")
XSPerfAccumulate(s"${clientName}_${channelName}_fire", chn.fire())
XSPerfAccumulate(s"${clientName}_${channelName}_fire", chn.fire)
XSPerfAccumulate(s"${clientName}_${channelName}_stall", chn.valid && !chn.ready)

val ops = chn.bits match {
Expand All @@ -40,28 +71,28 @@ class BusPerfMonitorImp(outer: BusPerfMonitor)
chn.bits match {
case a: TLBundleA =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === a.opcode && chn.fire()
i.U === a.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === a.opcode && chn.valid && !chn.ready
)
case b: TLBundleB =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === b.opcode && chn.fire()
i.U === b.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === b.opcode && chn.valid && !chn.ready
)
case c: TLBundleC =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === c.opcode && chn.fire()
i.U === c.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === c.opcode && chn.valid && !chn.ready
)
case d: TLBundleD =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === d.opcode && chn.fire()
i.U === d.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === d.opcode && chn.valid && !chn.ready
Expand All @@ -70,22 +101,86 @@ class BusPerfMonitorImp(outer: BusPerfMonitor)
}
}

for(((in, edgeIn), i) <- outer.node.in.zipWithIndex) {
val clientName = s"${edgeIn.master.masters.head.name}_bank_$i"
for (((in, edgeIn), i) <- outer.node.in.zipWithIndex) {
val clientName = s"${name}_${edgeIn.master.masters.head.name}_bank_$i"
PERF_CHN(clientName, in.a)
PERF_CHN(clientName, in.d)
if(in.params.hasBCE){
if (in.params.hasBCE) {
PERF_CHN(clientName, in.b)
PERF_CHN(clientName, in.c)
PERF_CHN(clientName, in.e)
}
}

// Request-to-response latency statistics, elaborated only when `stat_latency` is set.
// A request is time-stamped when it fires on channel A and resolved by the first beat
// of its response on channel D; the two are paired through the TileLink source ID.
if (stat_latency) {
  // For simplicity, latency statistic works between nodes with a SINGLE edge.
  // This is checked before `.head` is touched, so a mis-wired node is reported by
  // this requirement instead of by a NoSuchElementException.
  val nrEdge = outer.node.in.length
  require(nrEdge == 1, s"BusPerfMonitor($name): latency statistics need exactly 1 edge, got $nrEdge")

  // One in-flight record per source ID.
  class RecordEntry()(implicit p: Parameters) extends Bundle {
    val valid = Bool()          // a request with this source ID is outstanding
    val timeStamp = UInt(64.W)  // value of `timer` when the request fired on channel A
    val reqType = UInt(8.W)     // ReqSourceKey user field of the request (NoWhere if absent)
  }

  val timer = GTimer()
  // Number of distinct source IDs on the edge: 2 ^ sourceBits.
  val nrSource = 1 << outer.node.in.head._2.bundle.sourceBits
  val latencyRecord = RegInit(VecInit(Seq.fill(nrSource)(0.U.asTypeOf(new RecordEntry()))))
  // Running totals kept in registers. They are not read by the perf counters below;
  // presumably they exist for waveform inspection -- TODO confirm.
  val latencySum = RegInit(0.U(128.W))
  val nrRecord = RegInit(0.U(128.W))

  outer.node.in.foreach { case (in, edgeIn) =>
    val channelA = in.a
    // Hint and Put* requests are not tracked: their responses (HintAck / AccessAck)
    // are filtered out on channel D below.
    val recordReq = channelA.fire &&
      channelA.bits.opcode =/= Hint &&
      channelA.bits.opcode =/= PutFullData &&
      channelA.bits.opcode =/= PutPartialData
    when(recordReq) {
      // Valid channel A fire, record it. The source ID must not already be in flight.
      assert(latencyRecord(channelA.bits.source).valid === false.B)
      latencyRecord(channelA.bits.source).valid := true.B
      latencyRecord(channelA.bits.source).timeStamp := timer
      latencyRecord(channelA.bits.source).reqType :=
        channelA.bits.user.lift(ReqSourceKey).getOrElse(MemReqSource.NoWhere.id.U)
    }

    val channelD = in.d
    // Only the first beat of a (possibly multi-beat) response resolves a record.
    val (first, _, _, _) = edgeIn.count(channelD)
    // Valid channel D fire, resolve it. Responses to requests that were never recorded
    // are skipped: ReleaseAck answers a channel C Release, AccessAck answers Put*, and
    // HintAck answers Hint. Without the HintAck filter a Hint response would trip the
    // assertion below and contribute a bogus latency.
    val resolveRecord = channelD.fire && first &&
      channelD.bits.opcode =/= ReleaseAck &&
      channelD.bits.opcode =/= AccessAck &&
      channelD.bits.opcode =/= HintAck
    val latency = WireInit(0.U(64.W))
    when(resolveRecord) {
      assert(latencyRecord(channelD.bits.source).valid === true.B)
      latencyRecord(channelD.bits.source).valid := false.B
      latency := timer - latencyRecord(channelD.bits.source).timeStamp
      // Accumulate the measured latency, not the absolute timestamp.
      latencySum := latencySum + latency
      nrRecord := nrRecord + 1.U
      // printf("timer: %x\n", latency)
    }
    XSPerfAccumulate(name + "_nrRecord_all", resolveRecord)
    XSPerfAccumulate(name + "_latencySum_all", Mux(resolveRecord, latency, 0.U))

    // Per-request-source breakdown, keyed by the reqType stored when the request fired.
    for (j <- 0 until MemReqSource.ReqSourceCount.id) {
      val typeMatch = latencyRecord(channelD.bits.source).reqType === j.U
      XSPerfAccumulate(name + s"_nrRecord_type${j}", resolveRecord && typeMatch)
      XSPerfAccumulate(name + s"_latencySum_type${j}", Mux(resolveRecord && typeMatch, latency, 0.U))
    }
  }
}

}

object BusPerfMonitor {
def apply(enable: Boolean = false)(implicit p: Parameters) = {
def apply(
name: String,
enable: Boolean = false,
stat_latency: Boolean = false,
add_reqkey: Boolean = false)(implicit p: Parameters) =
{
if(enable){
val busPMU = LazyModule(new BusPerfMonitor())
val busPMU = LazyModule(new BusPerfMonitor(name, stat_latency, add_reqkey))
busPMU.node
} else {
TLTempNode()
Expand Down
11 changes: 6 additions & 5 deletions src/main/scala/top/Configs.scala
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,7 @@ class WithNKBL2
n: Int,
ways: Int = 8,
inclusive: Boolean = true,
banks: Int = 1,
alwaysReleaseData: Boolean = false
banks: Int = 1
) extends Config((site, here, up) => {
case XSTileKey =>
val upParams = up(XSTileKey)
Expand All @@ -264,6 +263,7 @@ class WithNKBL2
ways = p.dcacheParametersOpt.get.nWays + 2,
aliasBitsOpt = p.dcacheParametersOpt.get.aliasBitsOpt
)),
reqField = Seq(utility.ReqSourceField()),
echoField = Seq(huancun.DirtyField()),
prefetch = Some(coupledL2.prefetch.PrefetchReceiverParams())
)),
Expand Down Expand Up @@ -295,6 +295,7 @@ class WithNKBL3(n: Int, ways: Int = 8, inclusive: Boolean = true, banks: Int = 1
address = 0x39000000,
numCores = tiles.size
)),
reqField = Seq(utility.ReqSourceField()),
sramClkDivBy2 = true,
sramDepthDiv = 4,
tagECC = Some("secded"),
Expand All @@ -318,21 +319,21 @@ class DefaultL3DebugConfig(n: Int = 1) extends Config(

class MinimalAliasDebugConfig(n: Int = 1) extends Config(
new WithNKBL3(512, inclusive = false) ++
new WithNKBL2(256, inclusive = false, alwaysReleaseData = true) ++
new WithNKBL2(256, inclusive = false) ++
new WithNKBL1D(128) ++
new MinimalConfig(n)
)

class MediumConfig(n: Int = 1) extends Config(
new WithNKBL3(4096, inclusive = false, banks = 4)
++ new WithNKBL2(512, inclusive = false, alwaysReleaseData = true)
++ new WithNKBL2(512, inclusive = false)
++ new WithNKBL1D(128)
++ new BaseConfig(n)
)

class DefaultConfig(n: Int = 1) extends Config(
new WithNKBL3(6 * 1024, inclusive = false, banks = 4, ways = 6)
++ new WithNKBL2(2 * 512, inclusive = false, banks = 4, alwaysReleaseData = true)
++ new WithNKBL2(2 * 512, inclusive = false, banks = 4)
++ new WithNKBL1D(128)
++ new BaseConfig(n)
)
4 changes: 3 additions & 1 deletion src/main/scala/top/Top.scala
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class XSTop()(implicit p: Parameters) extends BaseXSSoc() with HasSoCParameter

val l3cacheOpt = soc.L3CacheParamsOpt.map(l3param =>
LazyModule(new HuanCun()(new Config((_, _, _) => {
case HCCacheParamsKey => l3param.copy(enableTopDown = debugOpts.EnableTopDown)
case HCCacheParamsKey => l3param.copy(hartIds = tiles.map(_.HartId))
})))
)

Expand Down Expand Up @@ -101,6 +101,8 @@ class XSTop()(implicit p: Parameters) extends BaseXSSoc() with HasSoCParameter
case Some(l3) =>
misc.l3_out :*= l3.node :*= TLBuffer.chainNode(2) :*= misc.l3_banked_xbar
case None =>
val dummyMatch = WireDefault(false.B)
tiles.map(_.HartId).foreach(hartId => ExcitingUtils.addSource(dummyMatch, s"L3MissMatch_${hartId}", ExcitingUtils.Perf, true))
}

lazy val module = new LazyRawModuleImp(this) {
Expand Down
12 changes: 11 additions & 1 deletion src/main/scala/xiangshan/Bundle.scala
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ class CfiUpdateInfo(implicit p: Parameters) extends XSBundle with HasBPUParamete
val histPtr = new CGHPtr
val specCnt = Vec(numBr, UInt(10.W))
// need pipeline update
val br_hit = Bool()
val br_hit = Bool() // if in ftb entry
val jr_hit = Bool() // if in ftb entry
val sc_hit = Bool() // if used in ftb entry, invalid if !br_hit
val predTaken = Bool()
val target = UInt(VAddrBits.W)
val taken = Bool()
Expand Down Expand Up @@ -301,6 +303,8 @@ class Redirect(implicit p: Parameters) extends XSBundle {
val stFtqOffset = UInt(log2Up(PredictWidth).W)

val debug_runahead_checkpoint_id = UInt(64.W)
val debugIsCtrl = Bool()
val debugIsMemVio = Bool()

// def isUnconditional() = RedirectLevel.isUnconditional(level)
def flushItself() = RedirectLevel.flushItself(level)
Expand Down Expand Up @@ -413,6 +417,7 @@ class MemRSFeedbackIO(implicit p: Parameters) extends XSBundle {
class FrontendToCtrlIO(implicit p: Parameters) extends XSBundle {
// to backend end
val cfVec = Vec(DecodeWidth, DecoupledIO(new CtrlFlow))
val stallReason = new StallReasonIO(DecodeWidth)
val fromFtq = new FtqToCtrlIO
// from backend
val toFtq = Flipped(new CtrlToFtqIO)
Expand Down Expand Up @@ -662,6 +667,11 @@ class MatchTriggerIO(implicit p: Parameters) extends XSBundle {
val tdata2 = Output(UInt(64.W))
}

/** Bundle carrying top-down stall reasons between pipeline stages.
  *
  * Both fields are `log2Ceil(TopDownCounters.NumStallReasons.id)` bits wide; presumably
  * `NumStallReasons` is the trailing entry of the enumeration, so this is just enough
  * bits to encode every real reason -- TODO confirm against `TopDownCounters`.
  *
  * @param width number of entries in `reason` (instantiated with `DecodeWidth` in
  *              `FrontendToCtrlIO`)
  */
class StallReasonIO(width: Int) extends Bundle {
// One stall-reason code per entry, driven by the side that owns this bundle.
val reason = Output(Vec(width, UInt(log2Ceil(TopDownCounters.NumStallReasons.id).W)))
// A single valid-qualified stall-reason code travelling in the opposite direction (Flipped).
val backReason = Flipped(Valid(UInt(log2Ceil(TopDownCounters.NumStallReasons.id).W)))
}

// custom l2 - l1 interface
class L2ToL1Hint(implicit p: Parameters) extends XSBundle with HasDCacheParameters {
val sourceId = UInt(log2Up(cfg.nMissEntries).W) // tilelink sourceID -> mshr id
Expand Down
3 changes: 1 addition & 2 deletions src/main/scala/xiangshan/Parameters.scala
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,7 @@ case class DebugOptions
EnableDebug: Boolean = false,
EnablePerfDebug: Boolean = true,
UseDRAMSim: Boolean = false,
EnableConstantin: Boolean = false,
EnableTopDown: Boolean = false
EnableConstantin: Boolean = false
)

trait HasXSParameter {
Expand Down
Loading

0 comments on commit d2b20d1

Please sign in to comment.