Skip to content

Commit

Permalink
MemBlock: enable 3ld3st (OpenXiangShan#2524)
Browse files Browse the repository at this point in the history
* enable 3ld3st

* assign enqLsq

* fix IssQueSize

* remove performance regression

* MMU: Fix ptwrepeater when 3ld + 3st

* fix minimal config params

* fix minimal config LoadQueueReplaySize

* add 3ld3st switch

* fix bank conflict valid logic

* fix strict memory ambiguous logic

* fix wakeup logic

* disable 3ld3st by default

* modify minimal config params

---------

Co-authored-by: Lyn <[email protected]>
Co-authored-by: good-circle <[email protected]>
  • Loading branch information
3 people authored Jan 2, 2024
1 parent 988fb9a commit ec86549
Show file tree
Hide file tree
Showing 14 changed files with 105 additions and 41 deletions.
4 changes: 3 additions & 1 deletion src/main/scala/top/Configs.scala
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class MinimalConfig(n: Int = 1) extends Config(
VirtualLoadQueueSize = 16,
LoadQueueRARSize = 16,
LoadQueueRAWSize = 12,
LoadQueueReplaySize = 8,
LoadQueueReplaySize = 12,
LoadUncacheBufferSize = 8,
LoadQueueNWriteBanks = 4, // NOTE: make sure that LoadQueue{RAR, RAW, Replay}Size is divided by LoadQueueNWriteBanks.
RollbackGroupSize = 8,
Expand All @@ -79,6 +79,8 @@ class MinimalConfig(n: Int = 1) extends Config(
IBufNBank = 2,
StoreBufferSize = 4,
StoreBufferThreshold = 3,
LoadPipelineWidth = 2,
StorePipelineWidth = 2,
dpParams = DispatchParameters(
IntDqSize = 12,
FpDqSize = 12,
Expand Down
4 changes: 4 additions & 0 deletions src/main/scala/xiangshan/Parameters.scala
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,10 @@ trait HasXSParameter {
val EnableAtCommitMissTrigger = coreParams.EnableAtCommitMissTrigger
val EnableStorePrefetchSMS = coreParams.EnableStorePrefetchSMS
val EnableStorePrefetchSPB = coreParams.EnableStorePrefetchSPB
require(LoadPipelineWidth == StorePipelineWidth, "LoadPipelineWidth must be equal StorePipelineWidth!")
require(LoadPipelineWidth == exuParameters.LduCnt, "LoadPipelineWidth must be equal exuParameters.LduCnt!")
require(StorePipelineWidth == exuParameters.StuCnt, "StorePipelineWidth must be equal exuParameters.StuCnt!")
val Enable3Load3Store = (LoadPipelineWidth == 3 && StorePipelineWidth == 3)
val asidLen = coreParams.MMUAsidLen
val BTLBWidth = coreParams.LoadPipelineWidth + coreParams.StorePipelineWidth
val refillBothTlb = coreParams.refillBothTlb
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/xiangshan/XSCore.scala
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ abstract class XSCoreBase()(implicit p: config.Parameters) extends LazyModule

val memBlock = LazyModule(new MemBlock()(p.alter((site, here, up) => {
case XSCoreParamsKey => up(XSCoreParamsKey).copy(
IssQueSize = IssQueSize * 2 // exuBlocks.head.scheduler.getMemRsEntries
IssQueSize = IssQueSize * (if (Enable3Load3Store) 3 else 2) // exuBlocks.head.scheduler.getMemRsEntries
)
})))

Expand Down
11 changes: 8 additions & 3 deletions src/main/scala/xiangshan/backend/Backend.scala
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class FakeMemBlockWbSourceImp(outer: FakeMemBlockWbSource) extends LazyModuleImp
}

// Merge CtrlBlock, exuBlocks, wbArbiter, wb2Ctrl, etc into 1 module
class Backend(memWbSource: HasWritebackSource)(implicit p: Parameters) extends LazyModule
class Backend(memWbSource: HasWritebackSource)(implicit p: Parameters) extends LazyModule
with HasXSParameter
with HasExuWbHelper
{
Expand Down Expand Up @@ -158,7 +158,7 @@ class BackendImp(outer: Backend)(implicit p: Parameters) extends LazyModuleImp(o
val loadFastImm = Vec(exuParameters.LduCnt, Output(UInt(12.W)))
val rsfeedback = Vec(exuParameters.LsExuCnt, Flipped(new MemRSFeedbackIO()(p.alter((site, here, up) => {
case XSCoreParamsKey => up(XSCoreParamsKey).copy(
IssQueSize = IssQueSize * 2
IssQueSize = IssQueSize * (if (Enable3Load3Store) 3 else 2)
)
}))))
val loadPc = Vec(exuParameters.LduCnt, Output(UInt(VAddrBits.W)))
Expand Down Expand Up @@ -337,7 +337,12 @@ class BackendImp(outer: Backend)(implicit p: Parameters) extends LazyModuleImp(o

ctrlBlock.perfinfo.perfEventsEu0 := exuBlocks(0).getPerf.dropRight(outer.exuBlocks(0).scheduler.numRs)
ctrlBlock.perfinfo.perfEventsEu1 := exuBlocks(1).getPerf.dropRight(outer.exuBlocks(1).scheduler.numRs)
ctrlBlock.perfinfo.perfEventsRs := outer.exuBlocks.flatMap(b => b.module.getPerf.takeRight(b.scheduler.numRs))

if (Enable3Load3Store) {
ctrlBlock.perfinfo.perfEventsRs := DontCare // outer.exuBlocks.flatMap(b => b.module.getPerf.takeRight(b.scheduler.numRs))
} else {
ctrlBlock.perfinfo.perfEventsRs := outer.exuBlocks.flatMap(b => b.module.getPerf.takeRight(b.scheduler.numRs))
}

csrioIn.hartId <> io.hartId

Expand Down
6 changes: 4 additions & 2 deletions src/main/scala/xiangshan/backend/MemBlock.scala
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
io.error.valid := false.B
}

println("Enable 3-load and 3-store: " + Enable3Load3Store)
val loadUnits = Seq.fill(exuParameters.LduCnt)(Module(new LoadUnit))
val storeUnits = Seq.fill(exuParameters.StuCnt)(Module(new StoreUnit))
val stdExeUnits = Seq.fill(exuParameters.StuCnt)(Module(new StdExeUnit))
Expand Down Expand Up @@ -498,9 +499,10 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val ptw_resp_v = RegNext(ptwio.resp.valid && !(sfence.valid && tlbcsr.satp.changed), init = false.B)
ptwio.resp.ready := true.B

val tlbreplay = WireInit(VecInit(Seq.fill(2)(false.B)))
val tlbReplayWidth = (if (Enable3Load3Store) 3 else 2)
val tlbreplay = WireInit(VecInit(Seq.fill(tlbReplayWidth)(false.B)))
dontTouch(tlbreplay)
for (i <- 0 until 2) {
for (i <- 0 until tlbReplayWidth) {
tlbreplay(i) := dtlb_ld(0).ptw.req(i).valid && ptw_resp_next.vector(0) && ptw_resp_v &&
ptw_resp_next.data.hit(dtlb_ld(0).ptw.req(i).bits.vpn, tlbcsr.satp.asid, allType = true, ignoreAsid = true)
}
Expand Down
6 changes: 6 additions & 0 deletions src/main/scala/xiangshan/backend/dispatch/Dispatch2Rs.scala
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,12 @@ class Dispatch2RsDistinctImp(outer: Dispatch2Rs)(implicit p: Parameters) extends
val blockLoads = isLoadArrays.map(PopCount(_) >= LoadPipelineWidth.U)
val blockStores = isStoreArrays.map(PopCount(_) >= StorePipelineWidth.U)

for (i <- 0 until enqLsq.req.length) {
enqLsq.needAlloc(i) := false.B
enqLsq.req(i).valid := false.B
enqLsq.req(i).bits := DontCare
}

for (i <- io.in.indices) {
is_blocked(i) := (
if (i >= LoadPipelineWidth) Mux(isStore(i), blockStores(i), blockLoads(i)) || is_blocked(i - 1)
Expand Down
15 changes: 11 additions & 4 deletions src/main/scala/xiangshan/backend/issue/ReservationStation.scala
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ class ReservationStationWrapper(implicit p: Parameters) extends LazyModule with
params.hasFeedback = true
params.checkWaitBit = false
}
if (cfg == LdExeUnitCfg) {
params.numDeq = LoadPipelineWidth
}
if (cfg == StaExeUnitCfg || cfg == StdExeUnitCfg) {
params.numDeq = StorePipelineWidth
}
if (cfg.hasCertainLatency) {
params.fixedLatency = if (cfg == MulDivExeUnitCfg) mulCfg.latency.latencyVal.get else cfg.latency.latencyVal.get
}
Expand Down Expand Up @@ -135,7 +141,7 @@ class ReservationStationWrapper(implicit p: Parameters) extends LazyModule with

override def toString: String = params.toString
// for better timing, we limit the number of dequeue ports per RS (2 by default, 4 when 3ld3st is enabled)
val maxRsDeq = 2
val maxRsDeq = (if (Enable3Load3Store) 4 else 2)
def numRS = (params.numDeq + (maxRsDeq - 1)) / maxRsDeq

class RSWrapperImp(wrapper: LazyModule) extends LazyModuleImp(wrapper) with HasPerfEvents {
Expand Down Expand Up @@ -410,9 +416,10 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
val numSelected = PopCount(s1_issuePtrOH.map(_.valid))
val numReadyEntries = PopCount(statusArray.io.canIssue)
val shouldSelected = Mux(numReadyEntries > params.numDeq.U, params.numDeq.U, numReadyEntries)
XSError(numSelected < shouldSelected,
p"performance regression: only $numSelected out of $shouldSelected selected (total: $numReadyEntries)\n")

if (!Enable3Load3Store) {
XSError(numSelected < shouldSelected,
p"performance regression: only $numSelected out of $shouldSelected selected (total: $numReadyEntries)\n")
}
// Allocation: store dispatch uops into payload and data array
s1_dispatchUops_dup.foreach(_.zip(enqReverse(io.fromDispatch)).zipWithIndex.foreach{ case ((uop, in), i) =>
val s0_valid = in.fire && !enqReverse(s0_enqFlushed)(i)
Expand Down
3 changes: 1 addition & 2 deletions src/main/scala/xiangshan/backend/rob/Rob.scala
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,6 @@ class ExceptionGen(implicit p: Parameters) extends XSModule with HasCircularQueu

def getOldest(valid: Seq[Bool], bits: Seq[RobExceptionInfo]): (Seq[Bool], Seq[RobExceptionInfo]) = {
assert(valid.length == bits.length)
assert(isPow2(valid.length))
if (valid.length == 1) {
(valid, bits)
} else if (valid.length == 2) {
Expand All @@ -324,7 +323,7 @@ class ExceptionGen(implicit p: Parameters) extends XSModule with HasCircularQueu
(Seq(oldest.valid), Seq(oldest.bits))
} else {
val left = getOldest(valid.take(valid.length / 2), bits.take(valid.length / 2))
val right = getOldest(valid.takeRight(valid.length / 2), bits.takeRight(valid.length / 2))
val right = getOldest(valid.takeRight(valid.length - valid.length / 2), bits.takeRight(valid.length - valid.length / 2))
getOldest(left._1 ++ right._1, left._2 ++ right._2)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
XSPerfAccumulate("load_using_replacement", io.replace_way.set.valid && s1_need_replacement)

// data read
io.banked_data_read.valid := s1_fire && !s1_nack
io.banked_data_read.valid := s1_fire && !s1_nack && !io.lsu.s1_kill
io.banked_data_read.bits.addr := s1_vaddr
io.banked_data_read.bits.way_en := s1_pred_tag_match_way_dup_dc
io.banked_data_read.bits.bankMask := s1_bank_oh
Expand Down
68 changes: 48 additions & 20 deletions src/main/scala/xiangshan/cache/mmu/Repeater.scala
Original file line number Diff line number Diff line change
Expand Up @@ -202,28 +202,56 @@ class PTWFilterEntry(Width: Int, Size: Int, hasHint: Boolean = false)(implicit p
io.memidx := 0.U.asTypeOf(new MemBlockidxBundle)

// ugly code, should be optimized later
require(Width <= 3, s"DTLB Filter Width ($Width) must equal or less than 3")
if (Width == 1) {
require(Size == 8, s"prefetch filter Size ($Size) should be 8")
canenq(0) := !(Cat(v).andR)
enqidx(0) := firstValidIndex(v, false.B)
} else if (Width == 2) {
require(Size == 8, s"store filter Size ($Size) should be 8")
canenq(0) := !(Cat(v.take(Size/2)).andR)
enqidx(0) := firstValidIndex(v.take(Size/2), false.B)
canenq(1) := !(Cat(v.drop(Size/2)).andR)
enqidx(1) := firstValidIndex(v.drop(Size/2), false.B) + (Size/2).U
} else if (Width == 3) {
require(Size == 16, s"load filter Size ($Size) should be 16")
canenq(0) := !(Cat(v.take(8)).andR)
enqidx(0) := firstValidIndex(v.take(8), false.B)
canenq(1) := !(Cat(v.drop(8).take(4)).andR)
enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
// four entries for prefetch
canenq(2) := !(Cat(v.drop(12)).andR)
enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
if (Enable3Load3Store) {
require(Width <= 4, s"DTLB Filter Width ($Width) must equal or less than 4")
if (Width == 1) {
require(Size == 8, s"prefetch filter Size ($Size) should be 8")
canenq(0) := !(Cat(v).andR)
enqidx(0) := firstValidIndex(v, false.B)
} else if (Width == 3) {
require(Size == 8, s"store filter Size ($Size) should be 8")
canenq(0) := !(Cat(v.take(3)).andR)
enqidx(0) := firstValidIndex(v.take(3), false.B)
canenq(1) := !(Cat(v.drop(3).take(3)).andR)
enqidx(1) := firstValidIndex(v.drop(3).take(3), false.B) + 3.U
canenq(2) := !(Cat(v.drop(6).take(2)).andR)
enqidx(2) := firstValidIndex(v.drop(6).take(2), false.B) + 6.U
} else if (Width == 4) {
require(Size == 16, s"load filter Size ($Size) should be 16")
canenq(0) := !(Cat(v.take(4)).andR)
enqidx(0) := firstValidIndex(v.take(4), false.B)
canenq(1) := !(Cat(v.drop(4).take(4)).andR)
enqidx(1) := firstValidIndex(v.drop(4).take(4), false.B) + 4.U
canenq(2) := !(Cat(v.drop(8).take(4)).andR)
enqidx(2) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
canenq(3) := !(Cat(v.drop(12).take(4)).andR)
enqidx(3) := firstValidIndex(v.drop(12).take(4), false.B) + 12.U
}
} else {
require(Width <= 3, s"DTLB Filter Width ($Width) must equal or less than 3")
if (Width == 1) {
require(Size == 8, s"prefetch filter Size ($Size) should be 8")
canenq(0) := !(Cat(v).andR)
enqidx(0) := firstValidIndex(v, false.B)
} else if (Width == 2) {
require(Size == 8, s"store filter Size ($Size) should be 8")
canenq(0) := !(Cat(v.take(Size/2)).andR)
enqidx(0) := firstValidIndex(v.take(Size/2), false.B)
canenq(1) := !(Cat(v.drop(Size/2)).andR)
enqidx(1) := firstValidIndex(v.drop(Size/2), false.B) + (Size/2).U
} else if (Width == 3) {
require(Size == 16, s"load filter Size ($Size) should be 16")
canenq(0) := !(Cat(v.take(8)).andR)
enqidx(0) := firstValidIndex(v.take(8), false.B)
canenq(1) := !(Cat(v.drop(8).take(4)).andR)
enqidx(1) := firstValidIndex(v.drop(8).take(4), false.B) + 8.U
// four entries for prefetch
canenq(2) := !(Cat(v.drop(12)).andR)
enqidx(2) := firstValidIndex(v.drop(12), false.B) + 12.U
}
}


for (i <- 0 until Width) {
enqvalid(i) := io.tlb.req(i).valid && !ptwResp_ReqMatchVec(i) && !entryIsMatchVec(i) && canenq(i)
when (!enqvalid(i)) {
Expand Down
8 changes: 6 additions & 2 deletions src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ class LoadQueueReplay(implicit p: Parameters) extends XSModule
val debug_vaddr = RegInit(VecInit(List.fill(LoadQueueReplaySize)(0.U(VAddrBits.W))))
val cause = RegInit(VecInit(List.fill(LoadQueueReplaySize)(0.U(LoadReplayCauses.allCauses.W))))
val blocking = RegInit(VecInit(List.fill(LoadQueueReplaySize)(false.B)))
val strict = RegInit(VecInit(List.fill(LoadQueueReplaySize)(false.B)))

// freelist: stores valid entry indices.
// +---+---+--------------+-----+-----+
Expand Down Expand Up @@ -277,8 +278,8 @@ class LoadQueueReplay(implicit p: Parameters) extends XSModule
for (i <- 0 until LoadQueueReplaySize) {
// dequeue
// FIXME: store*Ptr is not accurate
dataNotBlockVec(i) := !isBefore(io.stDataReadySqPtr, blockSqIdx(i)) || stDataReadyVec(blockSqIdx(i).value) || io.sqEmpty // for better timing
addrNotBlockVec(i) := !isBefore(io.stAddrReadySqPtr, blockSqIdx(i)) || stAddrReadyVec(blockSqIdx(i).value) || io.sqEmpty // for better timing
dataNotBlockVec(i) := isAfter(io.stDataReadySqPtr, blockSqIdx(i)) || stDataReadyVec(blockSqIdx(i).value) || io.sqEmpty // for better timing
addrNotBlockVec(i) := Mux(strict(i), isAfter(io.stAddrReadySqPtr, blockSqIdx(i)), stAddrReadyVec(blockSqIdx(i).value)) || io.sqEmpty // for better timing

// store address execute
storeAddrInSameCycleVec(i) := VecInit((0 until StorePipelineWidth).map(w => {
Expand Down Expand Up @@ -518,6 +519,7 @@ class LoadQueueReplay(implicit p: Parameters) extends XSModule
io.replay(i).bits.missDbUpdated := s2_missDbUpdated
io.replay(i).bits.forward_tlDchannel := s2_replayCauses(LoadReplayCauses.C_DM)
io.replay(i).bits.schedIndex := s2_oldestSel(i).bits
io.replay(i).bits.uop.cf.loadWaitStrict := false.B

when (io.replay(i).fire) {
XSError(!allocated(s2_oldestSel(i).bits), p"LoadQueueReplay: why replay an invalid entry ${s2_oldestSel(i).bits} ?")
Expand Down Expand Up @@ -593,6 +595,7 @@ class LoadQueueReplay(implicit p: Parameters) extends XSModule

// init
blocking(enqIndex) := true.B
strict(enqIndex) := false.B

// update blocking pointer
when (replayInfo.cause(LoadReplayCauses.C_BC) ||
Expand Down Expand Up @@ -620,6 +623,7 @@ class LoadQueueReplay(implicit p: Parameters) extends XSModule
// special case: st-ld violation
when (replayInfo.cause(LoadReplayCauses.C_MA)) {
blockSqIdx(enqIndex) := replayInfo.addr_inv_sq_idx
strict(enqIndex) := enq.bits.uop.cf.loadWaitStrict
}

// special case: data forward fail
Expand Down
3 changes: 2 additions & 1 deletion src/main/scala/xiangshan/mem/prefetch/SMSPrefetcher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1083,7 +1083,8 @@ class SMSTrainFilter()(implicit p: Parameters) extends XSModule with HasSMSModul

class SMSPrefetcher()(implicit p: Parameters) extends BasePrefecher with HasSMSModuleHelper with HasL1PrefetchSourceParameter {

require(exuParameters.LduCnt == 2)
val maxLduCnt = LoadPipelineWidth
require(exuParameters.LduCnt == maxLduCnt)

val io_agt_en = IO(Input(Bool()))
val io_stride_en = IO(Input(Bool()))
Expand Down
7 changes: 4 additions & 3 deletions src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,9 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
io.store_prefetch(i) <> prefetcher.io.prefetch_req(i)
}
}
if (Enable3Load3Store) {
io.store_prefetch(2) <> prefetcher.io.prefetch_req(2)
}
prefetcher.io.memSetPattenDetected := io.memSetPattenDetected

def wordReqToBufLine( // allocate a new line in sbuffer
Expand Down Expand Up @@ -502,9 +505,7 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
)
}

// for now, when enq, trigger a prefetch (if EnableAtCommitMissTrigger)
require(EnsbufferWidth == StorePipelineWidth)

require(Enable3Load3Store || (EnsbufferWidth == StorePipelineWidth))
// ---------------------- Send Dcache Req ---------------------

val sbuffer_empty = Cat(invalidMask).andR
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends
val prefetch_req = Vec(StorePipelineWidth, DecoupledIO(new StorePrefetchReq))
})

require(StorePipelineWidth == 2)
val maxStorePipelineWidth = (if (Enable3Load3Store) 3 else 2)
require(StorePipelineWidth == maxStorePipelineWidth)

val SIZE = BURST_ENGINE_SIZE

Expand Down Expand Up @@ -129,6 +130,10 @@ class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends
out_decouple(1).valid := deq_valid && data_next(PAGEOFFSET) === pg_bit && out_decouple(0).fire
out_decouple(1).bits := DontCare
out_decouple(1).bits.vaddr := data_next
if (Enable3Load3Store) {
out_decouple(2).valid := false.B
out_decouple(2).bits := DontCare
}
when(out_decouple(1).fire) {
// fired 2 prefetch reqs
data := data_next_next
Expand Down

0 comments on commit ec86549

Please sign in to comment.