xref: /XiangShan/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala (revision c3abb8b6b92c14ec0f3dbbac60a8caa531994a95)
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import xiangshan.cache._
import difftest._

class SbufferFlushBundle extends Bundle {
  val valid = Output(Bool())
  val empty = Input(Bool())
}
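
// Flush handshake, as seen from the flush requester: assert `valid` to ask the
// sbuffer to drain (the sbuffer instantiates this bundle Flipped, so `valid`
// is its input), then wait for `empty`, which the sbuffer raises once both it
// and the store queue have no remaining stores.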

trait HasSbufferConst extends HasXSParameter {
  val EvictCycles = 1 << 20
  val SbufferReplayDelayCycles = 16
  require(isPow2(EvictCycles))
  val EvictCountBits = log2Up(EvictCycles+1)
  val MissqReplayCountBits = log2Up(SbufferReplayDelayCycles) + 1

  val SbufferIndexWidth: Int = log2Up(StoreBufferSize)
  // paddr = ptag + offset
  val CacheLineBytes: Int = CacheLineSize / 8
  val CacheLineWords: Int = CacheLineBytes / DataBytes
  val OffsetWidth: Int = log2Up(CacheLineBytes)
  val WordsWidth: Int = log2Up(CacheLineWords)
  val PTagWidth: Int = PAddrBits - OffsetWidth
  val VTagWidth: Int = VAddrBits - OffsetWidth
  val WordOffsetWidth: Int = PAddrBits - WordsWidth
}
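
// Worked example of the derived constants, assuming one plausible XiangShan
// configuration (the concrete values come from HasXSParameter, not this trait):
// CacheLineSize = 512 bits and DataBytes = 8 give CacheLineBytes = 64,
// CacheLineWords = 8, OffsetWidth = 6 and WordsWidth = 3; with PAddrBits = 36,
// PTagWidth = 30. EvictCycles = 1 << 20 gives EvictCountBits = 21, so the
// eviction timer's MSB sets exactly when an entry has idled for 2^20 cycles.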

class SbufferEntryState (implicit p: Parameters) extends SbufferBundle {
  val state_valid    = Bool() // this entry is active
  val state_inflight = Bool() // sbuffer is trying to write this entry to dcache
  // val s_pipe_req = Bool() // scheduled dcache store pipeline req
  val w_pipe_resp = Bool() // waiting for dcache store pipeline resp
  val w_timeout = Bool() // waiting for resend store pipeline req timeout

  def isInvalid(): Bool = !state_valid
  def isValid(): Bool = state_valid
  def isActive(): Bool = state_valid && !state_inflight
  def isInflight(): Bool = state_inflight
}
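
// Entry lifecycle, in terms of the predicates above:
//   invalid  (!state_valid)                   -- free, can be allocated
//   active   (state_valid && !state_inflight) -- holds data, stores may merge
//   inflight (state_inflight)                 -- being written back to dcache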

class SbufferBundle(implicit p: Parameters) extends XSBundle with HasSbufferConst

class DataWriteReq(implicit p: Parameters) extends SbufferBundle {
  val idx = UInt(SbufferIndexWidth.W)
  val mask = UInt((DataBits/8).W)
  val data = UInt(DataBits.W)
  val wordOffset = UInt(WordOffsetWidth.W)
  val wline = Bool()
}

class SbufferData(implicit p: Parameters) extends XSModule with HasSbufferConst {
  val io = IO(new Bundle(){
    val writeReq = Vec(StorePipelineWidth, Flipped(ValidIO(new DataWriteReq)))
    val dataOut = Output(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
  })

  val data = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))

  val req = io.writeReq

  for(i <- 0 until StorePipelineWidth) {
    when(req(i).valid){
      for(word <- 0 until CacheLineWords){
        for(byte <- 0 until DataBytes){
          when(
            req(i).bits.mask(byte) && (req(i).bits.wordOffset(WordsWidth-1, 0) === word.U) ||
            req(i).bits.wline
          ){
            data(req(i).bits.idx)(word)(byte) := req(i).bits.data(byte*8+7, byte*8)
          }
        }
      }
    }
  }

  io.dataOut := data
}
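
// Write semantics of SbufferData: a byte of line `idx` is updated either when
// its mask bit is set and wordOffset selects its word, or unconditionally when
// wline is set, in which case the word-wide req.data is replicated into every
// word of the line. E.g. mask = 0x0f, wordOffset = 2, wline = false updates
// only bytes 0..3 of word 2.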

class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst {
  val io = IO(new Bundle() {
    val in = Vec(StorePipelineWidth, Flipped(Decoupled(new DCacheWordReqWithVaddr)))  // TODO: store logic only supports StorePipelineWidth == 2 for now
    val dcache = Flipped(new DCacheToSbufferIO)
    val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
    val sqempty = Input(Bool())
    val flush = Flipped(new SbufferFlushBundle)
    val csrCtrl = Flipped(new CustomCSRCtrlIO)
  })

  val dataModule = Module(new SbufferData)
  dataModule.io.writeReq <> DontCare
  val writeReq = dataModule.io.writeReq

  val ptag = Reg(Vec(StoreBufferSize, UInt(PTagWidth.W)))
  val vtag = Reg(Vec(StoreBufferSize, UInt(VTagWidth.W)))
  val mask = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, Bool()))))
  val data = dataModule.io.dataOut
  val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U.asTypeOf(new SbufferEntryState))))
  val cohCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(EvictCountBits.W))))
  val missqReplayCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(MissqReplayCountBits.W))))

  /*
       idle --[flush]    --> drain_all --[buf empty]  --> idle
       idle --[buf full] --> replace   --[dcache resp]--> idle
  */
  // x_drain_all: drain store queue and sbuffer
  // x_drain_sbuffer: drain sbuffer only, block store queue to sbuffer write
  val x_idle :: x_replace :: x_drain_all :: x_drain_sbuffer :: Nil = Enum(4)
  def needDrain(state: UInt): Bool =
    state(1)
  val sbuffer_state = RegInit(x_idle)
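  // Enum(4) encodes x_idle = 0, x_replace = 1, x_drain_all = 2 and
  // x_drain_sbuffer = 3, so bit 1 of the state is set exactly in the two
  // drain states; needDrain() above relies on this encoding.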

  // ---------------------- Store Enq Sbuffer ---------------------

  def getPTag(pa: UInt): UInt =
    pa(PAddrBits - 1, PAddrBits - PTagWidth)

  def getVTag(va: UInt): UInt =
    va(VAddrBits - 1, VAddrBits - VTagWidth)

  def getWord(pa: UInt): UInt =
    pa(PAddrBits-1, 3)

  def getWordOffset(pa: UInt): UInt =
    pa(OffsetWidth-1, 3)

  def getAddr(ptag: UInt): UInt =
    Cat(ptag, 0.U((PAddrBits - PTagWidth).W))

  def getByteOffset(offset: UInt): UInt =
    Cat(offset(OffsetWidth - 1, 3), 0.U(3.W))

  def isOneOf(key: UInt, seq: Seq[UInt]): Bool =
    if(seq.isEmpty) false.B else Cat(seq.map(_===key)).orR()

  def widthMap[T <: Data](f: Int => T) = (0 until StoreBufferSize) map f
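
  // Address layout used by the helpers above (illustrative, for a 64B line
  // with 8B words): paddr = | ptag (PAddrBits-6) | word (3) | byte (3) |.
  // getWord drops the 3-bit byte offset, getWordOffset keeps only the
  // in-line word index, and getAddr rebuilds a line-aligned paddr from a ptag.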

  // sbuffer entry replacement: PLRU

  val plru = new PseudoLRU(StoreBufferSize)
  val accessIdx = Wire(Vec(StorePipelineWidth + 1, Valid(UInt(SbufferIndexWidth.W))))

  val replaceIdx = plru.way
  plru.access(accessIdx)
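  // accessIdx has StorePipelineWidth + 1 ports: one per store pipe (touched
  // on merge or insert) plus one extra port, driven near the eviction logic
  // below, that refreshes the PLRU state for the replacement candidate.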

  //-------------------------cohCount-----------------------------
  // insert and merge: cohCount=0
  // every cycle cohCount+=1
  // if cohCount(EvictCountBits-1)==1, evict
  val cohTimeOutMask = VecInit(widthMap(i => cohCount(i)(EvictCountBits - 1) && stateVec(i).isActive()))
  val (cohTimeOutIdx, cohHasTimeOut) = PriorityEncoderWithFlag(cohTimeOutMask)
  val missqReplayTimeOutMask = VecInit(widthMap(i => missqReplayCount(i)(MissqReplayCountBits - 1) && stateVec(i).w_timeout))
  val (missqReplayTimeOutIdx, missqReplayHasTimeOut) = PriorityEncoderWithFlag(missqReplayTimeOutMask)

  val activeMask = VecInit(stateVec.map(s => s.isActive()))
  val drainIdx = PriorityEncoder(activeMask)

  val inflightMask = VecInit(stateVec.map(s => s.isInflight()))

  val inptags = io.in.map(in => getPTag(in.bits.addr))
  val invtags = io.in.map(in => getVTag(in.bits.vaddr))
  val sameTag = inptags(0) === inptags(1)
  val firstWord = getWord(io.in(0).bits.addr)
  val secondWord = getWord(io.in(1).bits.addr)
  val sameWord = firstWord === secondWord

  // merge condition
  val mergeMask = Wire(Vec(StorePipelineWidth, Vec(StoreBufferSize, Bool())))
  val mergeIdx = mergeMask.map(PriorityEncoder(_))
  val canMerge = mergeMask.map(ParallelOR(_))

  for(i <- 0 until StorePipelineWidth){
    mergeMask(i) := widthMap(j =>
      inptags(i) === ptag(j) && activeMask(j)
    )
  }

  // insert condition
  // firstInsertIdx: the first invalid entry of the parity picked by enbufferSelReg
  // if the second req shares the first req's ptag, secondInsertIdx equals
  // firstInsertIdx; otherwise it is the first invalid entry of the opposite parity
  val invalidMask = VecInit(stateVec.map(s => s.isInvalid()))
  val evenInvalidMask = GetEvenBits(invalidMask.asUInt)
  val oddInvalidMask = GetOddBits(invalidMask.asUInt)

  val (evenRawInsertIdx, evenCanInsert) = PriorityEncoderWithFlag(evenInvalidMask)
  val (oddRawInsertIdx, oddCanInsert) = PriorityEncoderWithFlag(oddInvalidMask)
  val evenInsertIdx = Cat(evenRawInsertIdx, 0.U(1.W))
  val oddInsertIdx = Cat(oddRawInsertIdx, 1.U(1.W))

  val enbufferSelReg = RegInit(false.B)
  when(io.in(0).valid) {
    enbufferSelReg := ~enbufferSelReg
  }

  val firstInsertIdx = Mux(enbufferSelReg, evenInsertIdx, oddInsertIdx)
  val secondInsertIdx = Mux(sameTag,
    firstInsertIdx,
    Mux(~enbufferSelReg, evenInsertIdx, oddInsertIdx)
  )
  val firstCanInsert = sbuffer_state =/= x_drain_sbuffer && Mux(enbufferSelReg, evenCanInsert, oddCanInsert)
  val secondCanInsert = sbuffer_state =/= x_drain_sbuffer && Mux(sameTag,
    firstCanInsert,
    Mux(~enbufferSelReg, evenCanInsert, oddCanInsert)
  )
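  // Allocation is split by index parity: GetEvenBits/GetOddBits view the
  // invalid mask as two half-sized free lists, and enbufferSelReg alternates
  // which parity req(0) allocates from, so two same-cycle reqs never pick the
  // same entry. E.g. evenRawInsertIdx = 2 gives evenInsertIdx = 4 (Cat with 0),
  // while oddRawInsertIdx = 2 gives oddInsertIdx = 5 (Cat with 1).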
  val need_uarch_drain = WireInit(false.B)
  val do_uarch_drain = RegNext(need_uarch_drain)
  XSPerfAccumulate("do_uarch_drain", do_uarch_drain)

  io.in(0).ready := firstCanInsert
  io.in(1).ready := secondCanInsert && !sameWord && io.in(0).ready

  def wordReqToBufLine(req: DCacheWordReq, reqptag: UInt, reqvtag: UInt, insertIdx: UInt, wordOffset: UInt, flushMask: Bool): Unit = {
    stateVec(insertIdx).state_valid := true.B
    cohCount(insertIdx) := 0.U
    missqReplayCount(insertIdx) := 0.U
    ptag(insertIdx) := reqptag
    vtag(insertIdx) := reqvtag // update vtag iff a new sbuffer line is allocated
    when(flushMask){
      for(j <- 0 until CacheLineWords){
        for(i <- 0 until DataBytes){
          mask(insertIdx)(j)(i) := false.B
        }
      }
    }
    for(i <- 0 until DataBytes){
      when(req.mask(i)){
        mask(insertIdx)(wordOffset)(i) := true.B
//        data(insertIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
      }
    }
  }
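  // flushMask is true when the req allocates a fresh line (req 0 always,
  // req 1 only when it does not share req 0's tag): the whole byte mask is
  // cleared first so that stale bytes from the entry's previous occupant can
  // never be forwarded, then the new req's bytes are marked valid.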

  def mergeWordReq(req: DCacheWordReq, reqptag: UInt, reqvtag: UInt, mergeIdx:UInt, wordOffset:UInt): Unit = {
    cohCount(mergeIdx) := 0.U
    missqReplayCount(mergeIdx) := 0.U
    for(i <- 0 until DataBytes){
      when(req.mask(i)){
        mask(mergeIdx)(wordOffset)(i) := true.B
//        data(mergeIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
      }
    }
    // check whether the vtag matches; if not, trigger a sbuffer flush
    when(reqvtag =/= vtag(mergeIdx)) {
      XSDebug("reqvtag =/= sbufvtag req(vtag %x ptag %x) sbuffer(vtag %x ptag %x)\n",
        reqvtag << OffsetWidth,
        reqptag << OffsetWidth,
        vtag(mergeIdx) << OffsetWidth,
        ptag(mergeIdx) << OffsetWidth
      )
      need_uarch_drain := true.B
    }
  }
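  // A ptag hit with a different vtag means two virtual pages alias the same
  // physical line. The merged data is still correct, but load forwarding
  // below matches on vtag only and would miss this line, so the sbuffer is
  // drained (need_uarch_drain -> x_drain_sbuffer) to restore the invariant
  // that each line's vtag and ptag agree.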

  for(((in, wordOffset), i) <- io.in.zip(Seq(firstWord, secondWord)).zipWithIndex){
    writeReq(i).valid := in.fire()
    writeReq(i).bits.wordOffset := wordOffset
    writeReq(i).bits.mask := in.bits.mask
    writeReq(i).bits.data := in.bits.data
    writeReq(i).bits.wline := in.bits.wline
    val insertIdx = if(i == 0) firstInsertIdx else secondInsertIdx
    val flushMask = if(i == 0) true.B else !sameTag
    accessIdx(i).valid := RegNext(in.fire())
    accessIdx(i).bits := RegNext(Mux(canMerge(i), mergeIdx(i), insertIdx))
    when(in.fire()){
      when(canMerge(i)){
        writeReq(i).bits.idx := mergeIdx(i)
        mergeWordReq(in.bits, inptags(i), invtags(i), mergeIdx(i), wordOffset)
        XSDebug(p"merge req $i to line [${mergeIdx(i)}]\n")
      }.otherwise({
        writeReq(i).bits.idx := insertIdx
        wordReqToBufLine(in.bits, inptags(i), invtags(i), insertIdx, wordOffset, flushMask)
        XSDebug(p"insert req $i to line[$insertIdx]\n")
      })
    }
  }

  for(i <- 0 until StoreBufferSize){
    XSDebug(stateVec(i).isValid(),
      p"[$i] timeout:${cohCount(i)(EvictCountBits-1)} state:${stateVec(i)}\n"
    )
  }

  for((req, i) <- io.in.zipWithIndex){
    XSDebug(req.fire(),
      p"accept req [$i]: " +
        p"addr:${Hexadecimal(req.bits.addr)} " +
        p"mask:${Binary(req.bits.mask)} " +
        p"data:${Hexadecimal(req.bits.data)}\n"
    )
    XSDebug(req.valid && !req.ready,
      p"req [$i] blocked by sbuffer\n"
    )
  }

  // ---------------------- Send Dcache Req ---------------------

  val sbuffer_empty = Cat(invalidMask).andR()
  val sq_empty = !Cat(io.in.map(_.valid)).orR()
  val empty = sbuffer_empty && sq_empty
  val threshold = RegNext(io.csrCtrl.sbuffer_threshold +& 1.U)
  val validCount = PopCount(activeMask)
  val do_eviction = RegNext(validCount >= threshold || validCount === (StoreBufferSize-1).U, init = false.B)
  require((StoreBufferThreshold + 1) <= StoreBufferSize)
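  // threshold is the CSR-programmed value plus one, so eviction kicks in once
  // validCount strictly exceeds csrCtrl.sbuffer_threshold, or once the buffer
  // is a single entry short of completely full.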

  XSDebug(p"validCount[$validCount]\n")

  io.flush.empty := RegNext(empty && io.sqempty)
  // lru.io.flush := sbuffer_state === x_drain_all && empty
  switch(sbuffer_state){
    is(x_idle){
      when(io.flush.valid){
        sbuffer_state := x_drain_all
      }.elsewhen(do_uarch_drain){
        sbuffer_state := x_drain_sbuffer
      }.elsewhen(do_eviction){
        sbuffer_state := x_replace
      }
    }
    is(x_drain_all){
      when(empty){
        sbuffer_state := x_idle
      }
    }
    is(x_drain_sbuffer){
      when(sbuffer_empty){
        sbuffer_state := x_idle
      }
    }
    is(x_replace){
      when(io.flush.valid){
        sbuffer_state := x_drain_all
      }.elsewhen(do_uarch_drain){
        sbuffer_state := x_drain_sbuffer
      }.elsewhen(!do_eviction){
        sbuffer_state := x_idle
      }
    }
  }
  XSDebug(p"sbuffer state:${sbuffer_state} do eviction:${do_eviction} empty:${empty}\n")

  def noSameBlockInflight(idx: UInt): Bool = {
    // stateVec(idx) itself must not be s_inflight
    !Cat(widthMap(i => inflightMask(i) && ptag(idx) === ptag(i))).orR()
  }
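  // Guard against two inflight writebacks of the same cacheline: an entry is
  // only evicted when no currently inflight entry shares its ptag, so at most
  // one dcache store req per line is outstanding at any time.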

  val need_drain = needDrain(sbuffer_state)
  val need_replace = do_eviction || (sbuffer_state === x_replace)
  val evictionIdx = Mux(missqReplayHasTimeOut,
    missqReplayTimeOutIdx,
    Mux(need_drain,
      drainIdx,
      Mux(cohHasTimeOut, cohTimeOutIdx, replaceIdx)
    )
  )
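  // Eviction priority, highest first: 1) an entry whose miss-queue replay
  // timer expired, 2) drainIdx while draining, 3) an entry whose cohCount
  // timed out, 4) the PLRU victim when replacing on fullness.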

  /*
      If there is an inflight dcache req with the same ptag as evictionIdx's,
      the current eviction must be blocked.
   */
  val prepareValid = missqReplayHasTimeOut ||
    activeMask(evictionIdx) && (need_drain || cohHasTimeOut || need_replace) && noSameBlockInflight(evictionIdx)
  val prepareValidReg = RegInit(false.B)
  // when canSendDcacheReq, send dcache req stored in pipeline reg to dcache
  val canSendDcacheReq = io.dcache.req.ready || !prepareValidReg
  // when willSendDcacheReq, read dcache req data and store them in a pipeline reg
  val willSendDcacheReq = prepareValid && canSendDcacheReq
  when(io.dcache.req.fire()){
    prepareValidReg := false.B
  }
  when(canSendDcacheReq){
    prepareValidReg := prepareValid
  }
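  // prepareValidReg implements a one-entry pipeline stage toward dcache.
  // When the req fires while canSendDcacheReq is also true, the second when()
  // wins under Chisel's last-connect semantics, so a fire that coincides with
  // a newly prepared req keeps the register valid instead of dropping a cycle.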
  when(willSendDcacheReq){
    stateVec(evictionIdx).state_inflight := true.B
    stateVec(evictionIdx).w_timeout := false.B
    // stateVec(evictionIdx).s_pipe_req := true.B
    XSDebug(p"$evictionIdx will be sent to Dcache\n")
  }
  XSDebug(p"need drain:$need_drain cohHasTimeOut: $cohHasTimeOut need replace:$need_replace\n")
  XSDebug(p"drainIdx:$drainIdx tIdx:$cohTimeOutIdx replIdx:$replaceIdx " +
    p"blocked:${!noSameBlockInflight(evictionIdx)} v:${activeMask(evictionIdx)}\n")
  XSDebug(p"prepareValid:$prepareValid evictIdx:$evictionIdx dcache ready:${io.dcache.req.ready}\n")
  // Note: if other dcache reqs in the same block are inflight,
  // the lru update may not be accurate
  accessIdx(StorePipelineWidth).valid := invalidMask(replaceIdx) || (
    need_replace && !need_drain && !cohHasTimeOut && !missqReplayHasTimeOut && canSendDcacheReq && activeMask(replaceIdx))
  accessIdx(StorePipelineWidth).bits := replaceIdx
  val evictionIdxReg = RegEnable(evictionIdx, enable = willSendDcacheReq)
  val evictionPTag = RegEnable(ptag(evictionIdx), enable = willSendDcacheReq)
  val evictionVTag = RegEnable(vtag(evictionIdx), enable = willSendDcacheReq)

  io.dcache.req.valid := prepareValidReg
  io.dcache.req.bits := DontCare
  io.dcache.req.bits.cmd   := MemoryOpConstants.M_XWR
  io.dcache.req.bits.addr  := getAddr(evictionPTag)
  io.dcache.req.bits.vaddr := getAddr(evictionVTag)
  io.dcache.req.bits.data  := data(evictionIdxReg).asUInt
  io.dcache.req.bits.mask  := mask(evictionIdxReg).asUInt
  io.dcache.req.bits.id    := evictionIdxReg

  when (io.dcache.req.fire()) {
    // stateVec(evictionIdxReg).s_pipe_req := false.B
    stateVec(evictionIdxReg).w_pipe_resp := true.B
    // assert(stateVec(evictionIdxReg).s_pipe_req === true.B)
    assert(!(io.dcache.req.bits.vaddr === 0.U))
    assert(!(io.dcache.req.bits.addr === 0.U))
  }

  XSDebug(io.dcache.req.fire(),
    p"send buf [$evictionIdxReg] to Dcache, req fire\n"
  )

  // TODO: for timing reasons, dcache store pipe resp may need to be delayed
  // update sbuffer status according to dcache resp source

  // hit resp
  io.dcache.hit_resps.map(resp => {
    val dcache_resp_id = resp.bits.id
    when (resp.fire()) {
      stateVec(dcache_resp_id).state_inflight := false.B
      stateVec(dcache_resp_id).state_valid := false.B
      stateVec(dcache_resp_id).w_pipe_resp := false.B
      assert(!resp.bits.replay)
      assert(!resp.bits.miss) // no need to resp on a miss; to be optimized
      assert(stateVec(dcache_resp_id).w_pipe_resp === true.B)
      assert(stateVec(dcache_resp_id).state_inflight === true.B)
    }
  })

  // replay resp
  val replay_resp_id = io.dcache.replay_resp.bits.id
  when (io.dcache.replay_resp.fire()) {
    missqReplayCount(replay_resp_id) := 0.U
    stateVec(replay_resp_id).w_timeout := true.B
    // waiting for timeout
    assert(io.dcache.replay_resp.bits.replay)
    assert(stateVec(replay_resp_id).w_pipe_resp === true.B)
    assert(stateVec(replay_resp_id).state_inflight === true.B)
  }
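  // Replay path: a replayed resp keeps the entry inflight but arms w_timeout;
  // missqReplayCount then counts up for SbufferReplayDelayCycles cycles (its
  // MSB is the timeout flag), after which missqReplayHasTimeOut lets the
  // entry win eviction arbitration and the req is resent to dcache.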

  // TODO: reuse cohCount
  (0 until StoreBufferSize).map(i => {
    when(stateVec(i).w_timeout && stateVec(i).state_inflight && !missqReplayCount(i)(MissqReplayCountBits-1)) {
      missqReplayCount(i) := missqReplayCount(i) + 1.U
    }
    when(activeMask(i) && !cohTimeOutMask(i)){
      cohCount(i) := cohCount(i)+1.U
    }
  })

  // TODO: fix perf counter
  // // performance counters
  // XSPerfAccumulate("store_req", io.lsu.req.fire())
  // XSPerfAccumulate("store_penalty", state =/= s_invalid)
  // // this is useless
  // // XSPerf("store_hit", state === s_pipe_resp && io.pipe_resp.fire() && !io.pipe_resp.bits.miss)
  // XSPerfAccumulate("store_replay", state === s_pipe_resp && io.pipe_resp.fire() && io.pipe_resp.bits.miss && io.pipe_resp.bits.replay)
  // XSPerfAccumulate("store_miss", state === s_pipe_resp && io.pipe_resp.fire() && io.pipe_resp.bits.miss)
  // val (store_latency_sample, store_latency) = TransactionLatencyCounter(io.lsu.req.fire(), io.lsu.resp.fire())
  // XSPerfHistogram("store_latency", store_latency, store_latency_sample, 0, 100, 10)
  // XSPerfAccumulate("store_req", io.lsu.req.fire())
  // val num_valids = PopCount(entries.map(e => !e.io.lsu.req.ready))
  // XSPerfHistogram("num_valids", num_valids, true.B, 0, cfg.nStoreReplayEntries, 1)

  if (!env.FPGAPlatform) {
    // hit resp
    io.dcache.hit_resps.zipWithIndex.map{case (resp, index) => {
      val difftest = Module(new DifftestSbufferEvent)
      val dcache_resp_id = resp.bits.id
      difftest.io.clock := clock
      difftest.io.coreid := hardId.U
      difftest.io.index := index.U
      difftest.io.sbufferResp := RegNext(resp.fire())
      difftest.io.sbufferAddr := RegNext(getAddr(ptag(dcache_resp_id)))
      difftest.io.sbufferData := RegNext(data(dcache_resp_id).asTypeOf(Vec(CacheLineBytes, UInt(8.W))))
      difftest.io.sbufferMask := RegNext(mask(dcache_resp_id).asUInt)
    }}

//    // replay resp
//    val replay_resp_id = io.dcache.replay_resp.bits.id
//    val difftest = Module(new DifftestSbufferEvent)
//    difftest.io.clock := clock
//    difftest.io.coreid := hardId.U
//    difftest.io.index := io.dcache.hit_resps.size.U // use an extra port
//    difftest.io.sbufferResp := io.dcache.replay_resp.fire()
//    difftest.io.sbufferAddr := getAddr(ptag(replay_resp_id))
//    difftest.io.sbufferData := data(replay_resp_id).asTypeOf(Vec(CacheLineBytes, UInt(8.W)))
//    difftest.io.sbufferMask := mask(replay_resp_id).asUInt
  }

  // ---------------------- Load Data Forward ---------------------
  val mismatch = Wire(Vec(LoadPipelineWidth, Bool()))
  XSPerfAccumulate("vaddr_match_failed", mismatch(0) || mismatch(1))
  for ((forward, i) <- io.forward.zipWithIndex) {
    val vtag_matches = VecInit(widthMap(w => vtag(w) === getVTag(forward.vaddr)))
    val ptag_matches = VecInit(widthMap(w => ptag(w) === getPTag(forward.paddr)))
    val tag_matches = vtag_matches
    val tag_mismatch = RegNext(forward.valid) && VecInit(widthMap(w =>
      RegNext(vtag_matches(w)) =/= RegNext(ptag_matches(w)) && RegNext((activeMask(w) || inflightMask(w)))
    )).asUInt.orR
    mismatch(i) := tag_mismatch
    when (tag_mismatch) {
      XSDebug("forward tag mismatch: pmatch %x vmatch %x vaddr %x paddr %x\n",
        RegNext(ptag_matches.asUInt),
        RegNext(vtag_matches.asUInt),
        RegNext(forward.vaddr),
        RegNext(forward.paddr)
      )
      do_uarch_drain := true.B
    }
    val valid_tag_matches = widthMap(w => tag_matches(w) && activeMask(w))
    val inflight_tag_matches = widthMap(w => tag_matches(w) && inflightMask(w))
    val line_offset_mask = UIntToOH(getWordOffset(forward.paddr))

    val valid_tag_match_reg = valid_tag_matches.map(RegNext(_))
    val inflight_tag_match_reg = inflight_tag_matches.map(RegNext(_))
    val line_offset_reg = RegNext(line_offset_mask)

    val selectedValidMask = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
    val selectedValidData = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))

    val selectedInflightMask = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
    val selectedInflightData = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))

    val selectedInflightMaskFast = Mux1H(line_offset_mask, Mux1H(inflight_tag_matches, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
    val selectedValidMaskFast = Mux1H(line_offset_mask, Mux1H(valid_tag_matches, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))

    forward.dataInvalid := false.B // data in store line merge buffer is always ready
    forward.matchInvalid := tag_mismatch // paddr / vaddr cam result does not match
    for (j <- 0 until DataBytes) {
      forward.forwardMask(j) := false.B
      forward.forwardData(j) := DontCare

      // valid entries have higher priority than inflight entries
      when(selectedInflightMask(j)) {
        forward.forwardMask(j) := true.B
        forward.forwardData(j) := selectedInflightData(j)
      }
      when(selectedValidMask(j)) {
        forward.forwardMask(j) := true.B
        forward.forwardData(j) := selectedValidData(j)
      }

      forward.forwardMaskFast(j) := selectedInflightMaskFast(j) || selectedValidMaskFast(j)
    }
  }
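
  // Forwarding matches on vtag only (tag_matches = vtag_matches); the ptag
  // CAM is kept to detect vtag/ptag disagreement one cycle later, which
  // reports matchInvalid and forces a uarch drain. A valid (active) entry
  // overrides an inflight one byte by byte: a store to a line that is already
  // inflight allocates a fresh active entry, so the active copy is the newer
  // one. The *Fast masks use the unregistered matches to give loads a
  // forwarding-mask estimate one cycle earlier.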

  for (i <- 0 until StoreBufferSize) {
    XSDebug("sbf entry " + i + " : ptag %x vtag %x valid %x active %x inflight %x w_resp %x w_timeout %x\n",
      ptag(i) << OffsetWidth,
      vtag(i) << OffsetWidth,
      stateVec(i).isValid(),
      activeMask(i),
      inflightMask(i),
      stateVec(i).w_pipe_resp,
      stateVec(i).w_timeout
    )
  }

  val perf_valid_entry_count = PopCount(VecInit(stateVec.map(s => !s.isInvalid())).asUInt)
  XSPerfHistogram("util", perf_valid_entry_count, true.B, 0, StoreBufferSize, 1)
  XSPerfAccumulate("sbuffer_req_valid", PopCount(VecInit(io.in.map(_.valid)).asUInt))
  XSPerfAccumulate("sbuffer_req_fire", PopCount(VecInit(io.in.map(_.fire())).asUInt))
  XSPerfAccumulate("sbuffer_merge", PopCount(VecInit(io.in.zipWithIndex.map({case (in, i) => in.fire() && canMerge(i)})).asUInt))
  XSPerfAccumulate("sbuffer_newline", PopCount(VecInit(io.in.zipWithIndex.map({case (in, i) => in.fire() && !canMerge(i)})).asUInt))
  XSPerfAccumulate("dcache_req_valid", io.dcache.req.valid)
  XSPerfAccumulate("dcache_req_fire", io.dcache.req.fire())
  XSPerfAccumulate("sbuffer_idle", sbuffer_state === x_idle)
  XSPerfAccumulate("sbuffer_flush", sbuffer_state === x_drain_sbuffer)
  XSPerfAccumulate("sbuffer_replace", sbuffer_state === x_replace)
  XSPerfAccumulate("evenCanInsert", evenCanInsert)
  XSPerfAccumulate("oddCanInsert", oddCanInsert)
}