// File: /XiangShan/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala (revision 35bb77967d8f8147bbe08e4cf9ecb32a8f912c9d)
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.ExceptionNO._
import xiangshan.frontend.FtqPtr
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.FuType
import xiangshan.backend.fu.fpu.FPU
import xiangshan.backend.rob.RobLsqIO
import xiangshan.mem.Bundles._
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles.{MemExuOutput, DynInst}
import xiangshan.backend.fu.FuConfig.LduCfg
import xiangshan.cache.mmu.HasTlbConst
import xiangshan.cache._
import xiangshan.cache.wpu.ReplayCarry

class LoadMisalignBuffer(implicit p: Parameters) extends XSModule
  with HasCircularQueuePtrHelper
  with HasLoadHelper
  with HasTlbConst
{
  private val enqPortNum = LoadPipelineWidth
  private val maxSplitNum = 2

  require(maxSplitNum == 2)

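  // Access-size encoding used below; it appears to match the low two bits of
  // fuOpType / alignedType: byte, half-word, word, double-word.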
  private val LB = "b00".U(2.W)
  private val LH = "b01".U(2.W)
  private val LW = "b10".U(2.W)
  private val LD = "b11".U(2.W)

  // encoding of how many bytes to shift or truncate
  private val BYTE0 = "b000".U(3.W)
  private val BYTE1 = "b001".U(3.W)
  private val BYTE2 = "b010".U(3.W)
  private val BYTE3 = "b011".U(3.W)
  private val BYTE4 = "b100".U(3.W)
  private val BYTE5 = "b101".U(3.W)
  private val BYTE6 = "b110".U(3.W)
  private val BYTE7 = "b111".U(3.W)

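  // Byte mask for a scalar access of the given size (1/2/4/8 bytes).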
  def getMask(sizeEncode: UInt) = LookupTree(sizeEncode, List(
    LB -> 0x1.U, // lb
    LH -> 0x3.U, // lh
    LW -> 0xf.U, // lw
    LD -> 0xff.U  // ld
  ))

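  // Shift the raw result right by `shiftEncode` bytes, then keep the low
  // `truncateEncode` bytes (zero-extended to XLEN). Used when merging the two
  // split-load results back into a single value.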
  def getShiftAndTruncateData(shiftEncode: UInt, truncateEncode: UInt, data: UInt) = {
    val shiftData = LookupTree(shiftEncode, List(
      BYTE0 -> data(63,    0),
      BYTE1 -> data(63,    8),
      BYTE2 -> data(63,   16),
      BYTE3 -> data(63,   24),
      BYTE4 -> data(63,   32),
      BYTE5 -> data(63,   40),
      BYTE6 -> data(63,   48),
      BYTE7 -> data(63,   56)
    ))
    val truncateData = LookupTree(truncateEncode, List(
      BYTE0 -> 0.U(XLEN.W), // cannot truncate to a width of 0 bytes
      BYTE1 -> shiftData(7,    0),
      BYTE2 -> shiftData(15,   0),
      BYTE3 -> shiftData(23,   0),
      BYTE4 -> shiftData(31,   0),
      BYTE5 -> shiftData(39,   0),
      BYTE6 -> shiftData(47,   0),
      BYTE7 -> shiftData(55,   0)
    ))
    truncateData(XLEN - 1, 0)
  }

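  // Recursively selects the oldest valid entry, ordered by robIdx and, within the
  // same robIdx, by uopIdx. For two or more inputs it reduces to a single
  // (valid, bits) result.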
  def selectOldest[T <: LqWriteBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    if (valid.length == 0 || valid.length == 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1),
        Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) ||
          (bits(0).uop.robIdx === bits(1).uop.robIdx && bits(0).uop.uopIdx > bits(1).uop.uopIdx), res(1), res(0)),
        Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = selectOldest(valid.take(valid.length / 2), bits.take(bits.length / 2))
      val right = selectOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2)))
      selectOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }

  val io = IO(new Bundle() {
    val redirect        = Flipped(Valid(new Redirect))
    val enq             = Vec(enqPortNum, Flipped(new MisalignBufferEnqIO))
    val rob             = Flipped(new RobLsqIO)
    val splitLoadReq    = Decoupled(new LsPipelineBundle)
    val splitLoadResp   = Flipped(Valid(new LqWriteBundle))
    val writeBack       = Decoupled(new MemExuOutput)
    val vecWriteBack    = Decoupled(new VecPipelineFeedbackIO(isVStore = false))
    val loadOutValid    = Input(Bool())
    val loadVecOutValid = Input(Bool())
    val overwriteExpBuf = Output(new XSBundle {
      val valid  = Bool()
      val vaddr  = UInt(XLEN.W)
      val isHyper = Bool()
      val gpaddr = UInt(XLEN.W)
      val isForVSnonLeafPTE = Bool()
    })
    val flushLdExpBuff  = Output(Bool())
    val loadMisalignFull = Output(Bool())
  })

  io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool()))
  io.rob.uop  := 0.U.asTypeOf(Vec(LoadPipelineWidth, new DynInst))

  val req_valid = RegInit(false.B)
  val req = Reg(new LqWriteBundle)

  io.loadMisalignFull := req_valid

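  // Enqueue handshake: the buffer holds a single request, so at most one enqueue
  // port is accepted per cycle, with lower-indexed ports taking priority.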
  (0 until io.enq.length).map{i =>
    if (i == 0) {
      io.enq(0).req.ready := !req_valid && io.enq(0).req.valid
    }
    else {
      io.enq(i).req.ready := !io.enq.take(i).map(_.req.ready).reduce(_ || _) && !req_valid && io.enq(i).req.valid
    }
  }

  val select_req_bit   = ParallelPriorityMux(io.enq.map(_.req.valid), io.enq.map(_.req.bits))
  val select_req_valid = io.enq.map(_.req.valid).reduce(_ || _)
  val canEnqValid = !req_valid && !select_req_bit.uop.robIdx.needFlush(io.redirect) && select_req_valid
  when(canEnqValid) {
    req := select_req_bit
    req_valid := true.B
  }

  // buffer control:
  //  - s_idle:   idle
  //  - s_split:  split the misaligned load
  //  - s_req:    issue a split memory access request
  //  - s_resp:   wait for the response of a split load access request
  //  - s_comb_wakeup_rep: merge the data and issue a wakeup load
  //  - s_wb: write back to rob/vecMergeBuffer
  val s_idle :: s_split :: s_req :: s_resp :: s_comb_wakeup_rep :: s_wb :: Nil = Enum(6)
  val bufferState = RegInit(s_idle)
  val splitLoadReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle))))
  val splitLoadResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LqWriteBundle))))
  val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec()))
  val unSentLoads = RegInit(0.U(maxSplitNum.W))
  val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W))
  val needWakeUpReqsWire = Wire(Bool())
  val needWakeUpWB       = RegInit(false.B)
  // one-hot data select (genRdataOH of the uop), captured at enqueue and used to format the final writeback data
  val data_select        = RegEnable(genRdataOH(select_req_bit.uop), 0.U(genRdataOH(select_req_bit.uop).getWidth.W), canEnqValid)

  // whether an exception occurred or an uncache address was touched in any split load
  val globalException = RegInit(false.B)
  val globalUncache = RegInit(false.B)

  // debug info
  val globalMMIO = RegInit(false.B)
  val globalNC   = RegInit(false.B)

  val hasException = io.splitLoadResp.bits.vecActive &&
    ExceptionNO.selectByFu(io.splitLoadResp.bits.uop.exceptionVec, LduCfg).asUInt.orR || TriggerAction.isDmode(io.splitLoadResp.bits.uop.trigger)
  val isUncache = io.splitLoadResp.bits.mmio || io.splitLoadResp.bits.nc
  needWakeUpReqsWire := false.B
  switch(bufferState) {
    is (s_idle) {
      when (req_valid) {
        bufferState := s_split
      }
    }

    is (s_split) {
      bufferState := s_req
    }

    is (s_req) {
      when (io.splitLoadReq.fire) {
        bufferState := s_resp
      }
    }

    is (s_resp) {
      when (io.splitLoadResp.valid) {
        val clearOh = UIntToOH(curPtr)
        when (hasException || isUncache) {
          // commit directly when an exception occurs
          // if any split load reaches uncache space, delegate to software via a loadAddrMisaligned exception
          bufferState := s_wb
          globalException := hasException
          globalUncache := isUncache
          globalMMIO := io.splitLoadResp.bits.mmio
          globalNC   := io.splitLoadResp.bits.nc
        } .elsewhen(io.splitLoadResp.bits.rep_info.need_rep || (unSentLoads & ~clearOh).orR) {
          // need replay or still has unsent requests
          bufferState := s_req
        } .otherwise {
          // merge the split load results
          bufferState := s_comb_wakeup_rep
          needWakeUpWB := !req.isvec
        }
      }
    }

    is (s_comb_wakeup_rep) {
      when(!req.isvec) {
        when(io.splitLoadReq.fire) {
          bufferState := s_wb
        }.otherwise {
          bufferState := s_comb_wakeup_rep
        }
        needWakeUpReqsWire := true.B
      } .otherwise {
        bufferState := s_wb
      }

    }

    is (s_wb) {
      when(req.isvec) {
        when(io.vecWriteBack.fire) {
          bufferState := s_idle
          req_valid := false.B
          curPtr := 0.U
          unSentLoads := 0.U
          globalException := false.B
          globalUncache := false.B
          needWakeUpWB := false.B

          globalMMIO := false.B
          globalNC   := false.B
        }

      } .otherwise {
        when(io.writeBack.fire) {
          bufferState := s_idle
          req_valid := false.B
          curPtr := 0.U
          unSentLoads := 0.U
          globalException := false.B
          globalUncache := false.B
          needWakeUpWB := false.B

          globalMMIO := false.B
          globalNC   := false.B
        }
      }

    }
  }

  val alignedType = Mux(req.isvec, req.alignedType(1,0), req.uop.fuOpType(1, 0))
  val highAddress = LookupTree(alignedType, List(
    LB -> 0.U,
    LH -> 1.U,
    LW -> 3.U,
    LD -> 7.U
  )) + req.vaddr(4, 0)
  // check whether (vaddr + opSize - 1) and vaddr fall in the same 16-byte region
  val cross16BytesBoundary = req_valid && (highAddress(4) =/= req.vaddr(4))
  val aligned16BytesAddr   = (req.vaddr >> 4) << 4 // req.vaddr & ~("b1111".U)
  val aligned16BytesSel    = req.vaddr(3, 0)

  // meta of the 128-bit load
  val new128Load = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  // meta of the split loads
  val lowAddrLoad  = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val highAddrLoad = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val lowResultShift = RegInit(0.U(3.W)) // how many bytes to shift the result right by
  val lowResultWidth = RegInit(0.U(3.W)) // how many bytes to take from the result
  val highResultShift = RegInit(0.U(3.W))
  val highResultWidth = RegInit(0.U(3.W))

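  // Split strategy: the misaligned access is broken into two naturally aligned loads,
  // a "low" part at (or just below) vaddr and a "high" part covering the remaining bytes.
  // For example, an LW with vaddr(1, 0) == "b01" becomes an aligned LW at vaddr - 1
  // (shift the result right by 1 byte, keep 3 bytes) plus an LB at vaddr + 3 (keep 1 byte);
  // the per-case shift/width values below follow this scheme.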
  when (bufferState === s_split) {
    when (!cross16BytesBoundary) {
      assert(false.B, "There should be no misaligned access that does not cross a 16-byte boundary.")
    } .otherwise {
      // split this unaligned load into `maxSplitNum` aligned loads
      unSentLoads := Fill(maxSplitNum, 1.U(1.W))
      curPtr := 0.U
      lowAddrLoad.uop := req.uop
      lowAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      lowAddrLoad.fullva := req.fullva
      highAddrLoad.uop := req.uop
      highAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      highAddrLoad.fullva := req.fullva

      switch (alignedType(1, 0)) {
        is (LB) {
          assert(false.B, "lb should not trigger a misaligned access")
        }

        is (LH) {
          lowAddrLoad.uop.fuOpType := LB
          lowAddrLoad.vaddr := req.vaddr
          lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
          lowResultShift    := BYTE0
          lowResultWidth    := BYTE1

          highAddrLoad.uop.fuOpType := LB
          highAddrLoad.vaddr := req.vaddr + 1.U
          highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
          highResultShift    := BYTE0
          highResultWidth    := BYTE1
        }

        is (LW) {
          switch (req.vaddr(1, 0)) {
            is ("b00".U) {
              assert(false.B, "should not trigger a misaligned access")
            }

            is ("b01".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b10".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b11".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }
          }
        }

        is (LD) {
          switch (req.vaddr(2, 0)) {
            is ("b000".U) {
              assert(false.B, "should not trigger a misaligned access")
            }

            is ("b001".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE7

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 7.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b010".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 2.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE2
              lowResultWidth    := BYTE6

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 6.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b011".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 3.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE3
              lowResultWidth    := BYTE5

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 5.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }

            is ("b100".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE4

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 4.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE4
            }

            is ("b101".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE5
            }

            is ("b110".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE6
            }

            is ("b111".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE7
            }
          }
        }
      }

      splitLoadReqs(0) := lowAddrLoad
      splitLoadReqs(1) := highAddrLoad
    }
    exceptionVec := 0.U.asTypeOf(exceptionVec.cloneType)
  }

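  // Issue the split requests one at a time (curPtr selects low/high). In
  // s_comb_wakeup_rep the same port is reused to send a "wake-up" load for the
  // scalar case, flagged by misalignNeedWakeUp.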
  io.splitLoadReq.valid := req_valid && (bufferState === s_req || bufferState === s_comb_wakeup_rep && needWakeUpReqsWire && !req.isvec)
  io.splitLoadReq.bits  := splitLoadReqs(curPtr)
  io.splitLoadReq.bits.isvec  := req.isvec
  io.splitLoadReq.bits.misalignNeedWakeUp  := needWakeUpReqsWire
  io.splitLoadReq.bits.isFinalSplit        := curPtr(0) && !needWakeUpReqsWire
  // Restore the information of the H extension load
  // bit encoding: | hlv 1 | hlvx 1 | is unsigned(1bit) | size(2bit) |
  val reqIsHlv  = LSUOpType.isHlv(req.uop.fuOpType)
  val reqIsHlvx = LSUOpType.isHlvx(req.uop.fuOpType)
  io.splitLoadReq.bits.uop.fuOpType := Mux(req.isvec, req.uop.fuOpType, Cat(reqIsHlv, reqIsHlvx, 0.U(1.W), splitLoadReqs(curPtr).uop.fuOpType(1, 0)))
  io.splitLoadReq.bits.alignedType  := Mux(req.isvec, splitLoadReqs(curPtr).uop.fuOpType(1, 0), req.alignedType)

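  // Record each split response. An uncache access turns the whole request into a
  // software-visible loadAddrMisaligned exception; an ordinary exception is merged
  // into exceptionVec; otherwise the finished split is cleared from unSentLoads and
  // curPtr advances to the next one (unless a replay is needed).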
  when (io.splitLoadResp.valid) {
    val resp = io.splitLoadResp.bits
    splitLoadResp(curPtr) := io.splitLoadResp.bits
    when (isUncache) {
      unSentLoads := 0.U
      exceptionVec := ExceptionNO.selectByFu(0.U.asTypeOf(exceptionVec.cloneType), LduCfg)
      // delegate to software
      exceptionVec(loadAddrMisaligned) := true.B
    } .elsewhen (hasException) {
      unSentLoads := 0.U
      LduCfg.exceptionOut.map(no => exceptionVec(no) := exceptionVec(no) || resp.uop.exceptionVec(no))
    } .elsewhen (!io.splitLoadResp.bits.rep_info.need_rep) {
      unSentLoads := unSentLoads & ~UIntToOH(curPtr)
      curPtr := curPtr + 1.U
      exceptionVec := 0.U.asTypeOf(ExceptionVec())
    }
  }

  val combinedData = RegInit(0.U(XLEN.W))

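  // Combine: shift/truncate each split result down to the bytes it contributes,
  // concatenate them byte-wise (low part first), then format the result with
  // rdataHelper / rdataVecHelper for the original access type.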
  when (bufferState === s_comb_wakeup_rep) {
    val lowAddrResult = getShiftAndTruncateData(lowResultShift, lowResultWidth, splitLoadResp(0).data)
                          .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
    val highAddrResult = getShiftAndTruncateData(highResultShift, highResultWidth, splitLoadResp(1).data)
                          .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
    val catResult = Wire(Vec(XLEN / 8, UInt(8.W)))
    (0 until XLEN / 8).map {
      case i => {
        when (i.U < lowResultWidth) {
          catResult(i) := lowAddrResult(i)
        } .otherwise {
          catResult(i) := highAddrResult(i.U - lowResultWidth)
        }
      }
    }
    combinedData := Mux(req.isvec, rdataVecHelper(req.alignedType, (catResult.asUInt)(XLEN - 1, 0)), rdataHelper(req.uop, (catResult.asUInt)(XLEN - 1, 0)))

  }

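  // Scalar writeback: for non-vector loads, write back once the wake-up load has
  // responded, or immediately on a global exception / uncache access.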
  io.writeBack.valid := req_valid && (bufferState === s_wb) && (io.splitLoadResp.valid && io.splitLoadResp.bits.misalignNeedWakeUp || globalUncache || globalException) && !io.loadOutValid && !req.isvec
  io.writeBack.bits.uop := req.uop
  io.writeBack.bits.uop.exceptionVec := DontCare
  LduCfg.exceptionOut.map(no => io.writeBack.bits.uop.exceptionVec(no) := (globalUncache || globalException) && exceptionVec(no))
  io.writeBack.bits.uop.rfWen := !globalException && !globalUncache && req.uop.rfWen
  io.writeBack.bits.uop.fuType := FuType.ldu.U
  io.writeBack.bits.uop.flushPipe := false.B
  io.writeBack.bits.uop.replayInst := false.B
  io.writeBack.bits.data := newRdataHelper(data_select, combinedData)
  io.writeBack.bits.isFromLoadUnit := needWakeUpWB
  // Misaligned accesses to uncache space trigger exceptions, so in theory these signals have no practical effect.
  // Assign them correctly anyway.
  io.writeBack.bits.debug.isMMIO := globalMMIO
  io.writeBack.bits.debug.isNC := globalNC
  io.writeBack.bits.debug.isPerfCnt := false.B
  io.writeBack.bits.debug.paddr := req.paddr
  io.writeBack.bits.debug.vaddr := req.vaddr


  // vector output
  io.vecWriteBack.valid := req_valid && (bufferState === s_wb) && !io.loadVecOutValid && req.isvec

  io.vecWriteBack.bits.alignedType          := req.alignedType
  io.vecWriteBack.bits.vecFeedback          := true.B
  io.vecWriteBack.bits.vecdata.get          := combinedData
  io.vecWriteBack.bits.isvec                := req.isvec
  io.vecWriteBack.bits.elemIdx              := req.elemIdx
  io.vecWriteBack.bits.elemIdxInsideVd.get  := req.elemIdxInsideVd
  io.vecWriteBack.bits.mask                 := req.mask
  io.vecWriteBack.bits.reg_offset.get       := 0.U
  io.vecWriteBack.bits.usSecondInv          := req.usSecondInv
  io.vecWriteBack.bits.mBIndex              := req.mbIndex
  io.vecWriteBack.bits.hit                  := true.B
  io.vecWriteBack.bits.sourceType           := RSFeedbackType.lrqFull
  io.vecWriteBack.bits.trigger              := TriggerAction.None
  io.vecWriteBack.bits.flushState           := DontCare
  io.vecWriteBack.bits.exceptionVec         := ExceptionNO.selectByFu(exceptionVec, VlduCfg)
  io.vecWriteBack.bits.hasException         := globalException
  io.vecWriteBack.bits.vaddr                := req.fullva
  io.vecWriteBack.bits.vaNeedExt            := req.vaNeedExt
  io.vecWriteBack.bits.gpaddr               := req.gpaddr
  io.vecWriteBack.bits.isForVSnonLeafPTE    := req.isForVSnonLeafPTE
  io.vecWriteBack.bits.mmio                 := globalMMIO
  io.vecWriteBack.bits.vstart               := req.uop.vpu.vstart
  io.vecWriteBack.bits.vecTriggerMask       := req.vecTriggerMask
  io.vecWriteBack.bits.nc                   := globalNC


  val flush = req_valid && req.uop.robIdx.needFlush(io.redirect)

  when (flush) {
    bufferState := s_idle
    req_valid := false.B
    curPtr := 0.U
    unSentLoads := 0.U
    globalException := false.B
    globalUncache := false.B

    globalMMIO := false.B
    globalNC   := false.B
  }

  // NOTE: special case (a misaligned load crosses a page boundary and the page fault happens in the next page)
  // if the exception happens in the higher page address part, overwrite the loadExceptionBuffer vaddr
  val shouldOverwrite = req_valid && globalException
  val overwriteExpBuf = GatedValidRegNext(shouldOverwrite)
  val overwriteVaddr = RegEnable(
    Mux(
      cross16BytesBoundary && (curPtr === 1.U),
      splitLoadResp(curPtr).vaddr,
      splitLoadResp(curPtr).fullva),
    shouldOverwrite)
  val overwriteGpaddr = RegEnable(splitLoadResp(curPtr).gpaddr, shouldOverwrite)
  val overwriteIsHyper = RegEnable(splitLoadResp(curPtr).isHyper, shouldOverwrite)
  val overwriteIsForVSnonLeafPTE = RegEnable(splitLoadResp(curPtr).isForVSnonLeafPTE, shouldOverwrite)

  // TODO: In theory there is no need to overwrite, but for now the signal is retained in the code in this way,
  // and it will be removed after sufficient verification.
  io.overwriteExpBuf.valid := false.B
  io.overwriteExpBuf.vaddr := overwriteVaddr
  io.overwriteExpBuf.isHyper := overwriteIsHyper
  io.overwriteExpBuf.gpaddr := overwriteGpaddr
  io.overwriteExpBuf.isForVSnonLeafPTE := overwriteIsForVSnonLeafPTE

  // when there is no exception and no uncache access, flush the loadExceptionBuffer at s_wb
  val flushLdExpBuff = GatedValidRegNext(req_valid && (bufferState === s_wb) && !(globalUncache || globalException))
  io.flushLdExpBuff := flushLdExpBuff

  XSPerfAccumulate("alloc",                  RegNext(!req_valid) && req_valid)
  XSPerfAccumulate("flush",                  flush)
  XSPerfAccumulate("flush_idle",             flush && (bufferState === s_idle))
  XSPerfAccumulate("flush_non_idle",         flush && (bufferState =/= s_idle))
}