xref: /XiangShan/src/main/scala/xiangshan/mem/prefetch/L1StridePrefetcher.scala (revision 99ce5576f0ecce1b5045b7bc0dbbb2debd934fbb)
/***************************************************************************************
* Copyright (c) 2024 Beijing Institute of Open Source Chip (BOSC)
* Copyright (c) 2020-2024 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
*
*
* Acknowledgement
*
* This implementation is inspired by several key papers:
* [1] Jean-Loup Baer, and Tien-Fu Chen. "[An effective on-chip preloading scheme to reduce data access penalty.]
* (https://doi.org/10.1145/125826.125932)" ACM/IEEE Conference on Supercomputing. 1991.
***************************************************************************************/
24c49ebec8SHaoyuan Feng
250d32f713Shappy-lxpackage xiangshan.mem.prefetch
260d32f713Shappy-lx
278891a219SYinan Xuimport org.chipsalliance.cde.config.Parameters
280d32f713Shappy-lximport chisel3._
290d32f713Shappy-lximport chisel3.util._
300d32f713Shappy-lximport utils._
310d32f713Shappy-lximport utility._
329e12e8edScz4eimport xiangshan._
339e12e8edScz4eimport xiangshan.mem.L1PrefetchReq
34*99ce5576Scz4eimport xiangshan.mem.Bundles.LsPrefetchTrainBundle
359e12e8edScz4eimport xiangshan.mem.trace._
360d32f713Shappy-lximport xiangshan.cache.HasDCacheParameters
370d32f713Shappy-lximport xiangshan.cache.mmu._
380d32f713Shappy-lximport scala.collection.SeqLike
390d32f713Shappy-lx
/**
 * Compile-time parameters shared by the stride prefetcher components.
 *
 * Mixed into [[StrideMetaBundle]] and [[StrideMetaArray]]. The declaration order
 * is significant: later values are derived from earlier ones, and
 * `BLOCK_OFFSET` is inherited from `HasL1PrefetchHelper`.
 */
trait HasStridePrefetchHelper extends HasL1PrefetchHelper {
  // ---- table geometry and field widths ----
  val STRIDE_FILTER_SIZE = 6
  // number of entries held by the stride meta array
  val STRIDE_ENTRY_NUM = 10
  // width of the recorded stride field
  val STRIDE_BITS = 10 + BLOCK_OFFSET
  // number of low virtual-address bits tracked per entry
  val STRIDE_VADDR_BITS = 10 + BLOCK_OFFSET
  // width of the saturating confidence counter
  val STRIDE_CONF_BITS = 2

  // ---- detail control ----
  // default for the run-time "always_update" knob: refresh pre_vaddr on every
  // update, even when the observed stride is rejected
  val ALWAYS_UPDATE_PRE_VADDR = true
  // if true, prefetch degree is greater than 1, 1 otherwise
  val AGGRESIVE_POLICY = false
  // aggressive degree
  val STRIDE_LOOK_AHEAD_BLOCKS = 2
  // if true, avoid collision with stream
  val LOOK_UP_STREAM = false

  // value passed as the `width` of every generated prefetch request
  val STRIDE_WIDTH_BLOCKS = if (AGGRESIVE_POLICY) STRIDE_LOOK_AHEAD_BLOCKS else 1

  // saturation value of the confidence counter (all ones in STRIDE_CONF_BITS bits)
  def MAX_CONF = (1 << STRIDE_CONF_BITS) - 1
}
570d32f713Shappy-lx
/**
 * One entry of the stride table.
 *
 * Holds the low bits of the last virtual address seen for a (hashed) PC, the
 * stride currently being tracked, and a saturating confidence counter. The
 * methods below emit Chisel connections; they are meant to be called on a
 * register-backed instance from inside a module.
 */
class StrideMetaBundle(implicit p: Parameters) extends XSBundle with HasStridePrefetchHelper {
  val pre_vaddr = UInt(STRIDE_VADDR_BITS.W)
  val stride = UInt(STRIDE_BITS.W)
  val confidence = UInt(STRIDE_CONF_BITS.W)
  val hash_pc = UInt(HASH_TAG_WIDTH.W)

  /** Clears the entry and seeds its tag with the entry's own index. */
  def reset(index: Int) = {
    hash_pc := index.U
    confidence := 0.U
    stride := 0.U
    pre_vaddr := 0.U
  }

  /**
   * Tag comparison against a new hashed PC.
   *
   * @param valid1      first qualifier (the caller passes the entry valid bit)
   * @param valid2      second qualifier (the caller passes the request valid)
   * @param new_hash_pc hashed PC to compare with the stored tag
   * @return true only when both qualifiers are set and the tags are equal
   */
  def tag_match(valid1: Bool, valid2: Bool, new_hash_pc: UInt): Bool = {
    val same_tag = hash_pc === new_hash_pc
    valid1 && valid2 && same_tag
  }

  /**
   * Starts tracking a new PC: records its tag and the low address bits, with
   * stride and confidence cleared.
   */
  def alloc(vaddr: UInt, alloc_hash_pc: UInt) = {
    hash_pc := alloc_hash_pc
    pre_vaddr := vaddr(STRIDE_VADDR_BITS - 1, 0)
    stride := 0.U
    confidence := 0.U
  }

  /**
   * Trains the entry with a new access address.
   *
   * @param vaddr                   address of the new access; only the low
   *                                STRIDE_VADDR_BITS bits are used
   * @param always_update_pre_vaddr when set, `pre_vaddr` is refreshed even if
   *                                the observed stride is rejected
   * @return `(can_send_pf, new_stride)`: whether a prefetch may be issued for
   *         this access, and the stride observed against the previous address
   */
  def update(vaddr: UInt, always_update_pre_vaddr: Bool) = {
    val new_vaddr = vaddr(STRIDE_VADDR_BITS - 1, 0)
    val new_stride = new_vaddr - pre_vaddr
    val new_stride_blk = block_addr(new_stride)
    // NOTE: for now, disable negative stride.
    // A stride is accepted only when its block-granular value is neither 0
    // nor 1 and the top bit of the difference is clear.
    val stride_non_negative = new_stride(STRIDE_VADDR_BITS - 1) === 0.U
    val stride_valid = new_stride_blk =/= 0.U && new_stride_blk =/= 1.U && stride_non_negative
    val stride_match = new_stride === stride
    val low_confidence = confidence <= 1.U
    val can_send_pf = stride_valid && stride_match && confidence === MAX_CONF.U

    // saturating increment / decrement candidates for the confidence counter
    val confidence_inc = Mux(confidence === MAX_CONF.U, confidence, confidence + 1.U)
    val confidence_dec = Mux(confidence === 0.U, confidence, confidence - 1.U)

    // an accepted stride raises confidence on a match and lowers it otherwise
    when(stride_valid) {
      confidence := Mux(stride_match, confidence_inc, confidence_dec)
    }
    // the tracked stride is replaced only by an accepted, mismatching stride
    // while confidence is still low
    when(stride_valid && !stride_match && low_confidence) {
      stride := new_stride
    }
    // remember the address on an accepted stride, or unconditionally on request
    when(stride_valid || always_update_pre_vaddr) {
      pre_vaddr := new_vaddr
    }

    (can_send_pf, new_stride)
  }

}
1110d32f713Shappy-lx
/**
 * PC-indexed stride table and the pipeline that turns training requests into
 * L1 and L2 prefetch requests.
 *
 * Pipeline overview:
 *  - s0: hash the PC and compare it against every valid entry
 *  - s1: allocate a new entry on a miss, or train the hit entry
 *  - s2: compute the L1 and L2 prefetch addresses
 *  - s3: send the L1 prefetch request
 *  - s4: send the L2 prefetch request
 */
class StrideMetaArray(implicit p: Parameters) extends XSModule with HasStridePrefetchHelper {
  val io = IO(new XSBundle {
    val enable = Input(Bool())
    // TODO: flush all entry when process changing happens, or disable stream prefetch for a while
    val flush = Input(Bool())
    val dynamic_depth = Input(UInt(32.W)) // TODO: enable dynamic stride depth
    val train_req = Flipped(DecoupledIO(new PrefetchReqBundle))
    val l1_prefetch_req = ValidIO(new StreamPrefetchReqBundle)
    val l2_l3_prefetch_req = ValidIO(new StreamPrefetchReqBundle)
    // query Stream component to see if a stream pattern has already been detected
    val stream_lookup_req  = ValidIO(new PrefetchReqBundle)
    val stream_lookup_resp = Input(Bool())
  })

  // Entry payload has no reset value; only the valid bits are reset.
  val array = Reg(Vec(STRIDE_ENTRY_NUM, new StrideMetaBundle))
  val valids = RegInit(VecInit(Seq.fill(STRIDE_ENTRY_NUM)(false.B)))

  /** Invalidates entry `i`; the payload registers are left untouched. */
  def reset_array(i: Int): Unit = {
    valids(i) := false.B
    // only the control signals need to be reset, which is friendlier to area
    // array(i).reset(i)
  }

  val replacement = ReplacementPolicy.fromString("plru", STRIDE_ENTRY_NUM)

  // s0: hash pc -> cam all entries
  val s0_can_accept = Wire(Bool())
  val s0_valid = io.train_req.fire
  val s0_vaddr = io.train_req.bits.vaddr
  val s0_pc = io.train_req.bits.pc
  val s0_pc_hash = pc_hash_tag(s0_pc)
  val s0_pc_match_vec = VecInit(array zip valids map { case (e, v) => e.tag_match(v, s0_valid, s0_pc_hash) }).asUInt
  val s0_hit = s0_pc_match_vec.orR
  // on a miss the victim is whatever the replacement policy proposes
  val s0_index = Mux(s0_hit, OHToUInt(s0_pc_match_vec), replacement.way)
  io.train_req.ready := s0_can_accept
  io.stream_lookup_req.valid := s0_valid
  io.stream_lookup_req.bits  := io.train_req.bits

  when(s0_valid) {
    replacement.access(s0_index)
  }

  // at most one entry may match a given hashed PC
  assert(PopCount(s0_pc_match_vec) <= 1.U)
  XSPerfAccumulate("s0_valid", s0_valid)
  XSPerfAccumulate("s0_hit", s0_valid && s0_hit)
  XSPerfAccumulate("s0_miss", s0_valid && !s0_hit)

  // s1: alloc or update
  val s1_valid = GatedValidRegNext(s0_valid)
  val s1_index = RegEnable(s0_index, s0_valid)
  val s1_pc_hash = RegEnable(s0_pc_hash, s0_valid)
  val s1_vaddr = RegEnable(s0_vaddr, s0_valid)
  val s1_hit = RegEnable(s0_hit, s0_valid)
  val s1_alloc = s1_valid && !s1_hit
  val s1_update = s1_valid && s1_hit
  val s1_stride = array(s1_index).stride
  val s1_new_stride = WireInit(0.U(STRIDE_BITS.W))
  val s1_can_send_pf = WireInit(false.B)
  // Refuse a request whose hashed PC equals the one being written in s1: its
  // s0 lookup would not yet see the entry s1 is allocating or training.
  s0_can_accept := !(s1_valid && s1_pc_hash === s0_pc_hash)

  val always_update = Constantin.createRecord(s"always_update${p(XSCoreParamsKey).HartId}", initValue = ALWAYS_UPDATE_PRE_VADDR)

  when(s1_alloc) {
    valids(s1_index) := true.B
    array(s1_index).alloc(
      vaddr = s1_vaddr,
      alloc_hash_pc = s1_pc_hash
    )
  }.elsewhen(s1_update) {
    val res = array(s1_index).update(s1_vaddr, always_update)
    s1_can_send_pf := res._1
    s1_new_stride := res._2
  }

  // Run-time tunable look-ahead: the stride is shifted left by these amounts
  // to obtain the L1 and L2 prefetch distances.
  val l1_stride_ratio_const = Constantin.createRecord(s"l1_stride_ratio${p(XSCoreParamsKey).HartId}", initValue = 2)
  val l1_stride_ratio = l1_stride_ratio_const(3, 0)
  val l2_stride_ratio_const = Constantin.createRecord(s"l2_stride_ratio${p(XSCoreParamsKey).HartId}", initValue = 5)
  val l2_stride_ratio = l2_stride_ratio_const(3, 0)
  // s2: calculate L1 & L2 pf addr
  val s2_valid = GatedValidRegNext(s1_valid && s1_can_send_pf)
  val s2_vaddr = RegEnable(s1_vaddr, s1_valid && s1_can_send_pf)
  val s2_stride = RegEnable(s1_stride, s1_valid && s1_can_send_pf)
  val s2_l1_depth = s2_stride << l1_stride_ratio
  val s2_l1_pf_vaddr = (s2_vaddr + s2_l1_depth)(VAddrBits - 1, 0)
  val s2_l2_depth = s2_stride << l2_stride_ratio
  val s2_l2_pf_vaddr = (s2_vaddr + s2_l2_depth)(VAddrBits - 1, 0)
  val s2_l1_pf_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l1_pf_vaddr,
    width = STRIDE_WIDTH_BLOCKS,
    decr_mode = false.B,
    sink = SINK_L1,
    source = L1_HW_PREFETCH_STRIDE,
    // TODO: add stride debug db, not useful for now
    t_pc = 0xdeadbeefL.U,
    t_va = 0xdeadbeefL.U
    )
  val s2_l2_pf_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l2_pf_vaddr,
    width = STRIDE_WIDTH_BLOCKS,
    decr_mode = false.B,
    sink = SINK_L2,
    source = L1_HW_PREFETCH_STRIDE,
    // TODO: add stride debug db, not useful for now
    t_pc = 0xdeadbeefL.U,
    t_va = 0xdeadbeefL.U
    )

  // s3: send l1 pf out
  val s3_valid = if (LOOK_UP_STREAM) GatedValidRegNext(s2_valid) && !io.stream_lookup_resp else GatedValidRegNext(s2_valid)
  val s3_l1_pf_req_bits = RegEnable(s2_l1_pf_req_bits, s2_valid)
  val s3_l2_pf_req_bits = RegEnable(s2_l2_pf_req_bits, s2_valid)

  // s4: send l2 pf out
  val s4_valid = GatedValidRegNext(s3_valid)
  val s4_l2_pf_req_bits = RegEnable(s3_l2_pf_req_bits, s3_valid)

  io.l1_prefetch_req.valid := s3_valid
  io.l1_prefetch_req.bits := s3_l1_pf_req_bits
  io.l2_l3_prefetch_req.valid := s4_valid
  io.l2_l3_prefetch_req.bits := s4_l2_pf_req_bits

  XSPerfAccumulate("pf_valid", PopCount(Seq(io.l1_prefetch_req.valid, io.l2_l3_prefetch_req.valid)))
  XSPerfAccumulate("l1_pf_valid", s3_valid)
  XSPerfAccumulate("l2_pf_valid", s4_valid)
  XSPerfAccumulate("detect_stream", io.stream_lookup_resp)
  // Only valid entries are counted: the payload registers have no reset and
  // keep stale contents after a flush, so an ungated count would include
  // entries that are not actually being tracked.
  val high_conf_vec = VecInit(array zip valids map { case (e, v) => v && e.confidence === MAX_CONF.U })
  XSPerfHistogram("high_conf_num", PopCount(high_conf_vec).asUInt, true.B, 0, STRIDE_ENTRY_NUM, 1)
  for(i <- 0 until STRIDE_ENTRY_NUM) {
    XSPerfAccumulate(s"entry_${i}_update", i.U === s1_index && s1_update)
    for(j <- 0 until 4) {
      // a saturated entry observed a different, small (0..3) stride
      XSPerfAccumulate(s"entry_${i}_disturb_${j}", i.U === s1_index && s1_update &&
                                                   j.U === s1_new_stride &&
                                                   array(s1_index).confidence === MAX_CONF.U &&
                                                   array(s1_index).stride =/= s1_new_stride
      )
    }
  }

  // Flush is registered once and shared by all entries (previously one
  // identical register was created per entry). This block must stay after the
  // s1 allocation so that, by last-connect semantics, a flush wins over an
  // allocation happening in the same cycle.
  val flush_r = GatedValidRegNext(io.flush)
  when(flush_r) {
    (0 until STRIDE_ENTRY_NUM).foreach(reset_array)
  }
}
257