xref: /XiangShan/src/main/scala/xiangshan/mem/prefetch/L1StreamPrefetcher.scala (revision 99ce5576f0ecce1b5045b7bc0dbbb2debd934fbb)
10d32f713Shappy-lxpackage xiangshan.mem.prefetch
20d32f713Shappy-lx
38891a219SYinan Xuimport org.chipsalliance.cde.config.Parameters
40d32f713Shappy-lximport chisel3._
50d32f713Shappy-lximport chisel3.util._
60d32f713Shappy-lximport utils._
70d32f713Shappy-lximport utility._
89e12e8edScz4eimport xiangshan._
99e12e8edScz4eimport xiangshan.mem.L1PrefetchReq
10*99ce5576Scz4eimport xiangshan.mem.Bundles.LsPrefetchTrainBundle
110d32f713Shappy-lximport xiangshan.mem.trace._
120d32f713Shappy-lximport xiangshan.mem.L1PrefetchSource
139e12e8edScz4eimport xiangshan.cache.HasDCacheParameters
149e12e8edScz4eimport xiangshan.cache.mmu._
150d32f713Shappy-lx
/** Elaboration-time constants shared by the stream prefetcher bundles and modules. */
trait HasStreamPrefetchHelper extends HasL1PrefetchHelper {
  // number of dcache blocks covered by `bytes`
  private def toCacheBlocks(bytes: Int) = bytes / dcacheParameters.blockBytes

  // ---- capacity ----
  val STREAM_FILTER_SIZE = 4
  // number of bit-vector entries kept by StreamBitVectorArray
  val BIT_VEC_ARRAY_SIZE = 16
  // an entry turns active once its `cnt` reaches this value (see StreamBitVectorBundle.update)
  val ACTIVE_THRESHOLD = BIT_VEC_WITDH - 4
  // value of `decr_mode` given to an entry on reset, and on alloc when decr mode is disabled
  val INIT_DEC_MODE = false

  // bit_vector [StreamBitVectorBundle]:
  // `X`: valid; `.`: invalid; `H`: hit
  // [X X X X X X X X X . . H . X X X]                                                         [. . X X X X . . . . . . . . . .]
  //                    hit in 12th slot & active           --------------------->             prefetch bit_vector [StreamPrefetchReqBundle]
  //                        |  <---------------------------- depth ---------------------------->
  //                                                                                           | <-- width -- >
  val DEPTH_BYTES = 1024
  val DEPTH_CACHE_BLOCKS = toCacheBlocks(DEPTH_BYTES)
  val WIDTH_BYTES = 128
  val WIDTH_CACHE_BLOCKS = toCacheBlocks(WIDTH_BYTES)

  // ---- L2 prefetch: deeper and twice as wide as L1 ----
  val L2_DEPTH_RATIO = 2
  val L2_WIDTH_BYTES = WIDTH_BYTES * 2
  val L2_WIDTH_CACHE_BLOCKS = toCacheBlocks(L2_WIDTH_BYTES)

  // ---- L3 prefetch: four times as wide as L1 ----
  val L3_DEPTH_RATIO = 3
  val L3_WIDTH_BYTES = WIDTH_BYTES * 2 * 2
  val L3_WIDTH_CACHE_BLOCKS = toCacheBlocks(L3_WIDTH_BYTES)

  // width of the runtime depth input: block-count bits plus extra look-ahead bits
  val DEPTH_LOOKAHEAD = 6
  val DEPTH_BITS = log2Up(DEPTH_CACHE_BLOCKS) + DEPTH_LOOKAHEAD

  // ---- feature switches ----
  val ENABLE_DECR_MODE = false
  val ENABLE_STRICT_ACTIVE_DETECTION = true

  // ---- constraints ----
  // the prefetch depth must be a positive whole number of regions
  require(
    DEPTH_BYTES >= REGION_SIZE &&
    DEPTH_BYTES % REGION_SIZE == 0 &&
    DEPTH_BYTES / REGION_SIZE > 0
  )
  require(VADDR_HASH_WIDTH * 3 + BLK_ADDR_RAW_WIDTH <= REGION_TAG_BITS)
  // a prefetch must cover at least one cache block
  require(WIDTH_BYTES >= dcacheParameters.blockBytes)
}
530d32f713Shappy-lx
/**
  * One entry of the stream bit-vector array: a region tag, a per-slot access bit
  * vector, the number of distinct slots seen so far and the entry's stream state.
  */
class StreamBitVectorBundle(implicit p: Parameters) extends XSBundle with HasStreamPrefetchHelper {
  val tag = UInt(REGION_TAG_BITS.W)
  val bit_vec = UInt(BIT_VEC_WITDH.W)
  val active = Bool()
  // cnt can be optimized
  val cnt = UInt((log2Up(BIT_VEC_WITDH) + 1).W)
  val decr_mode = Bool()

  // debug usage
  val trigger_full_va = UInt(VAddrBits.W)

  /** Puts the entry into its idle state; `index` becomes the tag. */
  def reset(index: Int) = {
    active := false.B
    decr_mode := INIT_DEC_MODE.B
    cnt := 0.U
    bit_vec := 0.U
    tag := index.U
    trigger_full_va := 0xdeadbeefL.U
  }

  /** True when both valid flags are set and `new_tag` hashes to the same value as this entry's tag. */
  def tag_match(valid1: Bool, valid2: Bool, new_tag: UInt): Bool = {
    val same_hashed_region = region_hash_tag(tag) === region_hash_tag(new_tag)
    valid1 && valid2 && same_hashed_region
  }

  /**
    * Overwrites the entry for a newly tracked region.
    *
    * @param alloc_tag        region tag of the new entry
    * @param alloc_bit_vec    one-hot vector marking the first accessed slot
    * @param alloc_active     initial value of `active`
    * @param alloc_decr_mode  initial direction; only honoured when ENABLE_DECR_MODE is set
    * @param alloc_full_vaddr full virtual address that triggered the allocation (debug)
    */
  def alloc(alloc_tag: UInt, alloc_bit_vec: UInt, alloc_active: Bool, alloc_decr_mode: Bool, alloc_full_vaddr: UInt) = {
    tag := alloc_tag
    bit_vec := alloc_bit_vec
    cnt := 1.U
    active := alloc_active
    // direction is chosen at elaboration time: either the caller's hint or the fixed default
    decr_mode := (if (ENABLE_DECR_MODE) alloc_decr_mode else INIT_DEC_MODE.B)
    trigger_full_va := alloc_full_vaddr

    assert(PopCount(alloc_bit_vec) === 1.U, "alloc vector should be one hot")
  }

  /**
    * Records one more access in an already allocated entry.
    *
    * @param update_bit_vec one-hot vector marking the accessed slot
    * @param update_active  forces the entry active regardless of `cnt`
    */
  def update(update_bit_vec: UInt, update_active: Bool) = {
    // only a slot that was still 0 counts as a new access
    val slot_is_new = (bit_vec & update_bit_vec) === 0.U
    val next_cnt = Mux(slot_is_new, cnt + 1.U, cnt)

    cnt := next_cnt
    bit_vec := bit_vec | update_bit_vec
    // `active` is sticky: it is only ever set here, never cleared
    when(update_active || next_cnt >= ACTIVE_THRESHOLD.U) {
      active := true.B
    }

    assert(PopCount(update_bit_vec) === 1.U, "update vector should be one hot")
    assert(cnt <= BIT_VEC_WITDH.U, "cnt should always less than bit vector size")
  }
}
1120d32f713Shappy-lx
/**
  * Prefetch request produced by the stream prefetcher: a region tag, a bit vector
  * of blocks inside that region, and the sink / source of the request.
  */
class StreamPrefetchReqBundle(implicit p: Parameters) extends XSBundle with HasStreamPrefetchHelper {
  val region = UInt(REGION_TAG_BITS.W)
  val bit_vec = UInt(BIT_VEC_WITDH.W)
  val sink = UInt(SINK_BITS.W)
  val source = new L1PrefetchSource()
  // debug usage
  val trigger_pc = UInt(VAddrBits.W)
  val trigger_va = UInt(VAddrBits.W)

  /**
    * Builds a request wire, aligning the prefetch vaddr and width to the region.
    *
    * The bit vector starts at the block addressed by `vaddr` and extends `width`
    * blocks upwards (incr mode) or downwards (decr mode).
    *
    * @param valid     qualifies the bit-vector assertions only; it is not stored in the result
    * @param vaddr     first address to prefetch
    * @param width     number of consecutive blocks requested (elaboration-time constant, > 0)
    * @param decr_mode true when the stream walks towards lower addresses
    * @param sink      target of the prefetch; must not exceed SINK_L3
    * @param source    value written into `source.value`
    * @param t_pc      pc of the triggering access (debug)
    * @param t_va      vaddr of the triggering access (debug)
    * @return a freshly created wire holding the request
    */
  def getStreamPrefetchReqBundle(valid: Bool, vaddr: UInt, width: Int, decr_mode: Bool, sink: UInt, source: UInt, t_pc: UInt, t_va: UInt): StreamPrefetchReqBundle = {
    // `reduce` below would otherwise fail with an unhelpful message on an empty range
    require(width > 0, "prefetch width should cover at least one cache block")

    val res = Wire(new StreamPrefetchReqBundle)
    res.region := get_region_tag(vaddr)
    res.sink := sink
    res.source.value := source

    res.trigger_pc := t_pc
    res.trigger_va := t_va

    val region_bits = get_region_bits(vaddr)
    val region_bit_vec = UIntToOH(region_bits)
    // OR together `width` shifted copies of the one-hot start position
    res.bit_vec := Mux(
      decr_mode,
      (0 until width).map{ case i => region_bit_vec >> i}.reduce(_ | _),
      (0 until width).map{ case i => region_bit_vec << i}.reduce(_ | _)
    )

    assert(!valid || PopCount(res.bit_vec) <= width.U, "actual prefetch block number should less than or equals to WIDTH_CACHE_BLOCKS")
    assert(!valid || PopCount(res.bit_vec) >= 1.U, "at least one block should be included")
    assert(sink <= SINK_L3, "invalid sink")
    // per-bit sanity check: nothing may be set behind the start block, and the start block itself must be set
    for(i <- 0 until BIT_VEC_WITDH) {
      when(decr_mode) {
        when(i.U > region_bits) {
          assert(!valid || res.bit_vec(i) === 0.U, s"res.bit_vec(${i}) is not zero in decr_mode, prefetch vector is wrong!")
        }.elsewhen(i.U === region_bits) {
          assert(!valid || res.bit_vec(i) === 1.U, s"res.bit_vec(${i}) is zero in decr_mode, prefetch vector is wrong!")
        }
      }.otherwise {
        when(i.U < region_bits) {
          assert(!valid || res.bit_vec(i) === 0.U, s"res.bit_vec(${i}) is not zero in incr_mode, prefetch vector is wrong!")
        }.elsewhen(i.U === region_bits) {
          // this branch checks incr mode, so the message must say so
          assert(!valid || res.bit_vec(i) === 1.U, s"res.bit_vec(${i}) is zero in incr_mode, prefetch vector is wrong!")
        }
      }
    }

    res
  }
}
1620d32f713Shappy-lx
/**
  * Table of StreamBitVectorBundle entries, trained by `train_req`.
  *
  * Training pipeline:
  *   s0 - compute the region tag and match it (and its +1 / -1 neighbours) against all entries
  *   s1 - allocate a new entry or update the matching one
  *   s2 - if the entry is active, build the L1 / L2 / L3 prefetch requests
  *   s3 - send the L1 request, s4 - send the L2 request, s5 - send the L3 request
  *
  * A separate 3-stage lookup path answers `stream_lookup_req` with whether the
  * looked-up region hits an active entry.
  */
class StreamBitVectorArray(implicit p: Parameters) extends XSModule with HasStreamPrefetchHelper {
  val io = IO(new XSBundle {
    val enable = Input(Bool())
    // TODO: flush all entry when process changing happens, or disable stream prefetch for a while
    val flush = Input(Bool())
    val dynamic_depth = Input(UInt(DEPTH_BITS.W))
    val train_req = Flipped(DecoupledIO(new PrefetchReqBundle))
    val l1_prefetch_req = ValidIO(new StreamPrefetchReqBundle)
    val l2_l3_prefetch_req = ValidIO(new StreamPrefetchReqBundle)

    // Stride send lookup req here
    val stream_lookup_req  = Flipped(ValidIO(new PrefetchReqBundle))
    val stream_lookup_resp = Output(Bool())
  })

  val array = Reg(Vec(BIT_VEC_ARRAY_SIZE, new StreamBitVectorBundle))
  val valids = RegInit(VecInit(Seq.fill(BIT_VEC_ARRAY_SIZE)(false.B)))

  def reset_array(i: Int): Unit = {
    valids(i) := false.B
    // only the control signal (valid bit) needs to be reset, which is friendlier for area
    // array(i).reset(i)
  }

  val replacement = ReplacementPolicy.fromString("plru", BIT_VEC_ARRAY_SIZE)

  // s0: generate region tag, parallel match
  val s0_can_accept = Wire(Bool())
  val s0_valid = io.train_req.fire
  val s0_pc    = io.train_req.bits.pc
  val s0_vaddr = io.train_req.bits.vaddr
  val s0_miss  = io.train_req.bits.miss
  val s0_pfHit = io.train_req.bits.pfHitStream
  val s0_region_bits = get_region_bits(s0_vaddr)
  val s0_region_tag = get_region_tag(s0_vaddr)
  val s0_region_tag_plus_one = get_region_tag(s0_vaddr) + 1.U
  val s0_region_tag_minus_one = get_region_tag(s0_vaddr) - 1.U
  val s0_region_tag_match_vec = array zip valids map { case (e, v) => e.tag_match(v, s0_valid, s0_region_tag) }
  val s0_region_tag_plus_one_match_vec = array zip valids map { case (e, v) => e.tag_match(v, s0_valid, s0_region_tag_plus_one) }
  val s0_region_tag_minus_one_match_vec = array zip valids map { case (e, v) => e.tag_match(v, s0_valid, s0_region_tag_minus_one) }
  val s0_hit = Cat(s0_region_tag_match_vec).orR
  val s0_plus_one_hit = Cat(s0_region_tag_plus_one_match_vec).orR
  val s0_minus_one_hit = Cat(s0_region_tag_minus_one_match_vec).orR
  val s0_hit_vec = VecInit(s0_region_tag_match_vec).asUInt
  // on a miss the replacement victim becomes the entry to allocate
  val s0_index = Mux(s0_hit, OHToUInt(s0_hit_vec), replacement.way)
  val s0_plus_one_index = OHToUInt(VecInit(s0_region_tag_plus_one_match_vec).asUInt)
  val s0_minus_one_index = OHToUInt(VecInit(s0_region_tag_minus_one_match_vec).asUInt)
  io.train_req.ready := s0_can_accept

  when(s0_valid) {
    replacement.access(s0_index)
  }

  val stream_pf_train_debug_table = ChiselDB.createTable("StreamTrainTraceTable" + p(XSCoreParamsKey).HartId.toString, new StreamTrainTraceEntry, basicDB = false)

  val spf_log_enable = s0_valid
  val spf_log_data = Wire(new StreamTrainTraceEntry)

  // WARNING: the type here only indicates trigger by stream, not saying it's sink
  spf_log_data.Type := MemReqSource.Prefetch2L2Stream.id.U
  spf_log_data.OldAddr := Mux(
    !s0_hit,
    s0_vaddr,
    array(s0_index).trigger_full_va
  )
  spf_log_data.CurAddr := s0_vaddr
  spf_log_data.Offset := DontCare
  spf_log_data.Score := DontCare
  spf_log_data.Miss := io.train_req.bits.miss

  stream_pf_train_debug_table.log(
    data = spf_log_data,
    en = spf_log_enable,
    site = "StreamTrainTraceTable",
    clock = clock,
    reset = reset
  )

  assert(!s0_valid || PopCount(VecInit(s0_region_tag_match_vec)) <= 1.U, "req region should match no more than 1 entry")
  assert(!s0_valid || PopCount(VecInit(s0_region_tag_plus_one_match_vec)) <= 1.U, "req region plus 1 should match no more than 1 entry")
  assert(!s0_valid || PopCount(VecInit(s0_region_tag_minus_one_match_vec)) <= 1.U, "req region minus 1 should match no more than 1 entry")
  assert(!s0_valid || !(s0_hit && s0_plus_one_hit && (s0_index === s0_plus_one_index)), "region and region plus 1 index match failed")
  assert(!s0_valid || !(s0_hit && s0_minus_one_hit && (s0_index === s0_minus_one_index)), "region and region minus 1 index match failed")
  assert(!s0_valid || !(s0_plus_one_hit && s0_minus_one_hit && (s0_minus_one_index === s0_plus_one_index)), "region plus 1 and region minus 1 index match failed")
  assert(!(s0_valid && RegNext(s0_valid) && !s0_hit && !RegEnable(s0_hit, s0_valid) && replacement.way === RegEnable(replacement.way, s0_valid)), "replacement error")

  XSPerfAccumulate("s0_valid_train_req", s0_valid)
  val s0_hit_pattern_vec = Seq(s0_hit, s0_plus_one_hit, s0_minus_one_hit)
  for(i <- 0 until (1 << s0_hit_pattern_vec.size)) {
    XSPerfAccumulate(s"s0_hit_pattern_${toBinary(i)}", (VecInit(s0_hit_pattern_vec).asUInt === i.U) && s0_valid)
  }
  XSPerfAccumulate("s0_replace_the_neighbor", s0_valid && !s0_hit && ((s0_plus_one_hit && (s0_index === s0_plus_one_index)) || (s0_minus_one_hit && (s0_index === s0_minus_one_index))))
  XSPerfAccumulate("s0_req_valid", io.train_req.valid)
  XSPerfAccumulate("s0_req_cannot_accept", io.train_req.valid && !io.train_req.ready)

  // runtime-tunable shift amounts applied to the depth for the L2 / L3 prefetch addresses
  val ratio_const = Constantin.createRecord(s"l2DepthRatio${p(XSCoreParamsKey).HartId}", initValue = L2_DEPTH_RATIO)
  val ratio = ratio_const(3, 0)

  val l3_ratio_const = Constantin.createRecord(s"l3DepthRatio${p(XSCoreParamsKey).HartId}", initValue = L3_DEPTH_RATIO)
  val l3_ratio = l3_ratio_const(3, 0)

  // s1: alloc or update
  val s1_valid = GatedValidRegNext(s0_valid)
  val s1_index = RegEnable(s0_index, s0_valid)
  val s1_pc    = RegEnable(s0_pc, s0_valid)
  val s1_vaddr = RegEnable(s0_vaddr, s0_valid)
  val s1_miss  = RegEnable(s0_miss, s0_valid)
  val s1_pfHit = RegEnable(s0_pfHit, s0_valid)
  val s1_plus_one_index = RegEnable(s0_plus_one_index, s0_valid)
  val s1_minus_one_index = RegEnable(s0_minus_one_index, s0_valid)
  val s1_hit = RegEnable(s0_hit, s0_valid)
  // a neighbour only counts when it is active (and, in strict mode, has reached the threshold)
  val s1_plus_one_hit = if(ENABLE_STRICT_ACTIVE_DETECTION)
                            RegEnable(s0_plus_one_hit, s0_valid) && array(s1_plus_one_index).active && (array(s1_plus_one_index).cnt >= ACTIVE_THRESHOLD.U)
                        else
                            RegEnable(s0_plus_one_hit, s0_valid) && array(s1_plus_one_index).active
  val s1_minus_one_hit = if(ENABLE_STRICT_ACTIVE_DETECTION)
                            RegEnable(s0_minus_one_hit, s0_valid) && array(s1_minus_one_index).active && (array(s1_minus_one_index).cnt >= ACTIVE_THRESHOLD.U)
                        else
                            RegEnable(s0_minus_one_hit, s0_valid) && array(s1_minus_one_index).active
  // Raw (unqualified) plus-one tag hit, used as the decr-mode hint on allocation.
  // It must be registered here rather than inside `when(s1_alloc)`: a RegEnable created
  // inside a `when` scope only updates while that scope's condition also holds.
  val s1_alloc_decr_mode = RegEnable(s0_plus_one_hit, s0_valid)
  val s1_region_tag = RegEnable(s0_region_tag, s0_valid)
  val s1_region_bits = RegEnable(s0_region_bits, s0_valid)
  val s1_alloc = s1_valid && !s1_hit
  val s1_update = s1_valid && s1_hit
  val s1_pf_l1_incr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) + io.dynamic_depth, 0.U(BLOCK_OFFSET.W))
  val s1_pf_l1_decr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) - io.dynamic_depth, 0.U(BLOCK_OFFSET.W))
  val s1_pf_l2_incr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) + (io.dynamic_depth << ratio), 0.U(BLOCK_OFFSET.W))
  val s1_pf_l2_decr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) - (io.dynamic_depth << ratio), 0.U(BLOCK_OFFSET.W))
  val s1_pf_l3_incr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) + (io.dynamic_depth << l3_ratio), 0.U(BLOCK_OFFSET.W))
  val s1_pf_l3_decr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) - (io.dynamic_depth << l3_ratio), 0.U(BLOCK_OFFSET.W))
  // TODO: remove this
  val strict_trigger_const = Constantin.createRecord(s"StreamStrictTrigger_${p(XSCoreParamsKey).HartId}", initValue = 1)
  // If use strict triggering mode, the stream prefetcher will only trigger prefetching
  // under **cache miss or prefetch hit stream**, but will still perform training on the entire memory access trace.
  val s1_can_trigger = Mux(strict_trigger_const.orR, s1_miss || s1_pfHit, true.B)
  // on an update, only a slot that was not set before may trigger a prefetch
  val s1_can_send_pf = Mux(s1_update, !((array(s1_index).bit_vec & UIntToOH(s1_region_bits)).orR), true.B) && s1_can_trigger
  // stall a request whose hashed tag equals the one currently in s1
  s0_can_accept := !(s1_valid && (region_hash_tag(s1_region_tag) === region_hash_tag(s0_region_tag)))

  when(s1_alloc) {
    // alloc a new entry
    valids(s1_index) := true.B
    array(s1_index).alloc(
      alloc_tag = s1_region_tag,
      alloc_bit_vec = UIntToOH(s1_region_bits),
      alloc_active = s1_plus_one_hit || s1_minus_one_hit,
      alloc_decr_mode = s1_alloc_decr_mode,
      alloc_full_vaddr = s1_vaddr
      )

  }.elsewhen(s1_update) {
    // update a existing entry
    assert(array(s1_index).cnt =/= 0.U || valids(s1_index), "entry should have been allocated before")
    array(s1_index).update(
      update_bit_vec = UIntToOH(s1_region_bits),
      update_active = s1_plus_one_hit || s1_minus_one_hit)
  }

  XSPerfAccumulate("s1_alloc", s1_alloc)
  XSPerfAccumulate("s1_update", s1_update)
  XSPerfAccumulate("s1_active_plus_one_hit", s1_valid && s1_plus_one_hit)
  XSPerfAccumulate("s1_active_minus_one_hit", s1_valid && s1_minus_one_hit)

  // s2: trigger prefetch if hit active bit vector, compute meta of prefetch req
  val s2_valid = GatedValidRegNext(s1_valid)
  val s2_index = RegEnable(s1_index, s1_valid)
  val s2_pc    = RegEnable(s1_pc, s1_valid)
  val s2_vaddr = RegEnable(s1_vaddr, s1_valid)
  val s2_region_bits = RegEnable(s1_region_bits, s1_valid)
  val s2_region_tag = RegEnable(s1_region_tag, s1_valid)
  val s2_pf_l1_incr_vaddr = RegEnable(s1_pf_l1_incr_vaddr, s1_valid)
  val s2_pf_l1_decr_vaddr = RegEnable(s1_pf_l1_decr_vaddr, s1_valid)
  val s2_pf_l2_incr_vaddr = RegEnable(s1_pf_l2_incr_vaddr, s1_valid)
  val s2_pf_l2_decr_vaddr = RegEnable(s1_pf_l2_decr_vaddr, s1_valid)
  val s2_pf_l3_incr_vaddr = RegEnable(s1_pf_l3_incr_vaddr, s1_valid)
  val s2_pf_l3_decr_vaddr = RegEnable(s1_pf_l3_decr_vaddr, s1_valid)
  val s2_can_send_pf = RegEnable(s1_can_send_pf, s1_valid)
  // read after the s1 write, so these reflect the entry as allocated / updated in s1
  val s2_active = array(s2_index).active
  val s2_decr_mode = array(s2_index).decr_mode
  val s2_l1_vaddr = Mux(s2_decr_mode, s2_pf_l1_decr_vaddr, s2_pf_l1_incr_vaddr)
  val s2_l2_vaddr = Mux(s2_decr_mode, s2_pf_l2_decr_vaddr, s2_pf_l2_incr_vaddr)
  val s2_l3_vaddr = Mux(s2_decr_mode, s2_pf_l3_decr_vaddr, s2_pf_l3_incr_vaddr)
  val s2_will_send_pf = s2_valid && s2_active && s2_can_send_pf
  val s2_pf_req_valid = s2_will_send_pf && io.enable
  val s2_pf_l1_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l1_vaddr,
    width = WIDTH_CACHE_BLOCKS,
    decr_mode = s2_decr_mode,
    sink = SINK_L1,
    source = L1_HW_PREFETCH_STREAM,
    t_pc = s2_pc,
    t_va = s2_vaddr
    )
  val s2_pf_l2_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l2_vaddr,
    width = L2_WIDTH_CACHE_BLOCKS,
    decr_mode = s2_decr_mode,
    sink = SINK_L2,
    source = L1_HW_PREFETCH_STREAM,
    t_pc = s2_pc,
    t_va = s2_vaddr
    )
  val s2_pf_l3_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l3_vaddr,
    width = L3_WIDTH_CACHE_BLOCKS,
    decr_mode = s2_decr_mode,
    sink = SINK_L3,
    source = L1_HW_PREFETCH_STREAM,
    t_pc = s2_pc,
    t_va = s2_vaddr
    )

  XSPerfAccumulate("s2_valid", s2_valid)
  XSPerfAccumulate("s2_will_not_send_pf", s2_valid && !s2_will_send_pf)
  XSPerfAccumulate("s2_will_send_decr_pf", s2_valid && s2_will_send_pf && s2_decr_mode)
  XSPerfAccumulate("s2_will_send_incr_pf", s2_valid && s2_will_send_pf && !s2_decr_mode)

  // s3: send the l1 prefetch req out
  val s3_pf_l1_valid = GatedValidRegNext(s2_pf_req_valid)
  val s3_pf_l1_bits = RegEnable(s2_pf_l1_req_bits, s2_pf_req_valid)
  val s3_pf_l2_valid = GatedValidRegNext(s2_pf_req_valid)
  val s3_pf_l2_bits = RegEnable(s2_pf_l2_req_bits, s2_pf_req_valid)
  val s3_pf_l3_bits = RegEnable(s2_pf_l3_req_bits, s2_pf_req_valid)

  XSPerfAccumulate("s3_pf_sent", s3_pf_l1_valid)

  // s4: send the l2 prefetch req out
  val s4_pf_l2_valid = GatedValidRegNext(s3_pf_l2_valid)
  val s4_pf_l2_bits = RegEnable(s3_pf_l2_bits, s3_pf_l2_valid)
  val s4_pf_l3_bits = RegEnable(s3_pf_l3_bits, s3_pf_l2_valid)

  val enable_l3_pf = Constantin.createRecord(s"enableL3StreamPrefetch${p(XSCoreParamsKey).HartId}", initValue = false)
  // s5: send the l3 prefetch req out
  val s5_pf_l3_valid = GatedValidRegNext(s4_pf_l2_valid) && enable_l3_pf
  val s5_pf_l3_bits = RegEnable(s4_pf_l3_bits, s4_pf_l2_valid)

  io.l1_prefetch_req.valid := s3_pf_l1_valid
  io.l1_prefetch_req.bits := s3_pf_l1_bits
  // L2 and L3 share one port; an L2 request takes priority over an L3 request in the same cycle
  io.l2_l3_prefetch_req.valid := s4_pf_l2_valid || s5_pf_l3_valid
  io.l2_l3_prefetch_req.bits := Mux(s4_pf_l2_valid, s4_pf_l2_bits, s5_pf_l3_bits)

  XSPerfAccumulate("s4_pf_sent", s4_pf_l2_valid)
  XSPerfAccumulate("s5_pf_sent", !s4_pf_l2_valid && s5_pf_l3_valid)
  XSPerfAccumulate("pf_sent", PopCount(Seq(io.l1_prefetch_req.valid, io.l2_l3_prefetch_req.valid)))

  // Stride lookup starts here
  // S0: Stride send req
  val s0_lookup_valid = io.stream_lookup_req.valid
  val s0_lookup_vaddr = io.stream_lookup_req.bits.vaddr
  val s0_lookup_tag = get_region_tag(s0_lookup_vaddr)
  // S1: match
  val s1_lookup_valid = GatedValidRegNext(s0_lookup_valid)
  val s1_lookup_tag = RegEnable(s0_lookup_tag, s0_lookup_valid)
  val s1_lookup_tag_match_vec = array zip valids map { case (e, v) => e.tag_match(v, s1_lookup_valid, s1_lookup_tag) }
  val s1_lookup_hit = VecInit(s1_lookup_tag_match_vec).asUInt.orR
  val s1_lookup_index = OHToUInt(VecInit(s1_lookup_tag_match_vec))
  // S2: read active out
  val s2_lookup_valid = GatedValidRegNext(s1_lookup_valid)
  val s2_lookup_hit = RegEnable(s1_lookup_hit, s1_lookup_valid)
  val s2_lookup_index = RegEnable(s1_lookup_index, s1_lookup_valid)
  val s2_lookup_active = array(s2_lookup_index).active
  // S3: send back to Stride
  val s3_lookup_valid = GatedValidRegNext(s2_lookup_valid)
  val s3_lookup_hit = RegEnable(s2_lookup_hit, s2_lookup_valid)
  val s3_lookup_active = RegEnable(s2_lookup_active, s2_lookup_valid)
  io.stream_lookup_resp := s3_lookup_valid && s3_lookup_hit && s3_lookup_active

  // reset meta to avoid multi-hit problem
  // A single delayed flush register drives every entry. This must stay after the s1
  // alloc logic so that a flush overrides a same-cycle `valids(s1_index) := true.B`.
  val flush_delayed = GatedValidRegNext(io.flush)
  for(i <- 0 until BIT_VEC_ARRAY_SIZE) {
    when(flush_delayed) {
      reset_array(i)
    }
  }

  XSPerfHistogram("bit_vector_active", PopCount(VecInit(array.map(_.active)).asUInt), true.B, 0, BIT_VEC_ARRAY_SIZE, 1)
  XSPerfHistogram("bit_vector_decr_mode", PopCount(VecInit(array.map(_.decr_mode)).asUInt), true.B, 0, BIT_VEC_ARRAY_SIZE, 1)
  XSPerfAccumulate("hash_conflict", s0_valid && s2_valid && (s0_region_tag =/= s2_region_tag) && (region_hash_tag(s0_region_tag) === region_hash_tag(s2_region_tag)))
}
442