package xiangshan.mem.prefetch

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.mem.L1PrefetchReq
import xiangshan.mem.Bundles.LsPrefetchTrainBundle
import xiangshan.mem.trace._
import xiangshan.mem.L1PrefetchSource
import xiangshan.cache.HasDCacheParameters
import xiangshan.cache.mmu._

/**
  * Elaboration-time constants shared by the stream prefetcher bundles and module.
  *
  * Names such as `BIT_VEC_WITDH`, `REGION_SIZE`, `REGION_TAG_BITS`, `VADDR_HASH_WIDTH`,
  * `BLK_ADDR_RAW_WIDTH` and `dcacheParameters` are not defined here; they are expected
  * to come from `HasL1PrefetchHelper` or its parents.
  */
trait HasStreamPrefetchHelper extends HasL1PrefetchHelper {
  // capacity related
  val STREAM_FILTER_SIZE = 4
  val BIT_VEC_ARRAY_SIZE = 16
  val ACTIVE_THRESHOLD = BIT_VEC_WITDH - 4
  val INIT_DEC_MODE = false

  // bit_vector [StreamBitVectorBundle]:
  // `X`: valid; `.`: invalid; `H`: hit
  // [X X X X X X X X X . . H . X X X]                                    [. . X X X X . . . . . . . . . .]
  //         hit in 12th slot & active  --------------------->            prefetch bit_vector [StreamPrefetchReqBundle]
  //                        | <---------------------------- depth ---------------------------->
  //                                                                          | <-- width -- >
  val DEPTH_BYTES = 1024
  val DEPTH_CACHE_BLOCKS = DEPTH_BYTES / dcacheParameters.blockBytes
  val WIDTH_BYTES = 128
  val WIDTH_CACHE_BLOCKS = WIDTH_BYTES / dcacheParameters.blockBytes

  val L2_DEPTH_RATIO = 2
  val L2_WIDTH_BYTES = WIDTH_BYTES * 2
  val L2_WIDTH_CACHE_BLOCKS = L2_WIDTH_BYTES / dcacheParameters.blockBytes

  val L3_DEPTH_RATIO = 3
  val L3_WIDTH_BYTES = WIDTH_BYTES * 2 * 2
  val L3_WIDTH_CACHE_BLOCKS = L3_WIDTH_BYTES / dcacheParameters.blockBytes

  val DEPTH_LOOKAHEAD = 6
  val DEPTH_BITS = log2Up(DEPTH_CACHE_BLOCKS) + DEPTH_LOOKAHEAD

  val ENABLE_DECR_MODE = false
  val ENABLE_STRICT_ACTIVE_DETECTION = true

  // constraints
  require((DEPTH_BYTES >= REGION_SIZE) && ((DEPTH_BYTES % REGION_SIZE) == 0) && ((DEPTH_BYTES / REGION_SIZE) > 0))
  require(((VADDR_HASH_WIDTH * 3) + BLK_ADDR_RAW_WIDTH) <= REGION_TAG_BITS)
  require(WIDTH_BYTES >= dcacheParameters.blockBytes)
}

/**
  * One entry of the stream bit-vector table.
  *
  *  - `tag`       region tag of the tracked region
  *  - `bit_vec`   one bit per slot of the region; a set bit marks a slot already seen by training
  *  - `active`    set once enough slots were seen, or when an active neighbour region was found
  *  - `cnt`       number of distinct slots recorded in `bit_vec`
  *  - `decr_mode` direction flag; when set, prefetch addresses are generated downwards
  */
class StreamBitVectorBundle(implicit p: Parameters) extends XSBundle with HasStreamPrefetchHelper {
  val tag = UInt(REGION_TAG_BITS.W)
  val bit_vec = UInt(BIT_VEC_WITDH.W)
  val active = Bool()
  // cnt can be optimized
  val cnt = UInt((log2Up(BIT_VEC_WITDH) + 1).W)
  val decr_mode = Bool()

  // debug usage
  val trigger_full_va = UInt(VAddrBits.W)

  /** Put the entry into its idle state; `index` is written into `tag`. */
  def reset(index: Int) = {
    tag := index.U
    bit_vec := 0.U
    active := false.B
    cnt := 0.U
    decr_mode := INIT_DEC_MODE.B
    trigger_full_va := 0xdeadbeefL.U
  }

  /**
    * Compare this entry's tag against `new_tag` through `region_hash_tag`.
    *
    * @param valid1  first qualifier (callers pass the entry's valid bit)
    * @param valid2  second qualifier (callers pass the request's valid bit)
    * @param new_tag region tag of the incoming request
    * @return true only when both qualifiers are set and the hashed tags are equal
    */
  def tag_match(valid1: Bool, valid2: Bool, new_tag: UInt): Bool = {
    valid1 && valid2 && region_hash_tag(tag) === region_hash_tag(new_tag)
  }

  /**
    * Overwrite the entry for a newly tracked region.
    *
    * `cnt` starts at 1 because `alloc_bit_vec` is asserted to be one-hot.
    * `alloc_decr_mode` is only honoured when `ENABLE_DECR_MODE` is set; otherwise the
    * direction is forced to `INIT_DEC_MODE`.
    */
  def alloc(alloc_tag: UInt, alloc_bit_vec: UInt, alloc_active: Bool, alloc_decr_mode: Bool, alloc_full_vaddr: UInt) = {
    tag := alloc_tag
    bit_vec := alloc_bit_vec
    active := alloc_active
    cnt := 1.U
    trigger_full_va := alloc_full_vaddr
    if(ENABLE_DECR_MODE) {
      decr_mode := alloc_decr_mode
    }else {
      decr_mode := INIT_DEC_MODE.B
    }

    assert(PopCount(alloc_bit_vec) === 1.U, "alloc vector should be one hot")
  }

  /**
    * Record one more access to the tracked region.
    *
    * @param update_bit_vec one-hot vector selecting the accessed slot
    * @param update_active  forces `active` to be set regardless of `cnt`
    */
  def update(update_bit_vec: UInt, update_active: Bool) = {
    // if the slot is 0 before, increment cnt
    val cnt_en = !((bit_vec & update_bit_vec).orR)
    val cnt_next = Mux(cnt_en, cnt + 1.U, cnt)

    bit_vec := bit_vec | update_bit_vec
    cnt := cnt_next
    // `active` is sticky: nothing in update() ever clears it
    when(cnt_next >= ACTIVE_THRESHOLD.U) {
      active := true.B
    }
    when(update_active) {
      active := true.B
    }

    assert(PopCount(update_bit_vec) === 1.U, "update vector should be one hot")
    assert(cnt <= BIT_VEC_WITDH.U, "cnt should always less than bit vector size")
  }
}

/**
  * Prefetch request produced by the stream prefetcher: a region tag plus a bit vector
  * selecting slots inside that region, the destination (`sink`) and the prefetch source.
  */
class StreamPrefetchReqBundle(implicit p: Parameters) extends XSBundle with HasStreamPrefetchHelper {
  val region = UInt(REGION_TAG_BITS.W)
  val bit_vec = UInt(BIT_VEC_WITDH.W)
  val sink = UInt(SINK_BITS.W)
  val source = new L1PrefetchSource()
  // debug usage
  val trigger_pc = UInt(VAddrBits.W)
  val trigger_va = UInt(VAddrBits.W)

  /**
    * Build a request wire for the region containing `vaddr`.
    *
    * The bit vector is the OR of the one-hot position of `vaddr` inside its region
    * shifted by 0 until `width`: towards lower slots in `decr_mode`, towards higher
    * slots otherwise.
    *
    * @param valid     qualifies the sanity assertions on the generated bit vector
    * @param vaddr     virtual address the prefetch starts from
    * @param width     number of consecutive slots to request (elaboration-time constant)
    * @param decr_mode direction of the stream
    * @param sink      destination of the prefetch, asserted to be `<= SINK_L3`
    * @param source    value written into `source.value`
    * @param t_pc      debug only: pc of the training request
    * @param t_va      debug only: vaddr of the training request
    */
  // align prefetch vaddr and width to region
  def getStreamPrefetchReqBundle(valid: Bool, vaddr: UInt, width: Int, decr_mode: Bool, sink: UInt, source: UInt, t_pc: UInt, t_va: UInt): StreamPrefetchReqBundle = {
    val res = Wire(new StreamPrefetchReqBundle)
    res.region := get_region_tag(vaddr)
    res.sink := sink
    res.source.value := source

    res.trigger_pc := t_pc
    res.trigger_va := t_va

    val region_bits = get_region_bits(vaddr)
    val region_bit_vec = UIntToOH(region_bits)
    res.bit_vec := Mux(
      decr_mode,
      (0 until width).map{ case i => region_bit_vec >> i}.reduce(_ | _),
      (0 until width).map{ case i => region_bit_vec << i}.reduce(_ | _)
    )

    assert(!valid || PopCount(res.bit_vec) <= width.U, "actual prefetch block number should less than or equals to WIDTH_CACHE_BLOCKS")
    assert(!valid || PopCount(res.bit_vec) >= 1.U, "at least one block should be included")
    // here `sink` is the method parameter, which shadows the bundle field of the same name
    assert(sink <= SINK_L3, "invalid sink")
    for(i <- 0 until BIT_VEC_WITDH) {
      when(decr_mode) {
        when(i.U > region_bits) {
          assert(!valid || res.bit_vec(i) === 0.U, s"res.bit_vec(${i}) is not zero in decr_mode, prefetch vector is wrong!")
        }.elsewhen(i.U === region_bits) {
          assert(!valid || res.bit_vec(i) === 1.U, s"res.bit_vec(${i}) is zero in decr_mode, prefetch vector is wrong!")
        }
      }.otherwise {
        when(i.U < region_bits) {
          assert(!valid || res.bit_vec(i) === 0.U, s"res.bit_vec(${i}) is not zero in incr_mode, prefetch vector is wrong!")
        }.elsewhen(i.U === region_bits) {
          // this branch is the incrementing direction, so the message must say incr_mode
          assert(!valid || res.bit_vec(i) === 1.U, s"res.bit_vec(${i}) is zero in incr_mode, prefetch vector is wrong!")
        }
      }
    }

    res
  }
}

/**
  * Table of `BIT_VEC_ARRAY_SIZE` stream bit vectors with a pipelined train path and a
  * separate lookup path.
  *
  * Train path:
  *  - s0: compute the region tag and match it (and tag +1 / -1) against all valid entries
  *  - s1: allocate a new entry on miss or update the hit entry; compute prefetch addresses
  *  - s2: decide whether to prefetch and build the L1/L2/L3 request bundles
  *  - s3: drive the L1 request; s4: drive the L2 request; s5: drive the L3 request
  *
  * Lookup path (`stream_lookup_req`): three register stages after the request,
  * answering whether the looked-up region hits an active entry.
  */
class StreamBitVectorArray(implicit p: Parameters) extends XSModule with HasStreamPrefetchHelper {
  val io = IO(new XSBundle {
    val enable = Input(Bool())
    // TODO: flush all entry when process changing happens, or disable stream prefetch for a while
    val flush = Input(Bool())
    val dynamic_depth = Input(UInt(DEPTH_BITS.W))
    val train_req = Flipped(DecoupledIO(new PrefetchReqBundle))
    val l1_prefetch_req = ValidIO(new StreamPrefetchReqBundle)
    val l2_l3_prefetch_req = ValidIO(new StreamPrefetchReqBundle)

    // Stride send lookup req here
    val stream_lookup_req = Flipped(ValidIO(new PrefetchReqBundle))
    val stream_lookup_resp = Output(Bool())
  })

  // Entry payload has no reset value; only the separate `valids` vector is reset.
  val array = Reg(Vec(BIT_VEC_ARRAY_SIZE, new StreamBitVectorBundle))
  val valids = RegInit(VecInit(Seq.fill(BIT_VEC_ARRAY_SIZE)(false.B)))

  /** Invalidate entry `i`; the payload in `array(i)` is intentionally left untouched. */
  def reset_array(i: Int): Unit = {
    valids(i) := false.B
    // only the control signals need to be reset, which is friendlier for area
    // array(i).reset(i)
  }

  val replacement = ReplacementPolicy.fromString("plru", BIT_VEC_ARRAY_SIZE)

  // s0: generate region tag, parallel match
  val s0_can_accept = Wire(Bool())
  val s0_valid = io.train_req.fire
  val s0_pc = io.train_req.bits.pc
  val s0_vaddr = io.train_req.bits.vaddr
  val s0_miss = io.train_req.bits.miss
  val s0_pfHit = io.train_req.bits.pfHitStream
  val s0_region_bits = get_region_bits(s0_vaddr)
  val s0_region_tag = get_region_tag(s0_vaddr)
  val s0_region_tag_plus_one = get_region_tag(s0_vaddr) + 1.U
  val s0_region_tag_minus_one = get_region_tag(s0_vaddr) - 1.U
  val s0_region_tag_match_vec = array zip valids map { case (e, v) => e.tag_match(v, s0_valid, s0_region_tag) }
  val s0_region_tag_plus_one_match_vec = array zip valids map { case (e, v) => e.tag_match(v, s0_valid, s0_region_tag_plus_one) }
  val s0_region_tag_minus_one_match_vec = array zip valids map { case (e, v) => e.tag_match(v, s0_valid, s0_region_tag_minus_one) }
  val s0_hit = Cat(s0_region_tag_match_vec).orR
  val s0_plus_one_hit = Cat(s0_region_tag_plus_one_match_vec).orR
  val s0_minus_one_hit = Cat(s0_region_tag_minus_one_match_vec).orR
  val s0_hit_vec = VecInit(s0_region_tag_match_vec).asUInt
  // on a miss the victim chosen by the replacement policy becomes the target entry
  val s0_index = Mux(s0_hit, OHToUInt(s0_hit_vec), replacement.way)
  val s0_plus_one_index = OHToUInt(VecInit(s0_region_tag_plus_one_match_vec).asUInt)
  val s0_minus_one_index = OHToUInt(VecInit(s0_region_tag_minus_one_match_vec).asUInt)
  io.train_req.ready := s0_can_accept

  when(s0_valid) {
    replacement.access(s0_index)
  }

  val stream_pf_train_debug_table = ChiselDB.createTable("StreamTrainTraceTable" + p(XSCoreParamsKey).HartId.toString, new StreamTrainTraceEntry, basicDB = false)

  val spf_log_enable = s0_valid
  val spf_log_data = Wire(new StreamTrainTraceEntry)

  // WARNING: the type here only indicates trigger by stream, not saying it's sink
  spf_log_data.Type := MemReqSource.Prefetch2L2Stream.id.U
  spf_log_data.OldAddr := Mux(
    !s0_hit,
    s0_vaddr,
    array(s0_index).trigger_full_va
  )
  spf_log_data.CurAddr := s0_vaddr
  spf_log_data.Offset := DontCare
  spf_log_data.Score := DontCare
  spf_log_data.Miss := io.train_req.bits.miss

  stream_pf_train_debug_table.log(
    data = spf_log_data,
    en = spf_log_enable,
    site = "StreamTrainTraceTable",
    clock = clock,
    reset = reset
  )

  assert(!s0_valid || PopCount(VecInit(s0_region_tag_match_vec)) <= 1.U, "req region should match no more than 1 entry")
  assert(!s0_valid || PopCount(VecInit(s0_region_tag_plus_one_match_vec)) <= 1.U, "req region plus 1 should match no more than 1 entry")
  assert(!s0_valid || PopCount(VecInit(s0_region_tag_minus_one_match_vec)) <= 1.U, "req region minus 1 should match no more than 1 entry")
  assert(!s0_valid || !(s0_hit && s0_plus_one_hit && (s0_index === s0_plus_one_index)), "region and region plus 1 index match failed")
  assert(!s0_valid || !(s0_hit && s0_minus_one_hit && (s0_index === s0_minus_one_index)), "region and region minus 1 index match failed")
  assert(!s0_valid || !(s0_plus_one_hit && s0_minus_one_hit && (s0_minus_one_index === s0_plus_one_index)), "region plus 1 and region minus 1 index match failed")
  // two back-to-back misses must not be handed the same victim way
  assert(!(s0_valid && RegNext(s0_valid) && !s0_hit && !RegEnable(s0_hit, s0_valid) && replacement.way === RegEnable(replacement.way, s0_valid)), "replacement error")

  XSPerfAccumulate("s0_valid_train_req", s0_valid)
  val s0_hit_pattern_vec = Seq(s0_hit, s0_plus_one_hit, s0_minus_one_hit)
  for(i <- 0 until (1 << s0_hit_pattern_vec.size)) {
    XSPerfAccumulate(s"s0_hit_pattern_${toBinary(i)}", (VecInit(s0_hit_pattern_vec).asUInt === i.U) && s0_valid)
  }
  XSPerfAccumulate("s0_replace_the_neighbor", s0_valid && !s0_hit && ((s0_plus_one_hit && (s0_index === s0_plus_one_index)) || (s0_minus_one_hit && (s0_index === s0_minus_one_index))))
  XSPerfAccumulate("s0_req_valid", io.train_req.valid)
  XSPerfAccumulate("s0_req_cannot_accept", io.train_req.valid && !io.train_req.ready)

  // run-time tunable shift amounts applied to dynamic_depth for the L2 / L3 prefetch distance
  val ratio_const = Constantin.createRecord(s"l2DepthRatio${p(XSCoreParamsKey).HartId}", initValue = L2_DEPTH_RATIO)
  val ratio = ratio_const(3, 0)

  val l3_ratio_const = Constantin.createRecord(s"l3DepthRatio${p(XSCoreParamsKey).HartId}", initValue = L3_DEPTH_RATIO)
  val l3_ratio = l3_ratio_const(3, 0)

  // s1: alloc or update
  val s1_valid = GatedValidRegNext(s0_valid)
  val s1_index = RegEnable(s0_index, s0_valid)
  val s1_pc = RegEnable(s0_pc, s0_valid)
  val s1_vaddr = RegEnable(s0_vaddr, s0_valid)
  val s1_miss = RegEnable(s0_miss, s0_valid)
  val s1_pfHit = RegEnable(s0_pfHit, s0_valid)
  val s1_plus_one_index = RegEnable(s0_plus_one_index, s0_valid)
  val s1_minus_one_index = RegEnable(s0_minus_one_index, s0_valid)
  val s1_hit = RegEnable(s0_hit, s0_valid)
  // a neighbour only counts when it is active (and, in strict mode, has reached the threshold itself)
  val s1_plus_one_hit = if(ENABLE_STRICT_ACTIVE_DETECTION)
                            RegEnable(s0_plus_one_hit, s0_valid) && array(s1_plus_one_index).active && (array(s1_plus_one_index).cnt >= ACTIVE_THRESHOLD.U)
                        else
                            RegEnable(s0_plus_one_hit, s0_valid) && array(s1_plus_one_index).active
  val s1_minus_one_hit = if(ENABLE_STRICT_ACTIVE_DETECTION)
                            RegEnable(s0_minus_one_hit, s0_valid) && array(s1_minus_one_index).active && (array(s1_minus_one_index).cnt >= ACTIVE_THRESHOLD.U)
                        else
                            RegEnable(s0_minus_one_hit, s0_valid) && array(s1_minus_one_index).active
  val s1_region_tag = RegEnable(s0_region_tag, s0_valid)
  val s1_region_bits = RegEnable(s0_region_bits, s0_valid)
  val s1_alloc = s1_valid && !s1_hit
  val s1_update = s1_valid && s1_hit
  val s1_pf_l1_incr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) + io.dynamic_depth, 0.U(BLOCK_OFFSET.W))
  val s1_pf_l1_decr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) - io.dynamic_depth, 0.U(BLOCK_OFFSET.W))
  val s1_pf_l2_incr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) + (io.dynamic_depth << ratio), 0.U(BLOCK_OFFSET.W))
  val s1_pf_l2_decr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) - (io.dynamic_depth << ratio), 0.U(BLOCK_OFFSET.W))
  val s1_pf_l3_incr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) + (io.dynamic_depth << l3_ratio), 0.U(BLOCK_OFFSET.W))
  val s1_pf_l3_decr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) - (io.dynamic_depth << l3_ratio), 0.U(BLOCK_OFFSET.W))
  // TODO: remove this
  val strict_trigger_const = Constantin.createRecord(s"StreamStrictTrigger_${p(XSCoreParamsKey).HartId}", initValue = 1)
  // If use strict triggering mode, the stream prefetcher will only trigger prefetching
  // under **cache miss or prefetch hit stream**, but will still perform training on the entire memory access trace.
  val s1_can_trigger = Mux(strict_trigger_const.orR, s1_miss || s1_pfHit, true.B)
  // on an update, only send a prefetch when the accessed slot was not already recorded
  val s1_can_send_pf = Mux(s1_update, !((array(s1_index).bit_vec & UIntToOH(s1_region_bits)).orR), true.B) && s1_can_trigger
  // stall s0 while s1 is still handling a request whose hashed region tag is the same
  s0_can_accept := !(s1_valid && (region_hash_tag(s1_region_tag) === region_hash_tag(s0_region_tag)))

  when(s1_alloc) {
    // alloc a new entry
    valids(s1_index) := true.B
    array(s1_index).alloc(
      alloc_tag = s1_region_tag,
      alloc_bit_vec = UIntToOH(s1_region_bits),
      alloc_active = s1_plus_one_hit || s1_minus_one_hit,
      alloc_decr_mode = RegEnable(s0_plus_one_hit, s0_valid),
      alloc_full_vaddr = RegEnable(s0_vaddr, s0_valid)
      )

  }.elsewhen(s1_update) {
    // update an existing entry
    assert(array(s1_index).cnt =/= 0.U || valids(s1_index), "entry should have been allocated before")
    array(s1_index).update(
      update_bit_vec = UIntToOH(s1_region_bits),
      update_active = s1_plus_one_hit || s1_minus_one_hit)
  }

  XSPerfAccumulate("s1_alloc", s1_alloc)
  XSPerfAccumulate("s1_update", s1_update)
  XSPerfAccumulate("s1_active_plus_one_hit", s1_valid && s1_plus_one_hit)
  XSPerfAccumulate("s1_active_minus_one_hit", s1_valid && s1_minus_one_hit)

  // s2: trigger prefetch if hit active bit vector, compute meta of prefetch req
  val s2_valid = GatedValidRegNext(s1_valid)
  val s2_index = RegEnable(s1_index, s1_valid)
  val s2_pc = RegEnable(s1_pc, s1_valid)
  val s2_vaddr = RegEnable(s1_vaddr, s1_valid)
  val s2_region_bits = RegEnable(s1_region_bits, s1_valid)
  val s2_region_tag = RegEnable(s1_region_tag, s1_valid)
  val s2_pf_l1_incr_vaddr = RegEnable(s1_pf_l1_incr_vaddr, s1_valid)
  val s2_pf_l1_decr_vaddr = RegEnable(s1_pf_l1_decr_vaddr, s1_valid)
  val s2_pf_l2_incr_vaddr = RegEnable(s1_pf_l2_incr_vaddr, s1_valid)
  val s2_pf_l2_decr_vaddr = RegEnable(s1_pf_l2_decr_vaddr, s1_valid)
  val s2_pf_l3_incr_vaddr = RegEnable(s1_pf_l3_incr_vaddr, s1_valid)
  val s2_pf_l3_decr_vaddr = RegEnable(s1_pf_l3_decr_vaddr, s1_valid)
  val s2_can_send_pf = RegEnable(s1_can_send_pf, s1_valid)
  // read after the s1 write, so these reflect the entry state including this request
  val s2_active = array(s2_index).active
  val s2_decr_mode = array(s2_index).decr_mode
  val s2_l1_vaddr = Mux(s2_decr_mode, s2_pf_l1_decr_vaddr, s2_pf_l1_incr_vaddr)
  val s2_l2_vaddr = Mux(s2_decr_mode, s2_pf_l2_decr_vaddr, s2_pf_l2_incr_vaddr)
  val s2_l3_vaddr = Mux(s2_decr_mode, s2_pf_l3_decr_vaddr, s2_pf_l3_incr_vaddr)
  val s2_will_send_pf = s2_valid && s2_active && s2_can_send_pf
  val s2_pf_req_valid = s2_will_send_pf && io.enable
  val s2_pf_l1_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l1_vaddr,
    width = WIDTH_CACHE_BLOCKS,
    decr_mode = s2_decr_mode,
    sink = SINK_L1,
    source = L1_HW_PREFETCH_STREAM,
    t_pc = s2_pc,
    t_va = s2_vaddr
    )
  val s2_pf_l2_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l2_vaddr,
    width = L2_WIDTH_CACHE_BLOCKS,
    decr_mode = s2_decr_mode,
    sink = SINK_L2,
    source = L1_HW_PREFETCH_STREAM,
    t_pc = s2_pc,
    t_va = s2_vaddr
    )
  val s2_pf_l3_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l3_vaddr,
    width = L3_WIDTH_CACHE_BLOCKS,
    decr_mode = s2_decr_mode,
    sink = SINK_L3,
    source = L1_HW_PREFETCH_STREAM,
    t_pc = s2_pc,
    t_va = s2_vaddr
    )

  XSPerfAccumulate("s2_valid", s2_valid)
  XSPerfAccumulate("s2_will_not_send_pf", s2_valid && !s2_will_send_pf)
  XSPerfAccumulate("s2_will_send_decr_pf", s2_valid && s2_will_send_pf && s2_decr_mode)
  XSPerfAccumulate("s2_will_send_incr_pf", s2_valid && s2_will_send_pf && !s2_decr_mode)

  // s3: send the l1 prefetch req out
  val s3_pf_l1_valid = GatedValidRegNext(s2_pf_req_valid)
  val s3_pf_l1_bits = RegEnable(s2_pf_l1_req_bits, s2_pf_req_valid)
  val s3_pf_l2_valid = GatedValidRegNext(s2_pf_req_valid)
  val s3_pf_l2_bits = RegEnable(s2_pf_l2_req_bits, s2_pf_req_valid)
  val s3_pf_l3_bits = RegEnable(s2_pf_l3_req_bits, s2_pf_req_valid)

  XSPerfAccumulate("s3_pf_sent", s3_pf_l1_valid)

  // s4: send the l2 prefetch req out
  val s4_pf_l2_valid = GatedValidRegNext(s3_pf_l2_valid)
  val s4_pf_l2_bits = RegEnable(s3_pf_l2_bits, s3_pf_l2_valid)
  val s4_pf_l3_bits = RegEnable(s3_pf_l3_bits, s3_pf_l2_valid)

  val enable_l3_pf = Constantin.createRecord(s"enableL3StreamPrefetch${p(XSCoreParamsKey).HartId}", initValue = false)
  // s5: send the l3 prefetch req out
  val s5_pf_l3_valid = GatedValidRegNext(s4_pf_l2_valid) && enable_l3_pf
  val s5_pf_l3_bits = RegEnable(s4_pf_l3_bits, s4_pf_l2_valid)

  io.l1_prefetch_req.valid := s3_pf_l1_valid
  io.l1_prefetch_req.bits := s3_pf_l1_bits
  // L2 and L3 requests share one port; the s4 (L2) request wins when both are valid
  io.l2_l3_prefetch_req.valid := s4_pf_l2_valid || s5_pf_l3_valid
  io.l2_l3_prefetch_req.bits := Mux(s4_pf_l2_valid, s4_pf_l2_bits, s5_pf_l3_bits)

  XSPerfAccumulate("s4_pf_sent", s4_pf_l2_valid)
  XSPerfAccumulate("s5_pf_sent", !s4_pf_l2_valid && s5_pf_l3_valid)
  XSPerfAccumulate("pf_sent", PopCount(Seq(io.l1_prefetch_req.valid, io.l2_l3_prefetch_req.valid)))

  // Stride lookup starts here
  // S0: Stride send req
  val s0_lookup_valid = io.stream_lookup_req.valid
  val s0_lookup_vaddr = io.stream_lookup_req.bits.vaddr
  val s0_lookup_tag = get_region_tag(s0_lookup_vaddr)
  // S1: match
  val s1_lookup_valid = GatedValidRegNext(s0_lookup_valid)
  val s1_lookup_tag = RegEnable(s0_lookup_tag, s0_lookup_valid)
  val s1_lookup_tag_match_vec = array zip valids map { case (e, v) => e.tag_match(v, s1_lookup_valid, s1_lookup_tag) }
  val s1_lookup_hit = VecInit(s1_lookup_tag_match_vec).asUInt.orR
  val s1_lookup_index = OHToUInt(VecInit(s1_lookup_tag_match_vec))
  // S2: read active out
  val s2_lookup_valid = GatedValidRegNext(s1_lookup_valid)
  val s2_lookup_hit = RegEnable(s1_lookup_hit, s1_lookup_valid)
  val s2_lookup_index = RegEnable(s1_lookup_index, s1_lookup_valid)
  val s2_lookup_active = array(s2_lookup_index).active
  // S3: send back to Stride
  val s3_lookup_valid = GatedValidRegNext(s2_lookup_valid)
  val s3_lookup_hit = RegEnable(s2_lookup_hit, s2_lookup_valid)
  val s3_lookup_active = RegEnable(s2_lookup_active, s2_lookup_valid)
  io.stream_lookup_resp := s3_lookup_valid && s3_lookup_hit && s3_lookup_active

  // invalidate every entry one cycle after flush to avoid the multi-hit problem;
  // placed after the s1 alloc so that, by last-connect order, the flush wins over a same-cycle alloc
  for(i <- 0 until BIT_VEC_ARRAY_SIZE) {
    when(GatedValidRegNext(io.flush)) {
      reset_array(i)
    }
  }

  XSPerfHistogram("bit_vector_active", PopCount(VecInit(array.map(_.active)).asUInt), true.B, 0, BIT_VEC_ARRAY_SIZE, 1)
  XSPerfHistogram("bit_vector_decr_mode", PopCount(VecInit(array.map(_.decr_mode)).asUInt), true.B, 0, BIT_VEC_ARRAY_SIZE, 1)
  XSPerfAccumulate("hash_conflict", s0_valid && s2_valid && (s0_region_tag =/= s2_region_tag) && (region_hash_tag(s0_region_tag) === region_hash_tag(s2_region_tag)))
}