/***************************************************************************************
* Copyright (c) 2024 Beijing Institute of Open Source Chip (BOSC)
* Copyright (c) 2020-2024 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
*
*
* Acknowledgement
*
* This implementation is inspired by several key papers:
* [1] Jean-Loup Baer, and Tien-Fu Chen. "[An effective on-chip preloading scheme to reduce data access penalty.]
* (https://doi.org/10.1145/125826.125932)" ACM/IEEE Conference on Supercomputing. 1991.
***************************************************************************************/

package xiangshan.mem.prefetch

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.mem.L1PrefetchReq
import xiangshan.mem.Bundles.LsPrefetchTrainBundle
import xiangshan.mem.trace._
import xiangshan.cache.HasDCacheParameters
import xiangshan.cache.mmu._
import scala.collection.SeqLike

/** Compile-time parameters shared by the stride prefetcher components. */
trait HasStridePrefetchHelper extends HasL1PrefetchHelper {
  val STRIDE_FILTER_SIZE = 6
  // number of entries in the stride meta table
  val STRIDE_ENTRY_NUM = 10
  // width of the recorded stride and of the recorded (truncated) previous vaddr
  val STRIDE_BITS = 10 + BLOCK_OFFSET
  val STRIDE_VADDR_BITS = 10 + BLOCK_OFFSET
  // width of the per-entry confidence counter
  val STRIDE_CONF_BITS = 2

  // detail control
  val ALWAYS_UPDATE_PRE_VADDR = true
  val AGGRESIVE_POLICY = false // if true, prefetch degree is greater than 1, 1 otherwise
  val STRIDE_LOOK_AHEAD_BLOCKS = 2 // aggressive degree
  val LOOK_UP_STREAM = false // if true, avoid collision with stream

  val STRIDE_WIDTH_BLOCKS = if (AGGRESIVE_POLICY) STRIDE_LOOK_AHEAD_BLOCKS else 1

  // largest value representable by the confidence counter
  def MAX_CONF = (1 << STRIDE_CONF_BITS) - 1
}

/** One entry of the stride table: last (truncated) vaddr, learned stride,
  * confidence counter and the hashed PC used as tag.
  *
  * The mutating helpers (`reset`, `alloc`, `update`) emit Chisel connections, so they
  * are meant to be called on a register-backed instance.
  */
class StrideMetaBundle(implicit p: Parameters) extends XSBundle with HasStridePrefetchHelper {
  val pre_vaddr = UInt(STRIDE_VADDR_BITS.W)
  val stride = UInt(STRIDE_BITS.W)
  val confidence = UInt(STRIDE_CONF_BITS.W)
  val hash_pc = UInt(HASH_TAG_WIDTH.W)

  /** Clear the entry; the tag is set to `index`. */
  def reset(index: Int) = {
    pre_vaddr := 0.U
    stride := 0.U
    confidence := 0.U
    hash_pc := index.U
  }

  /** True when both valid flags are set and the stored tag equals `new_hash_pc`. */
  def tag_match(valid1: Bool, valid2: Bool, new_hash_pc: UInt): Bool = {
    valid1 && valid2 && hash_pc === new_hash_pc
  }

  /** (Re)allocate the entry for `alloc_hash_pc`, remembering the low bits of `vaddr`
    * and starting with zero stride and zero confidence.
    */
  def alloc(vaddr: UInt, alloc_hash_pc: UInt) = {
    pre_vaddr := vaddr(STRIDE_VADDR_BITS - 1, 0)
    stride := 0.U
    confidence := 0.U
    hash_pc := alloc_hash_pc
  }

  /** Train the entry with a new access address.
    *
    * @param vaddr                   address of the new access (only the low STRIDE_VADDR_BITS are used)
    * @param always_update_pre_vaddr when set, `pre_vaddr` is refreshed even if the stride is not valid
    * @return `(can_send_pf, new_stride)`: `can_send_pf` is set when the observed stride is valid,
    *         equals the recorded stride and confidence is already at `MAX_CONF`;
    *         `new_stride` is the stride observed for this access.
    */
  def update(vaddr: UInt, always_update_pre_vaddr: Bool) = {
    val new_vaddr = vaddr(STRIDE_VADDR_BITS - 1, 0)
    val new_stride = new_vaddr - pre_vaddr
    // NOTE(review): block_addr is defined elsewhere; presumably it strips the block offset bits
    val new_stride_blk = block_addr(new_stride)
    // NOTE: for now, disable negative stride (top bit of the difference must be 0);
    // block strides of 0 and 1 are also rejected
    val stride_valid = new_stride_blk =/= 0.U && new_stride_blk =/= 1.U && new_stride(STRIDE_VADDR_BITS - 1) === 0.U
    val stride_match = new_stride === stride
    val low_confidence = confidence <= 1.U
    val can_send_pf = stride_valid && stride_match && confidence === MAX_CONF.U

    when(stride_valid) {
      when(stride_match) {
        // saturating increment
        confidence := Mux(confidence === MAX_CONF.U, confidence, confidence + 1.U)
      }.otherwise {
        // saturating decrement; only replace the learned stride once confidence is low
        confidence := Mux(confidence === 0.U, confidence, confidence - 1.U)
        when(low_confidence) {
          stride := new_stride
        }
      }
      pre_vaddr := new_vaddr
    }
    when(always_update_pre_vaddr) {
      pre_vaddr := new_vaddr
    }

    (can_send_pf, new_stride)
  }

}

/** PC-indexed stride table with a small training/prefetch-generation pipeline.
  *
  *  - s0: hash the training PC and compare it against all valid entries
  *  - s1: allocate a new entry (miss) or update the matching entry (hit)
  *  - s2: compute the L1 and L2 prefetch addresses as vaddr + (stride << ratio)
  *  - s3: drive the L1 prefetch request
  *  - s4: drive the L2/L3 prefetch request
  *
  * `io.enable` and `io.dynamic_depth` are declared but not read inside this module.
  */
class StrideMetaArray(implicit p: Parameters) extends XSModule with HasStridePrefetchHelper {
  val io = IO(new XSBundle {
    val enable = Input(Bool())
    // TODO: flush all entry when process changing happens, or disable stream prefetch for a while
    val flush = Input(Bool())
    val dynamic_depth = Input(UInt(32.W)) // TODO: enable dynamic stride depth
    val train_req = Flipped(DecoupledIO(new PrefetchReqBundle))
    val l1_prefetch_req = ValidIO(new StreamPrefetchReqBundle)
    val l2_l3_prefetch_req = ValidIO(new StreamPrefetchReqBundle)
    // query Stream component to see if a stream pattern has already been detected
    val stream_lookup_req = ValidIO(new PrefetchReqBundle)
    val stream_lookup_resp = Input(Bool())
  })

  // entry payload has no reset; only the valid bits are reset
  val array = Reg(Vec(STRIDE_ENTRY_NUM, new StrideMetaBundle))
  val valids = RegInit(VecInit(Seq.fill(STRIDE_ENTRY_NUM)(false.B)))

  /** Invalidate entry `i`. */
  def reset_array(i: Int): Unit = {
    valids(i) := false.B
    // only the control signals (valid bits) need to be reset, which is area-friendly
    // array(i).reset(i)
  }

  val replacement = ReplacementPolicy.fromString("plru", STRIDE_ENTRY_NUM)

  // s0: hash pc -> cam all entries
  val s0_can_accept = Wire(Bool())
  val s0_valid = io.train_req.fire
  val s0_vaddr = io.train_req.bits.vaddr
  val s0_pc = io.train_req.bits.pc
  val s0_pc_hash = pc_hash_tag(s0_pc)
  val s0_pc_match_vec = VecInit(array zip valids map { case (e, v) => e.tag_match(v, s0_valid, s0_pc_hash) }).asUInt
  val s0_hit = s0_pc_match_vec.orR
  // on a hit use the matching entry, otherwise the replacement victim
  val s0_index = Mux(s0_hit, OHToUInt(s0_pc_match_vec), replacement.way)
  io.train_req.ready := s0_can_accept
  io.stream_lookup_req.valid := s0_valid
  io.stream_lookup_req.bits := io.train_req.bits

  when(s0_valid) {
    replacement.access(s0_index)
  }

  // at most one entry may match a given pc hash
  assert(PopCount(s0_pc_match_vec) <= 1.U)
  XSPerfAccumulate("s0_valid", s0_valid)
  XSPerfAccumulate("s0_hit", s0_valid && s0_hit)
  XSPerfAccumulate("s0_miss", s0_valid && !s0_hit)

  // s1: alloc or update
  val s1_valid = GatedValidRegNext(s0_valid)
  val s1_index = RegEnable(s0_index, s0_valid)
  val s1_pc_hash = RegEnable(s0_pc_hash, s0_valid)
  val s1_vaddr = RegEnable(s0_vaddr, s0_valid)
  val s1_hit = RegEnable(s0_hit, s0_valid)
  val s1_alloc = s1_valid && !s1_hit
  val s1_update = s1_valid && s1_hit
  val s1_stride = array(s1_index).stride
  val s1_new_stride = WireInit(0.U(STRIDE_BITS.W))
  val s1_can_send_pf = WireInit(false.B)
  // refuse a new request whose pc hash equals the one being processed in s1
  // (presumably because s1 is still writing that entry this cycle)
  s0_can_accept := !(s1_valid && s1_pc_hash === s0_pc_hash)

  val always_update = Constantin.createRecord(s"always_update${p(XSCoreParamsKey).HartId}", initValue = ALWAYS_UPDATE_PRE_VADDR)

  when(s1_alloc) {
    valids(s1_index) := true.B
    array(s1_index).alloc(
      vaddr = s1_vaddr,
      alloc_hash_pc = s1_pc_hash
    )
  }.elsewhen(s1_update) {
    val res = array(s1_index).update(s1_vaddr, always_update)
    s1_can_send_pf := res._1
    s1_new_stride := res._2
  }

  // runtime-tunable look-ahead: prefetch distance is stride << ratio (low 4 bits of the record)
  val l1_stride_ratio_const = Constantin.createRecord(s"l1_stride_ratio${p(XSCoreParamsKey).HartId}", initValue = 2)
  val l1_stride_ratio = l1_stride_ratio_const(3, 0)
  val l2_stride_ratio_const = Constantin.createRecord(s"l2_stride_ratio${p(XSCoreParamsKey).HartId}", initValue = 5)
  val l2_stride_ratio = l2_stride_ratio_const(3, 0)
  // s2: calculate L1 & L2 pf addr
  val s2_valid = GatedValidRegNext(s1_valid && s1_can_send_pf)
  val s2_vaddr = RegEnable(s1_vaddr, s1_valid && s1_can_send_pf)
  val s2_stride = RegEnable(s1_stride, s1_valid && s1_can_send_pf)
  val s2_l1_depth = s2_stride << l1_stride_ratio
  val s2_l1_pf_vaddr = (s2_vaddr + s2_l1_depth)(VAddrBits - 1, 0)
  val s2_l2_depth = s2_stride << l2_stride_ratio
  val s2_l2_pf_vaddr = (s2_vaddr + s2_l2_depth)(VAddrBits - 1, 0)
  val s2_l1_pf_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l1_pf_vaddr,
    width = STRIDE_WIDTH_BLOCKS,
    decr_mode = false.B,
    sink = SINK_L1,
    source = L1_HW_PREFETCH_STRIDE,
    // TODO: add stride debug db, not useful for now
    t_pc = 0xdeadbeefL.U,
    t_va = 0xdeadbeefL.U
  )
  val s2_l2_pf_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l2_pf_vaddr,
    width = STRIDE_WIDTH_BLOCKS,
    decr_mode = false.B,
    sink = SINK_L2,
    source = L1_HW_PREFETCH_STRIDE,
    // TODO: add stride debug db, not useful for now
    t_pc = 0xdeadbeefL.U,
    t_va = 0xdeadbeefL.U
  )

  // s3: send l1 pf out
  // when LOOK_UP_STREAM is set, the request is dropped if the stream component reports a hit
  val s3_valid = if (LOOK_UP_STREAM) GatedValidRegNext(s2_valid) && !io.stream_lookup_resp else GatedValidRegNext(s2_valid)
  val s3_l1_pf_req_bits = RegEnable(s2_l1_pf_req_bits, s2_valid)
  val s3_l2_pf_req_bits = RegEnable(s2_l2_pf_req_bits, s2_valid)

  // s4: send l2 pf out
  val s4_valid = GatedValidRegNext(s3_valid)
  val s4_l2_pf_req_bits = RegEnable(s3_l2_pf_req_bits, s3_valid)

  io.l1_prefetch_req.valid := s3_valid
  io.l1_prefetch_req.bits := s3_l1_pf_req_bits
  io.l2_l3_prefetch_req.valid := s4_valid
  io.l2_l3_prefetch_req.bits := s4_l2_pf_req_bits

  XSPerfAccumulate("pf_valid", PopCount(Seq(io.l1_prefetch_req.valid, io.l2_l3_prefetch_req.valid)))
  XSPerfAccumulate("l1_pf_valid", s3_valid)
  XSPerfAccumulate("l2_pf_valid", s4_valid)
  XSPerfAccumulate("detect_stream", io.stream_lookup_resp)
  // Only valid entries are counted: `array` has no reset and flush only clears `valids`,
  // so the confidence field of an invalid entry is stale or uninitialised.
  XSPerfHistogram(
    "high_conf_num",
    PopCount(VecInit(array.zip(valids).map { case (e, v) => v && e.confidence === MAX_CONF.U })).asUInt,
    true.B, 0, STRIDE_ENTRY_NUM, 1
  )
  for (i <- 0 until STRIDE_ENTRY_NUM) {
    XSPerfAccumulate(s"entry_${i}_update", i.U === s1_index && s1_update)
    for (j <- 0 until 4) {
      // a fully confident entry observed a different (small) stride j
      XSPerfAccumulate(s"entry_${i}_disturb_${j}", i.U === s1_index && s1_update &&
        j.U === s1_new_stride &&
        array(s1_index).confidence === MAX_CONF.U &&
        array(s1_index).stride =/= s1_new_stride
      )
    }
  }

  // One delayed copy of the flush request is shared by all entries.
  // This block must stay after the s1 allocation above: with Chisel's last-connect
  // semantics the flush then overrides a simultaneous `valids(s1_index) := true.B`.
  private val flush_r = GatedValidRegNext(io.flush)
  when(flush_r) {
    for (i <- 0 until STRIDE_ENTRY_NUM) {
      reset_array(i)
    }
  }
}