1/*************************************************************************************** 2* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences 3* Copyright (c) 2020-2021 Peng Cheng Laboratory 4* 5* XiangShan is licensed under Mulan PSL v2. 6* You can use this software according to the terms and conditions of the Mulan PSL v2. 7* You may obtain a copy of Mulan PSL v2 at: 8* http://license.coscl.org.cn/MulanPSL2 9* 10* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, 11* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, 12* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. 13* 14* See the Mulan PSL v2 for more details. 15* 16* 17* Acknowledgement 18* 19* This implementation is inspired by several key papers: 20* [1] Santhosh Srinath, Onur Mutlu, Hyesoon Kim, and Yale N. Patt "[Feedback directed prefetching: Improving the 21* performance and bandwidth-efficiency of hardware prefetchers.](https://doi.org/10.1109/HPCA.2007.346185)" IEEE 13th 22* International Symposium on High Performance Computer Architecture (HPCA). 2007. 
***************************************************************************************/

package xiangshan.mem.prefetch

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.ClientStates._
import freechips.rocketchip.tilelink.MemoryOpCategories._
import freechips.rocketchip.tilelink.TLPermissions._
import freechips.rocketchip.tilelink.{ClientMetadata, ClientStates, TLPermissions}
import xiangshan.backend.rob.RobDebugRollingIO
import utils._
import utility._
import xiangshan.{L1CacheErrorInfo, XSCoreParamsKey}
import xiangshan.mem.HasL1PrefetchSourceParameter
import utility.{CircularQueuePtr}
import xiangshan.cache._
import xiangshan.{XSBundle, XSModule}

//----------------------------------------
// Feedback Direct Prefetching

// Key used to de-duplicate useful-prefetch counting: a DCache line is
// identified by its set index and way.
class CounterFilterDataBundle(implicit p: Parameters) extends DCacheBundle {
  val idx = UInt(idxBits.W)
  val way = UInt(wayBits.W)
}

// One query port into the CounterFilter: `req` carries the (idx, way) to look
// up; `resp` is driven high when a matching entry is already present.
class CounterFilterQueryBundle(implicit p: Parameters) extends DCacheBundle {
  val req = ValidIO(new CounterFilterDataBundle())
  val resp = Input(Bool())
}

// no Set Blocking in LoadPipe, so when counting useful prefetch counter, duplicate result occurs
// s0 s1 s2 s3
// r  w
// if 3 load insts is accessing the same cache line(set0, way0) in s0, s1, s2
// they think they all prefetch hit, increment useful prefetch counter 3 times
// so when load arrives at s3, save it's set&way to an FIFO, all loads will search this FIFO to avoid this case
class CounterFilter()(implicit p: Parameters) extends DCacheModule {
  private val LduCnt = backendParams.LduCnt
  private val HyuCnt = backendParams.HyuCnt

  val io = IO(new Bundle() {
    // input, only from load for now
    val ld_in = Flipped(Vec(LoadPipelineWidth, ValidIO(new CounterFilterDataBundle())))
    val query = Flipped(Vec(LoadPipelineWidth, new CounterFilterQueryBundle()))
  })

  // FIFO depth: one slot per load pipe per pipeline stage that can hold a load.
  // NOTE(review): the assert message below says "SIZE >= (Ldu stages - 2) * LduCnt"
  // while SIZE here is LduStages * LduCnt — the message text looks stale; confirm.
  val LduStages = 4
  val SIZE = (LduStages) * LduCnt
  // Circular queue pointer specialized to this FIFO's depth.
  class Ptr(implicit p: Parameters) extends CircularQueuePtr[Ptr]( p => SIZE ){}
  object Ptr {
    // Build a Ptr wire from an explicit (flag, value) pair.
    def apply(f: Bool, v: UInt)(implicit p: Parameters): Ptr = {
      val ptr = Wire(new Ptr)
      ptr.flag := f
      ptr.value := v
      ptr
    }
  }

  // FIFO storage: (idx, way) payload plus a per-slot valid bit.
  val entries = RegInit(VecInit(Seq.fill(SIZE){ (0.U.asTypeOf(new CounterFilterDataBundle())) }))
  val valids = RegInit(VecInit(Seq.fill(SIZE){ (false.B) }))

  // enq
  // Up to LduCnt entries can be enqueued and dequeued per cycle; keep one
  // pre-offset pointer per possible slot so parallel enq/deq need no adder tree.
  val enqLen = LduCnt
  val deqLen = LduCnt
  val enqPtrExt = RegInit(VecInit((0 until enqLen).map(_.U.asTypeOf(new Ptr))))
  val deqPtrExt = RegInit(VecInit((0 until deqLen).map(_.U.asTypeOf(new Ptr))))

  val deqPtr = WireInit(deqPtrExt(0).value)

  val reqs_l = io.ld_in.map(_.bits)
  val reqs_vl = io.ld_in.map(_.valid)
  val needAlloc = Wire(Vec(enqLen, Bool()))
  val canAlloc = Wire(Vec(enqLen, Bool()))
  // Number of allocations made 3 cycles ago (this register plus the two
  // RegNexts below); entries retire after the same delay, see deq logic.
  val last3CycleAlloc = RegInit(0.U(log2Ceil(LduCnt + 1).W))

  for(i <- (0 until enqLen)) {
    val req = reqs_l(i)
    val req_v = reqs_vl(i)
    // Compact valid requests: port i allocates at enqPtr offset by the number
    // of earlier ports that also allocate this cycle.
    val index = PopCount(needAlloc.take(i))
    val allocPtr = enqPtrExt(index)

    needAlloc(i) := req_v
    // NOTE(review): `allocPtr >= deqPtrExt(0)` (CircularQueuePtr comparison) is
    // used as the free-slot check here; the assert below requires it to always
    // hold, i.e. the queue is sized so enq can never be refused.
    canAlloc(i) := needAlloc(i) && allocPtr >= deqPtrExt(0)

    when(canAlloc(i)) {
      valids(allocPtr.value) := true.B
      entries(allocPtr.value) := req
    }

    assert(!needAlloc(i) || canAlloc(i), s"port${i} can not accept CounterFilter enq request, check if SIZE >= (Ldu stages - 2) * LduCnt")
  }
  val allocNum = PopCount(canAlloc)

  // Advance every pre-offset enq pointer by the number of slots consumed.
  enqPtrExt.foreach{case x =>
    when(canAlloc.asUInt.orR){
      x := x + allocNum
    }
  }
  last3CycleAlloc := RegNext(RegNext(allocNum))

  // deq
  // Retire exactly as many entries as were allocated 3 cycles ago, keeping the
  // FIFO occupancy equal to the loads currently in flight in the pipeline.
  for(i <- (0 until deqLen)) {
    when(i.U < last3CycleAlloc) {
      valids(deqPtrExt(i).value) := false.B
    }
  }

  deqPtrExt.foreach{case x => x := x + last3CycleAlloc}

  // query
  // Fully-associative (idx, way) search; resp is high when a valid entry
  // matches, meaning this line was already counted recently.
  // NOTE(review): loop bound is LduCnt + HyuCnt while io.query has
  // LoadPipelineWidth ports — assumes LoadPipelineWidth == LduCnt + HyuCnt; confirm.
  val querys_l = io.query.map(_.req.bits)
  val querys_vl = io.query.map(_.req.valid)
  for(i <- (0 until LduCnt + HyuCnt)) {
    val q = querys_l(i)
    val q_v = querys_vl(i)

    val entry_match = Cat(entries.zip(valids).map {
      case(e, v) => v && (q.idx === e.idx) && (q.way === e.way)
    }).orR

    io.query(i).resp := q_v && entry_match
  }

  XSPerfAccumulate("req_nums", PopCount(io.query.map(_.req.valid)))
  XSPerfAccumulate("req_set_way_match", PopCount(io.query.map(_.resp)))
}

// Address bundle for the BloomFilter below: `addr` is a log2(n)-bit hash of a
// physical block address.
class BloomQueryBundle(n: Int)(implicit p: Parameters) extends DCacheBundle {
  val addr = UInt(BLOOMADDRWIDTH.W)

  def BLOOMADDRWIDTH = log2Ceil(n)

  // Hash a full physical address into a filter index: drop the block offset,
  // then XOR-fold the low two BLOOMADDRWIDTH-bit slices of the block address.
  def get_addr(paddr: UInt): UInt = {
    assert(paddr.getWidth == PAddrBits)
    assert(paddr.getWidth >= (blockOffBits + 2 * BLOOMADDRWIDTH))
    val block_paddr = paddr(paddr.getWidth - 1, blockOffBits)
    val low_part = block_paddr(BLOOMADDRWIDTH - 1, 0)
    val high_part = block_paddr(2 * BLOOMADDRWIDTH - 1, BLOOMADDRWIDTH)
    low_part ^ high_part
  }
}

class BloomRespBundle(implicit p: Parameters) extends DCacheBundle {
  val res = Bool()
}

// Single-hash n-bit Bloom filter with one set port, one clear port and
// LoadPipelineWidth query ports. With `bypass` enabled, a query observes
// same-cycle set/clr (it reads data_next); otherwise it reads the registered
// state. Responses are valid one cycle after the query.
class BloomFilter(n: Int, bypass: Boolean = true)(implicit p: Parameters) extends DCacheModule {
  val io = IO(new DCacheBundle {
    val set = Flipped(ValidIO(new BloomQueryBundle(n)))
    val clr = Flipped(ValidIO(new BloomQueryBundle(n)))
    val query = Vec(LoadPipelineWidth, Flipped(ValidIO(new BloomQueryBundle(n))))
    val resp = Vec(LoadPipelineWidth, ValidIO(new BloomRespBundle))
  })

  val data = RegInit(0.U(n.W))
  val data_next = Wire(Vec(n, Bool()))

  // Next-state per bit; note clr has priority over set when both hit the
  // same bit in the same cycle (when/elsewhen ordering).
  for (i <- 0 until n) {
    when(io.clr.valid && i.U === io.clr.bits.addr) {
      data_next(i) := false.B
    }.elsewhen(io.set.valid && i.U === io.set.bits.addr) {
      data_next(i) := true.B
    }.otherwise {
      data_next(i) := data(i).asBool
    }
  }

  // resp will valid in next cycle
  for(i <- 0 until LoadPipelineWidth) {
    io.resp(i).valid := GatedValidRegNext(io.query(i).valid)
    if(bypass) {
      io.resp(i).bits.res := RegEnable(data_next(io.query(i).bits.addr), io.query(i).valid)
    }else {
      io.resp(i).bits.res := RegEnable(data(io.query(i).bits.addr), io.query(i).valid)
    }
  }

  data := data_next.asUInt

  // At most one set plus one clear per cycle => at most 2 bits may toggle.
  assert(PopCount(data ^ data_next.asUInt) <= 2.U)

  XSPerfHistogram("valid_nums", PopCount(data), true.B, 0, n + 1, 20)
}

// Event inputs and control output of the feedback-directed-prefetch monitor.
class FDPrefetcherMonitorBundle()(implicit p: Parameters) extends XSBundle {
  val refill = Input(Bool()) // from refill pipe, fire
  val accuracy = new XSBundle {
    val total_prefetch = Input(Bool()) // from mshr enq, fire, alloc, prefetch
    val useful_prefetch = Vec(LoadPipelineWidth, Input(Bool())) // from load pipeline, prefetch hit
  }

  val timely = new XSBundle {
    val late_prefetch = Input(Bool()) // from mshr enq, a load matches a mshr caused by prefetch
  }

  val pollution = new XSBundle {
    val demand_miss = Vec(LoadPipelineWidth, Input(Bool())) // from load pipeline, first miss
    val cache_pollution = Vec(LoadPipelineWidth, Input(Bool())) // from load pipeline, first miss and pollution caused
  }

  val pf_ctrl = Output(new PrefetchControlBundle)
  val debugRolling = Flipped(new RobDebugRollingIO)
}

// Accumulates prefetch accuracy / timeliness / pollution statistics over
// refill-count intervals and exports rolling perf counters.
// NOTE(review): pf_ctrl is tied to DontCare here — no feedback control is
// actually driven from these counters in this module yet.
class FDPrefetcherMonitor()(implicit p: Parameters) extends XSModule {
  val io = IO(new FDPrefetcherMonitorBundle)

  // One measurement interval = 8192 refills; counters are CNTWIDTH bits so an
  // interval's worth of single-increment events cannot overflow.
  val INTERVAL = 8192
  val CNTWIDTH = log2Up(INTERVAL) + 1

  io.pf_ctrl := DontCare

  val refill_cnt = RegInit(0.U(CNTWIDTH.W))

  // Decayed history from previous intervals (one per tracked event class).
  val total_prefetch_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val useful_prefetch_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val late_prefetch_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val demand_miss_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val pollution_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val prev_cnts = Seq(total_prefetch_prev_cnt, useful_prefetch_prev_cnt, late_prefetch_prev_cnt, demand_miss_prev_cnt, pollution_prev_cnt)

  // Counts accumulated within the current interval.
  val total_prefetch_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val useful_prefetch_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val late_prefetch_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val demand_miss_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val pollution_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val interval_cnts = Seq(total_prefetch_interval_cnt, useful_prefetch_interval_cnt, late_prefetch_interval_cnt, demand_miss_interval_cnt, pollution_interval_cnt)

  val interval_trigger = refill_cnt === INTERVAL.U

  // Event enables, ordered to match prev_cnts/interval_cnts above; scalar
  // Bools and Vec[Bool] are both handled by PopCount(en.asUInt).
  val io_ens = Seq(io.accuracy.total_prefetch, io.accuracy.useful_prefetch, io.timely.late_prefetch, io.pollution.demand_miss, io.pollution.cache_pollution)

  for((interval, en) <- interval_cnts.zip(io_ens)) {
    interval := interval + PopCount(en.asUInt)
  }

  when(io.refill) {
    refill_cnt := refill_cnt + 1.U
  }

  // At the end of each interval: fold the interval counts into the history as
  // prev = prev/2 + interval/2 (both halved by the right-shift-by-1 Cat forms),
  // i.e. an exponentially decayed running average, then restart the interval.
  when(interval_trigger) {
    refill_cnt := 0.U
    for((prev, interval) <- prev_cnts.zip(interval_cnts)) {
      prev := Cat(0.U(1.W), prev(prev.getWidth - 1, 1)) + Cat(0.U(1.W), interval(interval.getWidth - 1, 1))
      interval := 0.U
    }
  }

  // rolling by instr
  XSPerfRolling(
    "L1PrefetchAccuracyIns",
    PopCount(io.accuracy.useful_prefetch), PopCount(io.accuracy.total_prefetch),
    1000, io.debugRolling.robTrueCommit, clock, reset
  )

  XSPerfRolling(
    "L1PrefetchLatenessIns",
    PopCount(io.timely.late_prefetch), PopCount(io.accuracy.total_prefetch),
    1000, io.debugRolling.robTrueCommit, clock, reset
  )

  XSPerfRolling(
    "L1PrefetchPollutionIns",
    PopCount(io.pollution.cache_pollution), PopCount(io.pollution.demand_miss),
    1000, io.debugRolling.robTrueCommit, clock, reset
  )

  XSPerfRolling(
    "IPCIns",
    io.debugRolling.robTrueCommit, 1.U,
    1000, io.debugRolling.robTrueCommit, clock, reset
  )

  XSPerfAccumulate("io_refill", io.refill)
  XSPerfAccumulate("total_prefetch_en", io.accuracy.total_prefetch)
  // Late prefetches are counted as useful here: the line was wanted, just not on time.
  XSPerfAccumulate("useful_prefetch_en", PopCount(io.accuracy.useful_prefetch) + io.timely.late_prefetch)
  XSPerfAccumulate("late_prefetch_en", io.timely.late_prefetch)
  XSPerfAccumulate("demand_miss_en", PopCount(io.pollution.demand_miss))
  XSPerfAccumulate("cache_pollution_en", PopCount(io.pollution.cache_pollution))
}