/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
*
*
* Acknowledgement
*
* This implementation is inspired by the following paper:
* [1] Santhosh Srinath, Onur Mutlu, Hyesoon Kim, and Yale N. Patt. "[Feedback directed prefetching: Improving the
* performance and bandwidth-efficiency of hardware prefetchers.](https://doi.org/10.1109/HPCA.2007.346185)" IEEE 13th
* International Symposium on High Performance Computer Architecture (HPCA). 2007.
***************************************************************************************/

package xiangshan.mem.prefetch

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.ClientStates._
import freechips.rocketchip.tilelink.MemoryOpCategories._
import freechips.rocketchip.tilelink.TLPermissions._
import freechips.rocketchip.tilelink.{ClientMetadata, ClientStates, TLPermissions}
import xiangshan.backend.rob.RobDebugRollingIO
import utils._
import utility._
import xiangshan.{L1CacheErrorInfo, XSCoreParamsKey}
import xiangshan.mem.HasL1PrefetchSourceParameter
import xiangshan.cache._
import xiangshan.{XSBundle, XSModule}

//----------------------------------------
// Feedback Directed Prefetching
class CounterFilterDataBundle(implicit p: Parameters) extends DCacheBundle {
  val idx = UInt(idxBits.W)
  val way = UInt(wayBits.W)
}

class CounterFilterQueryBundle(implicit p: Parameters) extends DCacheBundle {
  val req = ValidIO(new CounterFilterDataBundle())
  val resp = Input(Bool())
}

// There is no set blocking in the LoadPipe, so the useful-prefetch counter can be
// incremented more than once for the same access:
// s0    s1     s2     s3
// r                   w
// If three loads access the same cache line (set0, way0) in s0, s1 and s2, each of
// them sees a prefetch hit and increments the useful-prefetch counter, counting the
// same prefetched line three times. To avoid this, a load saves its set & way into a
// FIFO when it reaches s3, and every load searches this FIFO before counting.
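// A usage sketch (assumed wiring; the actual integration lives in the DCache load
// pipeline): loads enqueue their set/way via io.ld_in when they reach s3 and place
// queries on io.query in an earlier stage; a hit on io.query(i).resp means the same
// line was already counted and the duplicate increment is suppressed.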
class CounterFilter()(implicit p: Parameters) extends DCacheModule {
  private val LduCnt = backendParams.LduCnt
  private val HyuCnt = backendParams.HyuCnt

  val io = IO(new Bundle() {
    // input, only from load for now
    val ld_in = Flipped(Vec(LoadPipelineWidth, ValidIO(new CounterFilterDataBundle())))
    val query = Flipped(Vec(LoadPipelineWidth, new CounterFilterQueryBundle()))
  })

  val LduStages = 4
  val SIZE = LduStages * LduCnt
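  // Pointers are utility.CircularQueuePtr instances: a (flag, value) pair whose flag
  // toggles on each wrap-around, keeping order comparisons between pointers correct.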
  class Ptr(implicit p: Parameters) extends CircularQueuePtr[Ptr](p => SIZE) {}
  object Ptr {
    def apply(f: Bool, v: UInt)(implicit p: Parameters): Ptr = {
      val ptr = Wire(new Ptr)
      ptr.flag := f
      ptr.value := v
      ptr
    }
  }

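  // FIFO storage: each entry records the set & way a load wrote at s3; an entry stays
  // valid for three cycles so that younger in-flight loads can detect duplicates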
  val entries = RegInit(VecInit(Seq.fill(SIZE){ 0.U.asTypeOf(new CounterFilterDataBundle()) }))
  val valids = RegInit(VecInit(Seq.fill(SIZE){ false.B }))

  // enq
  val enqLen = LduCnt
  val deqLen = LduCnt
  val enqPtrExt = RegInit(VecInit((0 until enqLen).map(_.U.asTypeOf(new Ptr))))
  val deqPtrExt = RegInit(VecInit((0 until deqLen).map(_.U.asTypeOf(new Ptr))))

  val deqPtr = WireInit(deqPtrExt(0).value)

  val reqs_l = io.ld_in.map(_.bits)
  val reqs_vl = io.ld_in.map(_.valid)
  val needAlloc = Wire(Vec(enqLen, Bool()))
  val canAlloc = Wire(Vec(enqLen, Bool()))
  val last3CycleAlloc = RegInit(0.U(log2Ceil(LduCnt + 1).W))

  for(i <- 0 until enqLen) {
    val req = reqs_l(i)
    val req_v = reqs_vl(i)
    val index = PopCount(needAlloc.take(i))
    val allocPtr = enqPtrExt(index)

    needAlloc(i) := req_v
    canAlloc(i) := needAlloc(i) && allocPtr >= deqPtrExt(0)

    when(canAlloc(i)) {
      valids(allocPtr.value) := true.B
      entries(allocPtr.value) := req
    }

    assert(!needAlloc(i) || canAlloc(i), s"port${i} cannot accept a CounterFilter enq request, check if SIZE >= (Ldu stages - 2) * LduCnt")
  }
  val allocNum = PopCount(canAlloc)

  enqPtrExt.foreach{ x =>
    when(canAlloc.asUInt.orR){
      x := x + allocNum
    }
  }
  last3CycleAlloc := RegNext(RegNext(allocNum))
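  // allocNum is delayed by three cycles in total (two RegNext stages plus this
  // register's own update), so an allocated entry is dequeued three cycles later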

  // deq
  for(i <- 0 until deqLen) {
    when(i.U < last3CycleAlloc) {
      valids(deqPtrExt(i).value) := false.B
    }
  }

  deqPtrExt.foreach{ x => x := x + last3CycleAlloc }

  // query
  val querys_l = io.query.map(_.req.bits)
  val querys_vl = io.query.map(_.req.valid)
  for(i <- 0 until LduCnt + HyuCnt) {
    val q = querys_l(i)
    val q_v = querys_vl(i)

    val entry_match = Cat(entries.zip(valids).map {
      case (e, v) => v && (q.idx === e.idx) && (q.way === e.way)
    }).orR

    io.query(i).resp := q_v && entry_match
  }

  XSPerfAccumulate("req_nums", PopCount(io.query.map(_.req.valid)))
  XSPerfAccumulate("req_set_way_match", PopCount(io.query.map(_.resp)))
}

class BloomQueryBundle(n: Int)(implicit p: Parameters) extends DCacheBundle {
  val addr = UInt(BLOOMADDRWIDTH.W)

  def BLOOMADDRWIDTH = log2Ceil(n)

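  // Fold the block-aligned physical address into a filter index by XOR-ing two
  // adjacent BLOOMADDRWIDTH-wide slices of it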
  def get_addr(paddr: UInt): UInt = {
    assert(paddr.getWidth == PAddrBits)
    assert(paddr.getWidth >= (blockOffBits + 2 * BLOOMADDRWIDTH))
    val block_paddr = paddr(paddr.getWidth - 1, blockOffBits)
    val low_part = block_paddr(BLOOMADDRWIDTH - 1, 0)
    val high_part = block_paddr(2 * BLOOMADDRWIDTH - 1, BLOOMADDRWIDTH)
    low_part ^ high_part
  }
}

class BloomRespBundle(implicit p: Parameters) extends DCacheBundle {
  val res = Bool()
}

class BloomFilter(n: Int, bypass: Boolean = true)(implicit p: Parameters) extends DCacheModule {
  val io = IO(new DCacheBundle {
    val set = Flipped(ValidIO(new BloomQueryBundle(n)))
    val clr = Flipped(ValidIO(new BloomQueryBundle(n)))
    val query = Vec(LoadPipelineWidth, Flipped(ValidIO(new BloomQueryBundle(n))))
    val resp = Vec(LoadPipelineWidth, ValidIO(new BloomRespBundle))
  })

  val data = RegInit(0.U(n.W))
  val data_next = Wire(Vec(n, Bool()))

  for (i <- 0 until n) {
    when(io.clr.valid && i.U === io.clr.bits.addr) {
      data_next(i) := false.B
    }.elsewhen(io.set.valid && i.U === io.set.bits.addr) {
      data_next(i) := true.B
    }.otherwise {
      data_next(i) := data(i).asBool
    }
  }

  // resp becomes valid in the next cycle
  for(i <- 0 until LoadPipelineWidth) {
    io.resp(i).valid := GatedValidRegNext(io.query(i).valid)
    if(bypass) {
      io.resp(i).bits.res := RegEnable(data_next(io.query(i).bits.addr), io.query(i).valid)
    } else {
      io.resp(i).bits.res := RegEnable(data(io.query(i).bits.addr), io.query(i).valid)
    }
  }

  data := data_next.asUInt

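  // set and clr each touch at most one bit per cycle, so data may change by at most two bits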
  assert(PopCount(data ^ data_next.asUInt) <= 2.U)

  XSPerfHistogram("valid_nums", PopCount(data), true.B, 0, n + 1, 20)
}
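
// A minimal wiring sketch (hypothetical names such as evict_fire and load_paddr; the
// real integration lives in the DCache). Following the FDP pollution scheme, set marks
// a line evicted because of a prefetch, clr removes it, and demand misses query the
// filter: a hit suggests the miss was caused by a prefetch-induced eviction.
//   val bloom = Module(new BloomFilter(n = 4096, bypass = true))
//   bloom.io.set.valid          := evict_fire
//   bloom.io.set.bits.addr      := bloom.io.set.bits.get_addr(evict_paddr)
//   bloom.io.clr.valid          := refill_fire
//   bloom.io.clr.bits.addr      := bloom.io.clr.bits.get_addr(refill_paddr)
//   bloom.io.query(i).valid     := load_first_miss(i)
//   bloom.io.query(i).bits.addr := bloom.io.query(i).bits.get_addr(load_paddr(i))
//   // one cycle later, bloom.io.resp(i).bits.res === true.B indicates likely pollution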

class FDPrefetcherMonitorBundle()(implicit p: Parameters) extends XSBundle {
  val refill = Input(Bool()) // from refill pipe, fire
  val accuracy = new XSBundle {
    val total_prefetch = Input(Bool()) // from mshr enq: fire, alloc, prefetch
    val useful_prefetch = Vec(LoadPipelineWidth, Input(Bool())) // from load pipeline, prefetch hit
  }

  val timely = new XSBundle {
    val late_prefetch = Input(Bool()) // from mshr enq, a load matches an mshr allocated by a prefetch
  }

  val pollution = new XSBundle {
    val demand_miss = Vec(LoadPipelineWidth, Input(Bool())) // from load pipeline, first miss
    val cache_pollution = Vec(LoadPipelineWidth, Input(Bool())) // from load pipeline, first miss caused by pollution
  }

  val pf_ctrl = Output(new PrefetchControlBundle)
  val debugRolling = Flipped(new RobDebugRollingIO)
}

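// Tracks the three feedback metrics from the FDP paper over refill-based windows:
// accuracy (useful vs. total prefetches), timeliness (late prefetches) and cache
// pollution (demand misses caused by prefetched lines evicting useful data)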
class FDPrefetcherMonitor()(implicit p: Parameters) extends XSModule {
  val io = IO(new FDPrefetcherMonitorBundle)

  val INTERVAL = 8192
  val CNTWIDTH = log2Up(INTERVAL) + 1
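  // one observation window lasts INTERVAL refills; the extra counter bit lets
  // refill_cnt represent INTERVAL itself and gives the event counters headroom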

  io.pf_ctrl := DontCare

  val refill_cnt = RegInit(0.U(CNTWIDTH.W))

  val total_prefetch_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val useful_prefetch_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val late_prefetch_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val demand_miss_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val pollution_prev_cnt = RegInit(0.U(CNTWIDTH.W))
  val prev_cnts = Seq(total_prefetch_prev_cnt, useful_prefetch_prev_cnt, late_prefetch_prev_cnt, demand_miss_prev_cnt, pollution_prev_cnt)

  val total_prefetch_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val useful_prefetch_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val late_prefetch_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val demand_miss_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val pollution_interval_cnt = RegInit(0.U(CNTWIDTH.W))
  val interval_cnts = Seq(total_prefetch_interval_cnt, useful_prefetch_interval_cnt, late_prefetch_interval_cnt, demand_miss_interval_cnt, pollution_interval_cnt)

  val interval_trigger = refill_cnt === INTERVAL.U

  val io_ens = Seq(io.accuracy.total_prefetch, io.accuracy.useful_prefetch, io.timely.late_prefetch, io.pollution.demand_miss, io.pollution.cache_pollution)

  for((interval, en) <- interval_cnts.zip(io_ens)) {
    interval := interval + PopCount(en.asUInt)
  }

  when(io.refill) {
    refill_cnt := refill_cnt + 1.U
  }

  when(interval_trigger) {
    refill_cnt := 0.U
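    // ageing, as in the FDP scheme: carry half of the previous running count plus half
    // of this window's count into the next window (an exponential moving average
    // implemented with right shifts), then clear the interval counters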
    for((prev, interval) <- prev_cnts.zip(interval_cnts)) {
      prev := Cat(0.U(1.W), prev(prev.getWidth - 1, 1)) + Cat(0.U(1.W), interval(interval.getWidth - 1, 1))
      interval := 0.U
    }
  }

  // rolling counters, windowed by committed instructions
  XSPerfRolling(
    "L1PrefetchAccuracyIns",
    PopCount(io.accuracy.useful_prefetch), PopCount(io.accuracy.total_prefetch),
    1000, io.debugRolling.robTrueCommit, clock, reset
  )

  XSPerfRolling(
    "L1PrefetchLatenessIns",
    PopCount(io.timely.late_prefetch), PopCount(io.accuracy.total_prefetch),
    1000, io.debugRolling.robTrueCommit, clock, reset
  )

  XSPerfRolling(
    "L1PrefetchPollutionIns",
    PopCount(io.pollution.cache_pollution), PopCount(io.pollution.demand_miss),
    1000, io.debugRolling.robTrueCommit, clock, reset
  )

  XSPerfRolling(
    "IPCIns",
    io.debugRolling.robTrueCommit, 1.U,
    1000, io.debugRolling.robTrueCommit, clock, reset
  )

  XSPerfAccumulate("io_refill", io.refill)
  XSPerfAccumulate("total_prefetch_en", io.accuracy.total_prefetch)
  XSPerfAccumulate("useful_prefetch_en", PopCount(io.accuracy.useful_prefetch) + io.timely.late_prefetch)
  XSPerfAccumulate("late_prefetch_en", io.timely.late_prefetch)
  XSPerfAccumulate("demand_miss_en", PopCount(io.pollution.demand_miss))
  XSPerfAccumulate("cache_pollution_en", PopCount(io.pollution.cache_pollution))
}
303}