xref: /XiangShan/src/main/scala/xiangshan/mem/vector/VecCommon.scala (revision 688cc4e80703c22b2cd3570804f946813c224b12)
/***************************************************************************************
  * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
  * Copyright (c) 2020-2021 Peng Cheng Laboratory
  *
  * XiangShan is licensed under Mulan PSL v2.
  * You can use this software according to the terms and conditions of the Mulan PSL v2.
  * You may obtain a copy of Mulan PSL v2 at:
  *          http://license.coscl.org.cn/MulanPSL2
  *
  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  *
  * See the Mulan PSL v2 for more details.
  ***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles._
import xiangshan.backend.fu.FuType
import xiangshan.backend.fu.vector.Bundles.VEew

/**
  * Commonly used parameters and functions in the VLSU.
  */
trait VLSUConstants {
  val VLEN = 128
  // for packing unit-stride flows
  val AlignedNum = 4 // 1/2/4/8
  def VLENB = VLEN/8
  def vOffsetBits = log2Up(VLENB) // bit width to index an offset inside a vector reg
  lazy val vlmBindexBits = 8 // will be overridden later
  lazy val vsmBindexBits = 8 // will be overridden later

  def alignTypes = 5 // eew/sew = 1/2/4/8 bytes; the last type indicates a 128-bit element
  def alignTypeBits = log2Up(alignTypes)
  def maxMUL = 8
  def maxFields = 8
  /**
    * In the most extreme cases, such as a segment indexed instruction with eew=64, emul=8, sew=8, lmul=1,
    * and nf=8, each data reg is mapped to 8 index regs and there are 8 data regs in total,
    * one for each field. Therefore an instruction can be divided into 64 uops at most.
    */
  def maxUopNum = maxMUL * maxFields // 64
  def maxFlowNum = 16
  def maxElemNum = maxMUL * maxFlowNum // 128
  // def uopIdxBits = log2Up(maxUopNum) // to index a uop inside a robIdx
  def elemIdxBits = log2Up(maxElemNum) + 1 // to index an element within an instruction
  def flowIdxBits = log2Up(maxFlowNum) + 1 // to index a flow within a uop
  def fieldBits = log2Up(maxFields) + 1 // 4 bits to indicate 1~8

  def ewBits = 3 // bit width of EEW/SEW
  def mulBits = 3 // bit width of emul/lmul

  def getSlice(data: UInt, i: Int, alignBits: Int): UInt = {
    require(data.getWidth >= (i+1) * alignBits)
    data((i+1) * alignBits - 1, i * alignBits)
  }
  def getNoAlignedSlice(data: UInt, i: Int, alignBits: Int): UInt = {
    data(i * 8 + alignBits - 1, i * 8)
  }

  def getByte(data: UInt, i: Int = 0) = getSlice(data, i, 8)
  def getHalfWord(data: UInt, i: Int = 0) = getSlice(data, i, 16)
  def getWord(data: UInt, i: Int = 0) = getSlice(data, i, 32)
  def getDoubleWord(data: UInt, i: Int = 0) = getSlice(data, i, 64)
  def getDoubleDoubleWord(data: UInt, i: Int = 0) = getSlice(data, i, 128)
}
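
/**
  * Illustrative sketch (an assumption for exposition, not used by the pipeline):
  * pulling fixed slices out of a 128-bit vector register with the helpers above.
  */
private class SliceExample extends Module with VLSUConstants {
  val io = IO(new Bundle {
    val data  = Input(UInt(VLEN.W))
    val byte2 = Output(UInt(8.W))   // bits 23..16 of data
    val word1 = Output(UInt(32.W))  // bits 63..32 of data
  })
  io.byte2 := getByte(io.data, 2)
  io.word1 := getWord(io.data, 1)
}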

trait HasVLSUParameters extends HasXSParameter with VLSUConstants {
  override val VLEN = coreParams.VLEN
  override lazy val vlmBindexBits = log2Up(coreParams.VlMergeBufferSize)
  override lazy val vsmBindexBits = log2Up(coreParams.VsMergeBufferSize)
  lazy val maxMemByteNum = 16 // maximum number of bytes in a single memory access
  /**
   * Get the low bits of an address, used for alignment checking.
   * @param addr Address to be checked
   * @param width Width for checking alignment
   */
  def getCheckAddrLowBits(addr: UInt, width: Int): UInt = addr(log2Up(width) - 1, 0)
  def getOverflowBit(in: UInt, width: Int): UInt = in(log2Up(width))
  def isUnitStride(instType: UInt) = instType(1, 0) === "b00".U
  def isStrided(instType: UInt) = instType(1, 0) === "b10".U
  def isIndexed(instType: UInt) = instType(0) === "b1".U
  def isNotIndexed(instType: UInt) = instType(0) === "b0".U
  def isSegment(instType: UInt) = instType(2) === "b1".U
  def is128Bit(alignedType: UInt) = alignedType(2) === "b1".U

  def mergeDataWithMask(oldData: UInt, newData: UInt, mask: UInt): Vec[UInt] = {
    require(oldData.getWidth == newData.getWidth)
    require(oldData.getWidth == mask.getWidth * 8)
    VecInit(mask.asBools.zipWithIndex.map { case (en, i) =>
      Mux(en, getByte(newData, i), getByte(oldData, i))
    })
  }
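
  /**
    * Illustrative sketch (an assumption for exposition, not used by the pipeline):
    * overwrite only the low two bytes of oldData with the corresponding bytes of
    * newData, using mergeDataWithMask. Both inputs are assumed to be VLEN bits.
    */
  def mergeLowTwoBytesExample(oldData: UInt, newData: UInt): UInt = {
    require(oldData.getWidth == VLEN && newData.getWidth == VLEN)
    val mask = 3.U(VLENB.W) // mask bits 0 and 1 set: replace bytes 0 and 1 only
    mergeDataWithMask(oldData, newData, mask).asUInt
  }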

  // def asBytes(data: UInt) = {
  //   require(data.getWidth % 8 == 0)
  //   (0 until data.getWidth/8).map(i => getByte(data, i))
  // }

  def mergeDataWithElemIdx(
    oldData: UInt,
    newData: Seq[UInt],
    alignedType: UInt,
    elemIdx: Seq[UInt],
    valids: Seq[Bool]
  ): UInt = {
    require(newData.length == elemIdx.length)
    require(newData.length == valids.length)
    LookupTree(alignedType, List(
      "b00".U -> VecInit(elemIdx.map(e => UIntToOH(e(3, 0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getByte(oldData, i) +: newData.map(getByte(_))
        )}).asUInt,
      "b01".U -> VecInit(elemIdx.map(e => UIntToOH(e(2, 0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getHalfWord(oldData, i) +: newData.map(getHalfWord(_))
        )}).asUInt,
      "b10".U -> VecInit(elemIdx.map(e => UIntToOH(e(1, 0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getWord(oldData, i) +: newData.map(getWord(_))
        )}).asUInt,
      "b11".U -> VecInit(elemIdx.map(e => UIntToOH(e(0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getDoubleWord(oldData, i) +: newData.map(getDoubleWord(_))
        )}).asUInt
    ))
  }

  def mergeDataWithElemIdx(oldData: UInt, newData: UInt, alignedType: UInt, elemIdx: UInt): UInt = {
    mergeDataWithElemIdx(oldData, Seq(newData), alignedType, Seq(elemIdx), Seq(true.B))
  }
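
  /**
    * Illustrative sketch (an assumption for exposition, not used by the pipeline):
    * write one 32-bit element into a 128-bit register image, leaving the other
    * three words intact.
    */
  def mergeOneWordExample(oldData: UInt, newData: UInt): UInt = {
    // alignedType "b10" selects 32-bit granularity; element 2 occupies bits 95..64
    mergeDataWithElemIdx(oldData, newData, "b10".U, 2.U(elemIdxBits.W))
  }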
  /**
    * Merges 128-bit unit-stride data, byte by byte.
    */
  object mergeDataByByte{
    def apply(oldData: UInt, newData: UInt, mask: UInt): UInt = {
      val selVec = Seq(mask).map(_.asBools).transpose
      VecInit(selVec.zipWithIndex.map{ case (selV, i) =>
        ParallelPosteriorityMux(
          true.B +: selV.map(x => x),
          getByte(oldData, i) +: Seq(getByte(newData, i))
        )}).asUInt
    }
  }

  /**
    * Merges unit-stride data into 256 bits, i.e. merges 128-bit data into a 256-bit result.
    * With 3 ports:
    *   port0 is a 6-to-1 multiplexer -> (128'b0, data) or (data, 128'b0) or (data, port2data) or (port2data, data) or (data, port3data) or (port3data, data)
    *   port1 is a 4-to-1 multiplexer -> (128'b0, data) or (data, 128'b0) or (data, port3data) or (port3data, data)
    *   port3 is a 2-to-1 multiplexer -> (128'b0, data) or (data, 128'b0)
    */
  object mergeDataByIndex{
    def apply(data:  Seq[UInt], mask: Seq[UInt], index: UInt, valids: Seq[Bool]): (UInt, UInt) = {
      require(data.length == valids.length)
      require(data.length == mask.length)
      val muxLength = data.length
      val selDataMatrix = Wire(Vec(muxLength, Vec(2, UInt((VLEN * 2).W)))) // 3 * 2 * 256
      val selMaskMatrix = Wire(Vec(muxLength, Vec(2, UInt((VLENB * 2).W)))) // 3 * 2 * 16

      if (backendParams.debugEn){
        dontTouch(selDataMatrix)
        dontTouch(selMaskMatrix)
      }

      for(i <- 0 until muxLength){
        if(i == 0){
          selDataMatrix(i)(0) := Cat(0.U(VLEN.W), data(i))
          selDataMatrix(i)(1) := Cat(data(i), 0.U(VLEN.W))
          selMaskMatrix(i)(0) := Cat(0.U(VLENB.W), mask(i))
          selMaskMatrix(i)(1) := Cat(mask(i), 0.U(VLENB.W))
        }
        else{
          selDataMatrix(i)(0) := Cat(data(i), data(0))
          selDataMatrix(i)(1) := Cat(data(0), data(i))
          selMaskMatrix(i)(0) := Cat(mask(i), mask(0))
          selMaskMatrix(i)(1) := Cat(mask(0), mask(i))
        }
      }
      val selIdxVec = (0 until muxLength).map(_.U)
      val selIdx    = PriorityMux(valids.reverse, selIdxVec.reverse)

      val selData = Mux(index === 0.U,
                        selDataMatrix(selIdx)(0),
                        selDataMatrix(selIdx)(1))
      val selMask = Mux(index === 0.U,
                        selMaskMatrix(selIdx)(0),
                        selMaskMatrix(selIdx)(1))
      (selData, selMask)
    }
  }
  def mergeDataByIndex(data:  UInt, mask: UInt, index: UInt): (UInt, UInt) = {
    mergeDataByIndex(Seq(data), Seq(mask), index, Seq(true.B))
  }
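
  /**
    * Illustrative sketch (an assumption for exposition, not used by the pipeline):
    * place one 128-bit flow into the low (index = 0) or high (index = 1) half of
    * a 256-bit merge-buffer entry.
    */
  def mergeOneFlowExample(flowData: UInt, flowMask: UInt, index: UInt): (UInt, UInt) = {
    require(flowData.getWidth == VLEN && flowMask.getWidth == VLENB)
    mergeDataByIndex(flowData, flowMask, index)
  }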
}
abstract class VLSUModule(implicit p: Parameters) extends XSModule
  with HasVLSUParameters
  with HasCircularQueuePtrHelper
abstract class VLSUBundle(implicit p: Parameters) extends XSBundle
  with HasVLSUParameters

class VLSUBundleWithMicroOp(implicit p: Parameters) extends VLSUBundle {
  val uop = new DynInst
}

class OnlyVecExuOutput(implicit p: Parameters) extends VLSUBundle {
  val isvec = Bool()
  val vecdata = UInt(VLEN.W)
  val mask = UInt(VLENB.W)
  // val rob_idx_valid = Vec(2, Bool())
  // val inner_idx = Vec(2, UInt(3.W))
  // val rob_idx = Vec(2, new RobPtr)
  // val offset = Vec(2, UInt(4.W))
  val reg_offset = UInt(vOffsetBits.W)
  val vecActive = Bool() // 1: active vector element, 0: inactive vector element
  val is_first_ele = Bool()
  val elemIdx = UInt(elemIdxBits.W) // element index
  val elemIdxInsideVd = UInt(elemIdxBits.W) // element index within the scope of vd
  val trigger = TriggerAction()
  val vstart         = UInt(elemIdxBits.W)
  val vecTriggerMask = UInt((VLEN/8).W)
  // val uopQueuePtr = new VluopPtr
  // val flowPtr = new VlflowPtr
}

class VecExuOutput(implicit p: Parameters) extends MemExuOutput with HasVLSUParameters {
  val vec = new OnlyVecExuOutput
  val alignedType       = UInt(alignTypeBits.W)
  // feedback
  val vecFeedback       = Bool()
}

class VecUopBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
  val flowMask       = UInt(VLENB.W) // each bit for a flow
  val byteMask       = UInt(VLENB.W) // each bit for a byte
  val data           = UInt(VLEN.W)
  // val fof            = Bool() // fof is only used for vector loads
  val excp_eew_index = UInt(elemIdxBits.W)
  // val exceptionVec   = ExceptionVec() // uop has exceptionVec
  val baseAddr = UInt(VAddrBits.W)
  val stride = UInt(VLEN.W)
  val flow_counter = UInt(flowIdxBits.W)

  // instruction decode result
  val flowNum = UInt(flowIdxBits.W) // # of flows in a uop
  // val flowNumLog2 = UInt(log2Up(flowIdxBits).W) // log2(flowNum), for better timing of multiplication
  val nfields = UInt(fieldBits.W) // NFIELDS
  val vm = Bool() // whether vector masking is enabled
  val usWholeReg = Bool() // unit-stride, whole register load
  val usMaskReg = Bool() // unit-stride, masked store/load
  val eew = VEew() // size of memory elements
  val sew = UInt(ewBits.W)
  val emul = UInt(mulBits.W)
  val lmul = UInt(mulBits.W)
  val vlmax = UInt(elemIdxBits.W)
  val instType = UInt(3.W)
  val vd_last_uop = Bool()
  val vd_first_uop = Bool()
}

class VecFlowBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
  val vaddr             = UInt(VAddrBits.W)
  val mask              = UInt(VLENB.W)
  val alignedType       = UInt(alignTypeBits.W)
  val vecActive         = Bool()
  val elemIdx           = UInt(elemIdxBits.W)
  val is_first_ele      = Bool()

  // pack
  val isPackage         = Bool()
  val packageNum        = UInt((log2Up(VLENB) + 1).W)
  val originAlignedType = UInt(alignTypeBits.W)
}

class VecMemExuOutput(isVector: Boolean = false)(implicit p: Parameters) extends VLSUBundle{
  val output = new MemExuOutput(isVector)
  val vecFeedback = Bool()
  val nc = Bool()
  val mmio = Bool()
  val usSecondInv = Bool()
  val hasException = Bool()
  val elemIdx = UInt(elemIdxBits.W)
  val alignedType = UInt(alignTypeBits.W)
  val mbIndex     = UInt(vsmBindexBits.W)
  val mask        = UInt(VLENB.W)
  val vaddr       = UInt(XLEN.W)
  val vaNeedExt   = Bool()
  val gpaddr      = UInt(GPAddrBits.W)
  val isForVSnonLeafPTE = Bool()
  val vecTriggerMask = UInt((VLEN/8).W)
}

object MulNum {
  def apply (mul: UInt): UInt = { // mul means emul or lmul
    (LookupTree(mul,List(
      "b101".U -> 1.U , // 1/8
      "b110".U -> 1.U , // 1/4
      "b111".U -> 1.U , // 1/2
      "b000".U -> 1.U , // 1
      "b001".U -> 2.U , // 2
      "b010".U -> 4.U , // 4
      "b011".U -> 8.U   // 8
    )))}
}
/**
  * When emul is greater than or equal to 1, the entire register needs to be written;
  * otherwise, only the specified number of bytes is written. */
object MulDataSize {
  def apply (mul: UInt): UInt = { // mul means emul or lmul
    (LookupTree(mul,List(
      "b101".U -> 2.U  , // 1/8
      "b110".U -> 4.U  , // 1/4
      "b111".U -> 8.U  , // 1/2
      "b000".U -> 16.U , // 1
      "b001".U -> 16.U , // 2
      "b010".U -> 16.U , // 4
      "b011".U -> 16.U   // 8
    )))}
}

object OneRegNum {
  def apply (eew: UInt): UInt = { // number of elements in one vector register for a given eew
    require(eew.getWidth == 2, "The eew width must be 2.")
    (LookupTree(eew, List(
      "b00".U -> 16.U , // 1
      "b01".U ->  8.U , // 2
      "b10".U ->  4.U , // 4
      "b11".U ->  2.U   // 8
    )))}
}
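
/**
  * Illustrative software reference model (an assumption for exposition, not part
  * of the RTL): MulDataSize maps log2(emul/lmul), encoded in 3-bit two's
  * complement, to bytes written per uop; OneRegNum gives elements per register.
  */
private object MulDataSizeModel {
  // mulLog2 in [-3, 3]: fractional muls fill only part of the 16-byte register
  def bytesPerUop(mulLog2: Int): Int = if (mulLog2 >= 0) 16 else 16 >> (-mulLog2)
  def elemsPerReg(eewBytesLog2: Int): Int = 16 >> eewBytesLog2
  // bytesPerUop(-3) == 2 ("b101"), bytesPerUop(1) == 16 ("b001")
  // elemsPerReg(3) == 2 (eew = 64 bits)
}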

// bytes read per element for indexed instructions
object SewDataSize {
  def apply (sew: UInt): UInt = {
    (LookupTree(sew,List(
      "b000".U -> 1.U , // 1
      "b001".U -> 2.U , // 2
      "b010".U -> 4.U , // 4
      "b011".U -> 8.U   // 8
    )))}
}

// bytes read per element for strided instructions
object EewDataSize {
  def apply (eew: UInt): UInt = {
    require(eew.getWidth == 2, "The eew width must be 2.")
    (LookupTree(eew, List(
      "b00".U -> 1.U , // 1
      "b01".U -> 2.U , // 2
      "b10".U -> 4.U , // 4
      "b11".U -> 8.U   // 8
    )))}
}

object loadDataSize {
  def apply (instType: UInt, emul: UInt, eew: UInt, sew: UInt): UInt = {
    (LookupTree(instType,List(
      "b000".U ->  MulDataSize(emul), // unit-stride
      "b010".U ->  EewDataSize(eew)  , // strided
      "b001".U ->  SewDataSize(sew)  , // indexed-unordered
      "b011".U ->  SewDataSize(sew)  , // indexed-ordered
      "b100".U ->  EewDataSize(eew)  , // segment unit-stride
      "b110".U ->  EewDataSize(eew)  , // segment strided
      "b101".U ->  SewDataSize(sew)  , // segment indexed-unordered
      "b111".U ->  SewDataSize(sew)    // segment indexed-ordered
    )))}
}

object storeDataSize {
  def apply (instType: UInt, eew: UInt, sew: UInt): UInt = {
    (LookupTree(instType,List(
      "b000".U ->  EewDataSize(eew)  , // unit-stride, do not use
      "b010".U ->  EewDataSize(eew)  , // strided
      "b001".U ->  SewDataSize(sew)  , // indexed-unordered
      "b011".U ->  SewDataSize(sew)  , // indexed-ordered
      "b100".U ->  EewDataSize(eew)  , // segment unit-stride
      "b110".U ->  EewDataSize(eew)  , // segment strided
      "b101".U ->  SewDataSize(sew)  , // segment indexed-unordered
      "b111".U ->  SewDataSize(sew)    // segment indexed-ordered
    )))}
}

/**
  * These are used to obtain the address offsets for indexed instructions. */
object EewEq8 {
  def apply(index:UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx,List(
      0.U  -> index(7 ,0   ),
      1.U  -> index(15,8   ),
      2.U  -> index(23,16  ),
      3.U  -> index(31,24  ),
      4.U  -> index(39,32  ),
      5.U  -> index(47,40  ),
      6.U  -> index(55,48  ),
      7.U  -> index(63,56  ),
      8.U  -> index(71,64  ),
      9.U  -> index(79,72  ),
      10.U -> index(87,80  ),
      11.U -> index(95,88  ),
      12.U -> index(103,96 ),
      13.U -> index(111,104),
      14.U -> index(119,112),
      15.U -> index(127,120)
    )))}
}

object EewEq16 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U -> index(15, 0),
      1.U -> index(31, 16),
      2.U -> index(47, 32),
      3.U -> index(63, 48),
      4.U -> index(79, 64),
      5.U -> index(95, 80),
      6.U -> index(111, 96),
      7.U -> index(127, 112)
    )))}
}

object EewEq32 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U -> index(31, 0),
      1.U -> index(63, 32),
      2.U -> index(95, 64),
      3.U -> index(127, 96)
    )))}
}

object EewEq64 {
  def apply (index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U -> index(63, 0),
      1.U -> index(127, 64)
    )))}
}

object IndexAddr {
  def apply (index: UInt, flow_inner_idx: UInt, eew: UInt): UInt = {
    require(eew.getWidth == 2, "The eew width must be 2.")
    (LookupTree(eew, List(
      "b00".U -> EewEq8 (index = index, flow_inner_idx = flow_inner_idx ), // Imm is 1 Byte // TODO: index may cross registers
      "b01".U -> EewEq16(index = index, flow_inner_idx = flow_inner_idx ), // Imm is 2 Byte
      "b10".U -> EewEq32(index = index, flow_inner_idx = flow_inner_idx ), // Imm is 4 Byte
      "b11".U -> EewEq64(index = index, flow_inner_idx = flow_inner_idx )  // Imm is 8 Byte
    )))}
}
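
/**
  * Illustrative software reference model (an assumption for exposition, not part
  * of the RTL): IndexAddr slices the flow_inner_idx-th index element out of the
  * 128-bit index register, with the element size given by eew.
  */
private object IndexAddrModel {
  def indexOffset(index: BigInt, flowInnerIdx: Int, eewBytes: Int): BigInt = {
    val bits = eewBytes * 8
    (index >> (flowInnerIdx * bits)) & ((BigInt(1) << bits) - 1)
  }
  // eewBytes = 2, flowInnerIdx = 2 -> index(47, 32), matching EewEq16
}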

object Log2Num {
  def apply (num: UInt): UInt = {
    (LookupTree(num,List(
      16.U -> 4.U,
      8.U  -> 3.U,
      4.U  -> 2.U,
      2.U  -> 1.U,
      1.U  -> 0.U
    )))}
}

object GenUopIdxInField {
  /**
   * Used for normal vector instructions.
   * */
  def apply (instType: UInt, emul: UInt, lmul: UInt, uopIdx: UInt): UInt = {
    val isIndexed = instType(0)
    val mulInField = Mux(
      isIndexed,
      Mux(lmul.asSInt > emul.asSInt, lmul, emul),
      emul
    )
    LookupTree(mulInField, List(
      "b101".U -> 0.U,
      "b110".U -> 0.U,
      "b111".U -> 0.U,
      "b000".U -> 0.U,
      "b001".U -> uopIdx(0),
      "b010".U -> uopIdx(1, 0),
      "b011".U -> uopIdx(2, 0)
    ))
  }
  /**
   * Only used for segment instructions.
   * */
  def apply (select: UInt, uopIdx: UInt): UInt = {
    LookupTree(select, List(
      "b101".U -> 0.U,
      "b110".U -> 0.U,
      "b111".U -> 0.U,
      "b000".U -> 0.U,
      "b001".U -> uopIdx(0),
      "b010".U -> uopIdx(1, 0),
      "b011".U -> uopIdx(2, 0)
    ))
  }
}
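
/**
  * Illustrative software reference model (an assumption for exposition, not part
  * of the RTL): the uop index inside a field is the low log2(mul) bits of uopIdx;
  * for fractional or unit muls each field holds a single uop, so it is 0.
  */
private object UopIdxInFieldModel {
  def apply(mulLog2: Int, uopIdx: Int): Int =
    if (mulLog2 <= 0) 0 else uopIdx & ((1 << mulLog2) - 1)
  // apply(3, 13) == 5: uop 13 is uop 5 within its field when mul = 8
}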

// eew decode
object EewLog2 extends VLSUConstants {
  // def apply (eew: UInt): UInt = {
  //   (LookupTree(eew,List(
  //     "b000".U -> "b000".U , // 1
  //     "b101".U -> "b001".U , // 2
  //     "b110".U -> "b010".U , // 4
  //     "b111".U -> "b011".U   // 8
  //   )))}
  def apply(eew: UInt): UInt = {
    require(eew.getWidth == 2, "The eew width must be 2.")
    ZeroExt(eew, ewBits)
  }
}

object GenRealFlowNum {
  /**
   * Unit-stride instructions do not use this method;
   * other instructions generate realFlowNum as EmulDataSize >> eew, where
   * EmulDataSize is the number of bytes that need to be written to the register
   * and eew is the number of bytes written per access.
   *
   * @param instType As the name implies.
   * @param emul As the name implies.
   * @param lmul As the name implies.
   * @param eew As the name implies.
   * @param sew As the name implies.
   * @param isSegment Only modules related to segment need to set this to true.
   * @return FlowNum of the instruction.
   *
   */
  def apply (instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt, isSegment: Boolean = false): UInt = {
    require(instType.getWidth == 3, "The instType width must be 3, (isSegment, mop)")
    require(eew.getWidth == 2, "The eew width must be 2.")
    // The new SegmentUnit handles segment instructions, but the previous implementation is retained for the time being as a fallback.
    val segmentIndexFlowNum =  if (isSegment) (MulDataSize(lmul) >> sew(1,0)).asUInt
    else Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt)
    (LookupTree(instType,List(
      "b000".U ->  (MulDataSize(emul) >> eew).asUInt, // unit-stride: store uses this, load does not
      "b010".U ->  (MulDataSize(emul) >> eew).asUInt, // strided
      "b001".U ->  Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt), // indexed-unordered
      "b011".U ->  Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt), // indexed-ordered
      "b100".U ->  (MulDataSize(emul) >> eew).asUInt, // segment unit-stride
      "b110".U ->  (MulDataSize(emul) >> eew).asUInt, // segment strided
      "b101".U ->  segmentIndexFlowNum, // segment indexed-unordered
      "b111".U ->  segmentIndexFlowNum  // segment indexed-ordered
    )))}
}
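
/**
  * Illustrative worked example (an assumption for exposition, not part of the
  * RTL): realFlowNum = bytes-to-write >> log2(bytes-per-access). A strided load
  * with emul = 4 and eew = 16 bits gives 16 >> 1 = 8 flows per uop
  * (MulDataSize caps at one full 16-byte register per uop).
  */
private object RealFlowNumModel {
  def apply(mulBytes: Int, eewBytesLog2: Int): Int = mulBytes >> eewBytesLog2
  // apply(16, 1) == 8
}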

object GenRealFlowLog2 extends VLSUConstants {
  /**
   * GenRealFlowLog2 = Log2(GenRealFlowNum)
   *
   * @param instType As the name implies.
   * @param emul As the name implies.
   * @param lmul As the name implies.
   * @param eew As the name implies.
   * @param sew As the name implies.
   * @param isSegment Only modules related to segment need to set this to true.
   * @return FlowNumLog2 of the instruction.
   */
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt, isSegment: Boolean = false): UInt = {
    require(instType.getWidth == 3, "The instType width must be 3, (isSegment, mop)")
    require(eew.getWidth == 2, "The eew width must be 2.")
    val emulLog2 = Mux(emul.asSInt >= 0.S, 0.U, emul)
    val lmulLog2 = Mux(lmul.asSInt >= 0.S, 0.U, lmul)
    val eewRealFlowLog2 = emulLog2 + log2Up(VLENB).U - eew
    val sewRealFlowLog2 = lmulLog2 + log2Up(VLENB).U - sew(1, 0)
    // The new SegmentUnit handles segment instructions, but the previous implementation is retained for the time being as a fallback.
    val segmentIndexFlowLog2 = if (isSegment) sewRealFlowLog2 else Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2)
    (LookupTree(instType, List(
      "b000".U -> eewRealFlowLog2, // unit-stride
      "b010".U -> eewRealFlowLog2, // strided
      "b001".U -> Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2), // indexed-unordered
      "b011".U -> Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2), // indexed-ordered
      "b100".U -> eewRealFlowLog2, // segment unit-stride
      "b110".U -> eewRealFlowLog2, // segment strided
      "b101".U -> segmentIndexFlowLog2, // segment indexed-unordered
      "b111".U -> segmentIndexFlowLog2  // segment indexed-ordered
    )))
  }
}

/**
  * GenElemIdx generates the element index within an instruction, given a certain
  * uopIdx and a known flowIdx inside the uop.
  */
object GenElemIdx extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt,
            uopIdx: UInt, flowIdx: UInt): UInt = {
    require(eew.getWidth == 2, "The eew width must be 2.")
    val isIndexed = instType(0).asBool
    val eewUopFlowsLog2 = Mux(emul.asSInt > 0.S, 0.U, emul) + log2Up(VLENB).U - eew
    val sewUopFlowsLog2 = Mux(lmul.asSInt > 0.S, 0.U, lmul) + log2Up(VLENB).U - sew(1, 0)
    val uopFlowsLog2 = Mux(
      isIndexed,
      Mux(emul.asSInt > lmul.asSInt, eewUopFlowsLog2, sewUopFlowsLog2),
      eewUopFlowsLog2
    )
    LookupTree(uopFlowsLog2, List(
      0.U -> uopIdx ## flowIdx(0), // for hardware misalign
      1.U -> uopIdx ## flowIdx(0),
      2.U -> uopIdx ## flowIdx(1, 0),
      3.U -> uopIdx ## flowIdx(2, 0),
      4.U -> uopIdx ## flowIdx(3, 0)
    ))
  }
}
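
/**
  * Illustrative software reference model (an assumption for exposition, not part
  * of the RTL): the element index is uopIdx concatenated above the per-uop flow
  * index. (The RTL keeps one flow bit even when uopFlowsLog2 = 0, for misalignment.)
  */
private object ElemIdxModel {
  def apply(uopIdx: Int, flowIdx: Int, uopFlowsLog2: Int): Int =
    (uopIdx << uopFlowsLog2) | (flowIdx & ((1 << uopFlowsLog2) - 1))
  // apply(2, 3, 2) == 11: flow 3 of uop 2, with 4 flows per uop
}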

/**
  * GenVLMAX calculates VLMAX, which equals LMUL * VLEN / SEW.
  */
object GenVLMAXLog2 extends VLSUConstants {
  def apply(lmul: UInt, sew: UInt): UInt = lmul + log2Up(VLENB).U - sew
}
object GenVLMAX {
  def apply(lmul: UInt, sew: UInt): UInt = 1.U << GenVLMAXLog2(lmul, sew)
}
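
/**
  * Illustrative worked example (an assumption for exposition, not part of the
  * RTL; VLEN = 128 assumed): VLMAX = (2^lmulLog2 * VLENB) >> log2(sewBytes).
  */
private object VLMAXModel {
  def apply(lmulLog2: Int, sewBytesLog2: Int): Int = 1 << (lmulLog2 + 4 - sewBytesLog2)
  // apply(1, 2) == 8: lmul = 2 and sew = 32 bits give VLMAX = 8 elements
}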
/**
 * Generates a mask based on vlmax.
 * Example: vlmax = b100 gives mask = b011.
 * */
object GenVlMaxMask{
  def apply(vlmax: UInt, length: Int): UInt = (vlmax - 1.U)(length-1, 0)
}

object GenUSWholeRegVL extends VLSUConstants {
  def apply(nfields: UInt, eew: UInt): UInt = {
    require(eew.getWidth == 2, "The eew width must be 2.")
    LookupTree(eew, List(
      "b00".U -> (nfields << (log2Up(VLENB) - 0)),
      "b01".U -> (nfields << (log2Up(VLENB) - 1)),
      "b10".U -> (nfields << (log2Up(VLENB) - 2)),
      "b11".U -> (nfields << (log2Up(VLENB) - 3))
    ))
  }
}
object GenUSWholeEmul extends VLSUConstants{
  def apply(nf: UInt): UInt={
    LookupTree(nf,List(
      "b000".U -> "b000".U(mulBits.W),
      "b001".U -> "b001".U(mulBits.W),
      "b011".U -> "b010".U(mulBits.W),
      "b111".U -> "b011".U(mulBits.W)
    ))
  }
}

object GenUSMaskRegVL extends VLSUConstants {
  def apply(vl: UInt): UInt = {
    // ceil(vl / 8): one mask byte covers eight elements
    Mux(vl(2,0) === 0.U , (vl >> 3.U), ((vl >> 3.U) + 1.U))
  }
}

object GenUopByteMask {
  def apply(flowMask: UInt, alignedType: UInt): UInt = {
    LookupTree(alignedType, List(
      "b000".U -> flowMask,
      "b001".U -> FillInterleaved(2, flowMask),
      "b010".U -> FillInterleaved(4, flowMask),
      "b011".U -> FillInterleaved(8, flowMask),
      "b100".U -> FillInterleaved(16, flowMask)
    ))
  }
}
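
/**
  * Illustrative worked example (an assumption for exposition, not part of the
  * RTL): with alignedType = "b001" (16-bit elements) each flow-mask bit expands
  * to two byte-mask bits, e.g. flowMask = b0101 -> byteMask = b00110011.
  */
private object UopByteMaskModel {
  def expand(flowMask: Int, bytesPerElem: Int, elems: Int): Int =
    (0 until elems).foldLeft(0) { (acc, i) =>
      if (((flowMask >> i) & 1) == 1) acc | (((1 << bytesPerElem) - 1) << (i * bytesPerElem))
      else acc
    }
  // expand(0x5, 2, 4) == 0x33
}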

object GenVdIdxInField extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, uopIdx: UInt): UInt = {
    val vdIdx = Wire(UInt(log2Up(maxMUL).W))
    when (instType(1,0) === "b00".U || instType(1,0) === "b10".U || lmul.asSInt > emul.asSInt) {
      // unit-stride or strided, or indexed with lmul > emul
      vdIdx := uopIdx
    }.otherwise {
      // indexed with lmul <= emul
      val multiple = emul - lmul
      val uopIdxWidth = uopIdx.getWidth
      vdIdx := LookupTree(multiple, List(
        0.U -> uopIdx,
        1.U -> (uopIdx >> 1),
        2.U -> (uopIdx >> 2),
        3.U -> (uopIdx >> 3)
      ))
    }
    vdIdx
  }
}
/**
* Uses start and vl to generate the active-flow mask.
* mod = true: select elements whose mask bit is 1 (positions outside [start, vl) are filled with 0)
* mod = false: select elements whose mask bit is 0
*/
object GenFlowMask extends VLSUConstants {
  def apply(elementMask: UInt, start: UInt, vl: UInt , mod: Boolean): UInt = {
    val startMask = ~UIntToMask(start, VLEN)
    val vlMask = UIntToMask(vl, VLEN)
    val maskVlStart = vlMask & startMask
    if(mod){
      elementMask & maskVlStart
    }
    else{
      (~elementMask).asUInt & maskVlStart
    }
  }
}

object genVWmask128 {
  def apply(addr: UInt, sizeEncode: UInt): UInt = {
    (LookupTree(sizeEncode, List(
      "b000".U -> 0x1.U, //0001 << addr(3:0)
      "b001".U -> 0x3.U, //0011
      "b010".U -> 0xf.U, //1111
      "b011".U -> 0xff.U, //11111111
      "b100".U -> 0xffff.U, //1111111111111111
      "b111".U -> 0xffff.U  //cbo
    )) << addr(3, 0)).asUInt
  }
}
/*
* Only used when the maximum access length is 128 bits.
*/
object genVWdata {
  def apply(data: UInt, sizeEncode: UInt): UInt = {
    LookupTree(sizeEncode, List(
      "b000".U -> Fill(16, data(7, 0)),
      "b001".U -> Fill(8, data(15, 0)),
      "b010".U -> Fill(4, data(31, 0)),
      "b011".U -> Fill(2, data(63,0)),
      "b100".U -> data(127,0)
    ))
  }
}

object genUSSplitAddr{
  def apply(addr: UInt, index: UInt, width: Int): UInt = {
    val tmpAddr = Cat(addr(width - 1, 4), 0.U(4.W))
    val nextCacheline = tmpAddr + 16.U
    LookupTree(index, List(
      0.U -> tmpAddr,
      1.U -> nextCacheline
    ))
  }
}

object genUSSplitMask{
  def apply(mask: UInt, index: UInt): UInt = {
    require(mask.getWidth == 32) // must be 32 bits
    LookupTree(index, List(
      0.U -> mask(15, 0),
      1.U -> mask(31, 16)
    ))
  }
}

object genUSSplitData{
  def apply(data: UInt, index: UInt, addrOffset: UInt): UInt = {
    val tmpData = WireInit(0.U(256.W))
    val lookupTable = (0 until 16).map{case i =>
      if(i == 0){
        i.U -> Cat(0.U(128.W), data)
      }else{
        i.U -> Cat(0.U(((16-i)*8).W), data, 0.U((i*8).W))
      }
    }
    tmpData := LookupTree(addrOffset, lookupTable).asUInt

    LookupTree(index, List(
      0.U -> tmpData(127, 0),
      1.U -> tmpData(255, 128)
    ))
  }
}
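
/**
  * Illustrative software reference model (an assumption for exposition, not part
  * of the RTL): a 128-bit unit-stride access starting addrOffset bytes into a
  * 16-byte line is shifted left; index then picks the lower (0) or upper (1) half.
  */
private object USSplitDataModel {
  private val mask128 = (BigInt(1) << 128) - 1
  def apply(data: BigInt, index: Int, addrOffset: Int): BigInt = {
    val shifted = (data & mask128) << (addrOffset * 8)
    if (index == 0) shifted & mask128 else (shifted >> 128) & mask128
  }
}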

object genVSData extends VLSUConstants {
  def apply(data: UInt, elemIdx: UInt, alignedType: UInt): UInt = {
    LookupTree(alignedType, List(
      "b000".U -> ZeroExt(LookupTree(elemIdx(3, 0), List.tabulate(VLEN/8)(i => i.U -> getByte(data, i))), VLEN),
      "b001".U -> ZeroExt(LookupTree(elemIdx(2, 0), List.tabulate(VLEN/16)(i => i.U -> getHalfWord(data, i))), VLEN),
      "b010".U -> ZeroExt(LookupTree(elemIdx(1, 0), List.tabulate(VLEN/32)(i => i.U -> getWord(data, i))), VLEN),
      "b011".U -> ZeroExt(LookupTree(elemIdx(0), List.tabulate(VLEN/64)(i => i.U -> getDoubleWord(data, i))), VLEN),
      "b100".U -> data // if there were a wider element, this would break
    ))
  }
}

// TODO: more elegant
object genVStride extends VLSUConstants {
  def apply(uopIdx: UInt, stride: UInt): UInt = {
    LookupTree(uopIdx, List(
      0.U -> 0.U,
      1.U -> stride,
      2.U -> (stride << 1),
      3.U -> ((stride << 1).asUInt + stride),
      4.U -> (stride << 2),
      5.U -> ((stride << 2).asUInt + stride),
      6.U -> ((stride << 2).asUInt + (stride << 1)),
      7.U -> ((stride << 2).asUInt + (stride << 1) + stride)
    ))
  }
}
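
/**
  * Illustrative software reference model (an assumption for exposition, not part
  * of the RTL): the shift-and-add table above computes uopIdx * stride for
  * uopIdx < 8 without a multiplier.
  */
private object VStrideModel {
  def apply(uopIdx: Int, stride: BigInt): BigInt = {
    require(0 <= uopIdx && uopIdx < 8)
    stride * uopIdx
  }
}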
/**
 * Generates uopOffset; not used by segment instructions.
 * */
object genVUopOffset extends VLSUConstants {
  def apply(instType: UInt, isfof: Bool, uopidx: UInt, nf: UInt, eew: UInt, stride: UInt, alignedType: UInt): UInt = {
    val uopInsidefield = (uopidx >> nf).asUInt // when nf == 0, this is uopidx

//    val fofVUopOffset = (LookupTree(instType,List(
//      "b000".U -> ( genVStride(uopInsidefield, stride) << (log2Up(VLENB).U - eew)   ) , // unit-stride fof
//      "b100".U -> ( genVStride(uopInsidefield, stride) << (log2Up(VLENB).U - eew)   ) , // segment unit-stride fof
//    ))).asUInt

    val otherVUopOffset = (LookupTree(instType,List(
      "b000".U -> ( uopInsidefield << alignedType                                   ) , // unit-stride
      "b010".U -> ( genVStride(uopInsidefield, stride) << (log2Up(VLENB).U - eew)   ) , // strided
      "b001".U -> ( 0.U                                                             ) , // indexed-unordered
      "b011".U -> ( 0.U                                                             ) , // indexed-ordered
      "b100".U -> ( uopInsidefield << alignedType                                   ) , // segment unit-stride
      "b110".U -> ( genVStride(uopInsidefield, stride) << (log2Up(VLENB).U - eew)   ) , // segment strided
      "b101".U -> ( 0.U                                                             ) , // segment indexed-unordered
      "b111".U -> ( 0.U                                                             )   // segment indexed-ordered
    ))).asUInt

//    Mux(isfof, fofVUopOffset, otherVUopOffset)
    otherVUopOffset
  }
}

object genVFirstUnmask extends VLSUConstants {
  /**
   * Finds the index of the lowest set (unmasked) bit.
   * Example:
   *   mask = 16'b1111_1111_1110_0000
   *   returns 5
   * @param mask 16 bits of mask.
   * @return index of the lowest set bit.
   */
  def apply(mask: UInt): UInt = {
    require(mask.getWidth == 16, "The mask width must be 16")
    val select = (0 until 16).zip(mask.asBools).map{case (i, v) =>
      (v, i.U)
    }
    PriorityMuxDefault(select, 0.U)
  }

  def apply(mask: UInt, regOffset: UInt): UInt = {
    require(mask.getWidth == 16, "The mask width must be 16")
    val realMask = (mask >> regOffset).asUInt
    val select = (0 until 16).zip(realMask.asBools).map{case (i, v) =>
      (v, i.U)
    }
    PriorityMuxDefault(select, 0.U)
  }
}
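
/**
  * Illustrative software reference model (an assumption for exposition, not part
  * of the RTL): index of the lowest set bit of a 16-bit mask, or 0 when empty.
  */
private object VFirstUnmaskModel {
  def apply(mask: Int): Int = {
    require((mask & 0xFFFF) == mask, "The mask must fit in 16 bits")
    if (mask == 0) 0 else Integer.numberOfTrailingZeros(mask)
  }
  // apply(0xFFE0) == 5, matching the example in the docstring above
}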

class skidBufferConnect[T <: Data](gen: T) extends Module {
  val io = IO(new Bundle() {
    val in = Flipped(DecoupledIO(gen.cloneType))
    val flush = Input(Bool())
    val out = DecoupledIO(gen.cloneType)
  })

  skidBuffer.connect(io.in, io.out, io.flush)
}

object skidBuffer{
  /*
  * Skid buffer used to break the timing path of ready.
  * */
  def connect[T <: Data](
                          in: DecoupledIO[T],
                          out: DecoupledIO[T],
                          flush: Bool
                        ): T = {
    val empty :: skid :: Nil = Enum(2)
    val state      = RegInit(empty)
    val stateNext  = WireInit(empty)
    val dataBuffer = RegEnable(in.bits, (!out.ready && in.fire))

    when(state === empty){
      stateNext := Mux(!out.ready && in.fire && !flush, skid, empty)
    }.elsewhen(state === skid){
      stateNext := Mux(out.ready || flush, empty, skid)
    }
    state     := stateNext

    in.ready  := state === empty
    out.bits  := Mux(state === skid, dataBuffer, in.bits)
    out.valid := in.valid || (state === skid)

    dataBuffer
  }
  def apply[T <: Data](
                        in: DecoupledIO[T],
                        out: DecoupledIO[T],
                        flush: Bool,
                        moduleName: String
                      ): Unit = {
    val buffer = Module(new skidBufferConnect(in.bits))
    buffer.suggestName(moduleName)
    buffer.io.in <> in
    buffer.io.flush := flush
    out <> buffer.io.out
  }
}
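
/**
  * Illustrative usage sketch (an assumption for exposition, not part of the RTL):
  * cutting the ready path between a producer and a consumer with the skid buffer.
  */
private class SkidBufferExample(implicit p: Parameters) extends VLSUModule {
  val io = IO(new Bundle {
    val in    = Flipped(DecoupledIO(UInt(VLEN.W)))
    val out   = DecoupledIO(UInt(VLEN.W))
    val flush = Input(Bool())
  })
  // One call instantiates the buffer module and wires it between in and out.
  skidBuffer(io.in, io.out, io.flush, "skidBufferExample")
}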