// xref: /XiangShan/src/main/scala/xiangshan/mem/vector/VSplit.scala (revision 16c2d8bb27e9a24ed5ef5e4885693e6a30b536df)
/***************************************************************************************
  * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
  * Copyright (c) 2020-2021 Peng Cheng Laboratory
  *
  * XiangShan is licensed under Mulan PSL v2.
  * You can use this software according to the terms and conditions of the Mulan PSL v2.
  * You may obtain a copy of Mulan PSL v2 at:
  *          http://license.coscl.org.cn/MulanPSL2
  *
  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  *
  * See the Mulan PSL v2 for more details.
  ***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan.ExceptionNO._
import xiangshan._
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles._
import xiangshan.mem._
import xiangshan.backend.fu.vector.Bundles._
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.FuType


/**
  * Front half of the vector load/store split path.
  *
  * Stage s0 decodes the incoming uop (eew/sew/lmul/emul, instruction type, per-uop flow mask)
  * into a [[VLSBundle]]. Stage s1 registers that bundle, computes the uop address offset and
  * the unit-stride split information, requests a merge-buffer entry through
  * `io.toMergeBuffer` and forwards the result on `io.out`.
  *
  * The three `us_*` predicates return `false.B` here and are overridden by the load/store
  * specific subclasses.
  *
  * @param isVStore passed to [[VSplitPipelineIO]] and [[VLSBundle]]
  */
class VSplitPipeline(isVStore: Boolean = false)(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitPipelineIO(isVStore))
  // Default predicates on fuOpType; overridden by the load/store subclasses.
  def us_whole_reg(fuOpType: UInt): Bool = false.B
  def us_mask(fuOpType: UInt): Bool = false.B
  def us_fof(fuOpType: UInt): Bool = false.B
  //TODO vdIdxReg should no longer be useful, don't delete it for now
  val vdIdxReg = RegInit(0.U(3.W))

  // Driven further down from the s1 stage; s0 may advance whenever s1 is ready.
  val s1_ready = WireInit(false.B)
  io.in.ready := s1_ready

  /**-----------------------------------------------------------
    * s0 stage
    * decode and generate AlignedType, uop mask, preIsSplit
    * ----------------------------------------------------------
    */
  val s0_uop = io.in.bits.uop
  val s0_vtype = s0_uop.vpu.vtype
  val s0_sew = s0_vtype.vsew
  val s0_eew = s0_uop.vpu.veew
  val s0_lmul = s0_vtype.vlmul
  val s0_fuOpType = s0_uop.fuOpType
  val s0_mop = s0_fuOpType(6, 5)
  // whole-register accesses are handled with nf forced to 0 (the raw nf is kept in rawNf below)
  val s0_nf = Mux(us_whole_reg(s0_fuOpType), 0.U, s0_uop.vpu.nf)
  val s0_vm = s0_uop.vpu.vm
  // when load whole register or unit-stride masked, emul should be 1
  val s0_emul = Mux(us_whole_reg(s0_fuOpType) ,GenUSWholeEmul(s0_uop.vpu.nf), Mux(us_mask(s0_fuOpType), 0.U(mulBits.W), EewLog2(s0_eew) - s0_sew + s0_lmul))
  // every non-unit-stride uop is split into per-element flows
  val s0_preIsSplit = !isUnitStride(s0_mop)
  val s0_nfield        = s0_nf +& 1.U

  val s0_valid         = Wire(Bool())
  val s0_kill          = io.in.bits.uop.robIdx.needFlush(io.redirect)
  val s0_can_go        = s1_ready
  val s0_fire          = s0_valid && s0_can_go
  val s0_out           = Wire(new VLSBundle(isVStore))

  val isUsWholeReg = isUnitStride(s0_mop) && us_whole_reg(s0_fuOpType)
  val isMaskReg = isUnitStride(s0_mop) && us_mask(s0_fuOpType)
  // segment access: nf is non-zero and the uop is not a whole-register access
  val isSegment = s0_nf =/= 0.U && !us_whole_reg(s0_fuOpType)
  val instType = Cat(isSegment, s0_mop)
  val uopIdx = io.in.bits.uop.vpu.vuopIdx
  val uopIdxInField = GenUopIdxInField(instType, s0_emul, s0_lmul, uopIdx)
  val vdIdxInField = GenVdIdxInField(instType, s0_emul, s0_lmul, uopIdxInField)
  // lmul/emul clamped to <= 0 (i.e. only the fractional part is kept)
  val lmulLog2 = Mux(s0_lmul.asSInt >= 0.S, 0.U, s0_lmul)
  val emulLog2 = Mux(s0_emul.asSInt >= 0.S, 0.U, s0_emul)
  val numEewLog2 = emulLog2 - EewLog2(s0_eew)
  val numSewLog2 = lmulLog2 - s0_sew
  val numFlowsSameVdLog2 = Mux(
    isIndexed(instType),
    log2Up(VLENB).U - s0_sew(1,0),
    log2Up(VLENB).U - s0_eew
  )
  // numUops = nf * max(lmul, emul)
  val lmulLog2Pos = Mux(s0_lmul.asSInt < 0.S, 0.U, s0_lmul)
  val emulLog2Pos = Mux(s0_emul.asSInt < 0.S, 0.U, s0_emul)
  val numUops = Mux(
    isIndexed(s0_mop) && s0_lmul.asSInt > s0_emul.asSInt,
    (s0_nf +& 1.U) << lmulLog2Pos,
    (s0_nf +& 1.U) << emulLog2Pos
  )

  val vvl = io.in.bits.src_vl.asTypeOf(VConfig()).vl
  // effective vl: whole-register and mask accesses derive their own length
  val evl = Mux(isUsWholeReg,
                GenUSWholeRegVL(io.in.bits.uop.vpu.nf +& 1.U, s0_eew),
                Mux(isMaskReg,
                    GenUSMaskRegVL(vvl),
                    vvl))
  val vvstart = io.in.bits.uop.vpu.vstart
  // indexed accesses move sew-sized data, everything else eew-sized data
  val alignedType = Mux(isIndexed(instType), s0_sew(1, 0), s0_eew)
  val broadenAligendType = Mux(s0_preIsSplit, Cat("b0".U, alignedType), "b100".U) // if is unit-stride, use 128-bits memory access
  val flowsLog2 = GenRealFlowLog2(instType, s0_emul, s0_lmul, s0_eew, s0_sew)
  val flowsPrevThisUop = (uopIdxInField << flowsLog2).asUInt // # of flows before this uop in a field
  val flowsPrevThisVd = (vdIdxInField << numFlowsSameVdLog2).asUInt // # of flows before this vd in a field
  val flowsIncludeThisUop = ((uopIdxInField +& 1.U) << flowsLog2).asUInt // # of flows up to and including this uop
  val flowNum = io.in.bits.flowNum.get
  // max index in vd, only use in index instructions for calculate index
  val maxIdxInVdIndex = GenVLMAX(Mux(s0_emul.asSInt > 0.S, 0.U, s0_emul), s0_eew)
  val indexVlMaxInVd = GenVlMaxMask(maxIdxInVdIndex, elemIdxBits)

  // For vector indexed instructions:
  //  When emul is greater than lmul, multiple uops correspond to one Vd, e.g.:
  //    vsetvli     t1,t0,e8,m1,ta,ma    lmul = 1
  //    vluxei16.v  v2,(a0),v8           emul = 2
  //    In this case the flow mask has to be right-shifted by flowsPrevThisUop, whereas the mask
  //    passed to the merge buffer is right-shifted by flowsPrevThisVd, e.g.:
  //      vl = 9
  //      srcMask = 0x1FF
  //      uopIdxInField = 0 and vdIdxInField = 0, flowMask = 0x00FF, toMergeBuffMask = 0x01FF
  //      uopIdxInField = 1 and vdIdxInField = 0, flowMask = 0x0001, toMergeBuffMask = 0x01FF
  //      uopIdxInField = 0 and vdIdxInField = 0, flowMask = 0x0000, toMergeBuffMask = 0x0000
  //      uopIdxInField = 0 and vdIdxInField = 0, flowMask = 0x0000, toMergeBuffMask = 0x0000
  //  NOTE(review): the last two rows repeat the indices of the first row; they presumably
  //  describe later uops - confirm the intended uopIdxInField/vdIdxInField values.
  val isSpecialIndexed = isIndexed(instType) && s0_emul.asSInt > s0_lmul.asSInt

  // unmasked instructions (vm set) use an all-ones source mask
  val srcMask = GenFlowMask(Mux(s0_vm, Fill(VLEN, 1.U(1.W)), io.in.bits.src_mask), vvstart, evl, true)
  val srcMaskShiftBits = Mux(isSpecialIndexed, flowsPrevThisUop, flowsPrevThisVd)

  // keep only the source-mask bits in [flowsPrevThisUop, flowsIncludeThisUop), then shift them down
  val flowMask = ((srcMask &
    UIntToMask(flowsIncludeThisUop.asUInt, VLEN + 1) &
    (~UIntToMask(flowsPrevThisUop.asUInt, VLEN)).asUInt
  ) >> srcMaskShiftBits)(VLENB - 1, 0)
  val indexedSrcMask = (srcMask >> flowsPrevThisVd).asUInt //only for index instructions

  // Used to calculate the element index.
  // See 'splitbuffer' for 'io.out.splitIdxOffset' and 'mergebuffer' for 'merge data'
  val indexedSplitOffset = Mux(isSpecialIndexed, flowsPrevThisUop - flowsPrevThisVd, 0.U) // only for index instructions of emul > lmul
  val vlmax = GenVLMAX(s0_lmul, s0_sew)

  // connect
  s0_out := DontCare
  s0_out match {case x =>
    x.uop := io.in.bits.uop
    x.uop.imm := 0.U
    x.uop.vpu.vl := evl
    x.uop.uopIdx := uopIdx
    x.uop.numUops := numUops
    x.uop.lastUop := (uopIdx +& 1.U) === numUops
    x.uop.vpu.nf  := s0_nf
    x.rawNf := io.in.bits.uop.vpu.nf // nf as received, before the whole-register override
    x.flowMask := flowMask
    x.indexedSrcMask := indexedSrcMask // Only vector indexed instructions uses it
    x.indexedSplitOffset := indexedSplitOffset
    x.byteMask := GenUopByteMask(flowMask, Cat("b0".U, alignedType))(VLENB - 1, 0)
    x.fof := isUnitStride(s0_mop) && us_fof(s0_fuOpType)
    x.baseAddr := io.in.bits.src_rs1
    x.stride := io.in.bits.src_stride
    x.flowNum := flowNum
    x.nfields := s0_nfield
    x.vm := s0_vm
    x.usWholeReg := isUsWholeReg
    x.usMaskReg := isMaskReg
    x.eew := s0_eew
    x.sew := s0_sew
    x.emul := s0_emul
    x.lmul := s0_lmul
    x.vlmax := Mux(isUsWholeReg, evl, vlmax)
    x.instType := instType
    x.data := io.in.bits.src_vs3
    x.vdIdxInField := vdIdxInField
    x.preIsSplit  := s0_preIsSplit
    x.alignedType := broadenAligendType
    x.indexVlMaxInVd := indexVlMaxInVd
  }
  s0_valid := io.in.valid && !s0_kill
  /**-------------------------------------
    * s1 stage
    * ------------------------------------
    * generate UopOffset
    */
  val s1_valid         = RegInit(false.B)
  val s1_kill          = Wire(Bool())
  val s1_in            = Wire(new VLSBundle(isVStore))
  // s1 can leave only when both the downstream buffer and the merge buffer accept it
  val s1_can_go        = io.out.ready && io.toMergeBuffer.req.ready
  val s1_fire          = s1_valid && !s1_kill && s1_can_go

  // s1 accepts a new uop when it is empty, being flushed, or draining this cycle
  s1_ready         := s1_kill || !s1_valid || s1_can_go

  when(s0_fire){
    s1_valid := true.B
  }.elsewhen(s1_fire){
    s1_valid := false.B
  }.elsewhen(s1_kill){
    s1_valid := false.B
  }
  s1_in := RegEnable(s0_out, s0_fire)

  val s1_flowNum          = s1_in.flowNum
  val s1_uop              = s1_in.uop
  val s1_uopidx           = s1_uop.vpu.vuopIdx
  val s1_nf               = s1_uop.vpu.nf
  val s1_nfields          = s1_in.nfields
  val s1_eew              = s1_in.eew
  val s1_emul             = s1_in.emul
  val s1_lmul             = s1_in.lmul
  val s1_instType         = s1_in.instType
  val s1_stride           = s1_in.stride
  val s1_vmask            = FillInterleaved(8, s1_in.byteMask)(VLEN-1, 0)
  val s1_alignedType      = s1_in.alignedType
  val s1_isSpecialIndexed = isIndexed(s1_instType) && s1_emul.asSInt > s1_lmul.asSInt
  // indexed uops with emul > lmul hand the vd-relative source mask to the merge buffer
  val s1_mask             = Mux(s1_isSpecialIndexed, s1_in.indexedSrcMask, s1_in.flowMask)
  val s1_vdIdx            = s1_in.vdIdxInField
  val s1_fof              = s1_in.fof
  val s1_notIndexedStride = Mux( // stride for strided/unit-stride instruction
    isStrided(s1_instType),
    s1_stride(XLEN - 1, 0), // for strided load, stride = x[rs2]
    s1_nfields << s1_eew // for unit-stride load, stride = eew * NFIELDS
  )

  val stride            = Mux(isIndexed(s1_instType), s1_stride, s1_notIndexedStride).asUInt // if is index instructions, get index when split
  val uopOffset         = genVUopOffset(s1_instType, s1_fof, s1_uopidx, s1_nf, s1_eew, stride, s1_alignedType)
  // for Unit-Stride, if uop's addr is aligned with 128-bits, split it to one flow, otherwise split two
  val usLowBitsAddr     = getCheckAddrLowBits(s1_in.baseAddr, maxMemByteNum) + getCheckAddrLowBits(uopOffset, maxMemByteNum)
  val usMask            = Cat(0.U(VLENB.W), s1_in.byteMask) << getCheckAddrLowBits(usLowBitsAddr, maxMemByteNum)
  val usAligned128      = getCheckAddrLowBits(usLowBitsAddr, maxMemByteNum) === 0.U // addr 128-bit aligned
  val usMaskLowActive   = genUSSplitMask(usMask.asTypeOf(UInt(32.W)), 1.U).asUInt.orR
  val usMaskHighActive  = genUSSplitMask(usMask.asTypeOf(UInt(32.W)), 0.U).asUInt.orR
  // number of unit-stride flows with at least one active byte: max, max - 1 or 0
  val usActiveNum       = Mux(
                            usMaskLowActive && usMaskHighActive,
                            VecMemUnitStrideMaxFlowNum.U,
                            Mux(usMaskLowActive || usMaskHighActive, (VecMemUnitStrideMaxFlowNum - 1).U, 0.U)
                          )

  // split uops count their active flow-mask bits, unit-stride uops use usActiveNum
  val activeNum         = Mux(s1_in.preIsSplit, PopCount(s1_in.flowMask), usActiveNum)


  s1_kill               := s1_in.uop.robIdx.needFlush(io.redirect)

  // query mergeBuffer
  io.toMergeBuffer.req.valid             := io.out.ready && s1_valid// only can_go will get MergeBuffer entry
  io.toMergeBuffer.req.bits.flowNum      := activeNum
  io.toMergeBuffer.req.bits.data         := s1_in.data
  io.toMergeBuffer.req.bits.uop          := s1_in.uop
  io.toMergeBuffer.req.bits.uop.vpu.nf   := s1_in.rawNf // hand over the nf as originally received
  io.toMergeBuffer.req.bits.mask         := s1_mask
  io.toMergeBuffer.req.bits.vaddr        := s1_in.baseAddr
  io.toMergeBuffer.req.bits.vdIdx        := s1_vdIdx  //TODO vdIdxReg should no longer be useful, don't delete it for now
  io.toMergeBuffer.req.bits.fof          := s1_in.fof
  io.toMergeBuffer.req.bits.vlmax        := s1_in.vlmax
//   io.toMergeBuffer.req.bits.vdOffset :=

  //TODO vdIdxReg should no longer be useful, don't delete it for now
//  when (s1_in.uop.lastUop && s1_fire || s1_kill) {
//    vdIdxReg := 0.U
//  }.elsewhen(s1_fire) {
//    vdIdxReg := vdIdxReg + 1.U
//    XSError(vdIdxReg + 1.U === 0.U, s"Overflow! The number of vd should be less than 8\n")
//  }
  // out connect
  io.out.valid          := s1_valid && io.toMergeBuffer.resp.valid && (activeNum =/= 0.U) // if activeNum == 0, this uop do nothing, can be killed.
  io.out.bits           := s1_in
  io.out.bits.uopOffset := uopOffset
  io.out.bits.uopAddr   := s1_in.baseAddr + uopOffset
  io.out.bits.stride    := stride
  io.out.bits.mBIndex   := io.toMergeBuffer.resp.bits.mBIndex
  io.out.bits.usLowBitsAddr := usLowBitsAddr
  io.out.bits.usAligned128  := usAligned128
  io.out.bits.usMask        := usMask
  io.out.bits.uop.vpu.nf    := s1_in.rawNf

  XSPerfAccumulate("split_out",     io.out.fire)
  XSPerfAccumulate("pipe_block",    io.out.valid && !io.out.ready)
  XSPerfAccumulate("mbuffer_block", s1_valid && io.out.ready && !io.toMergeBuffer.resp.valid)
}

/**
  * Back half of the vector load/store split path.
  *
  * Holds one uop received on `io.in` in the single-entry register `uopq` and issues it flow
  * by flow on `io.out`. `splitIdx` is advanced after every issued (or skipped inactive) flow;
  * the entry is released once `splitFinish` is reached or the uop is flushed by `io.redirect`.
  *
  * Subclasses may override `misalignedCanGo` to hold back flows whose address is not
  * element aligned.
  *
  * @param isVStore selects `VstuCfg` (true) or `VlduCfg` (false) for exception filtering and
  *                 is passed to [[VSplitBufferIO]] and [[VLSBundle]]
  */
abstract class VSplitBuffer(isVStore: Boolean = false)(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitBufferIO(isVStore))
  lazy val fuCfg    = if(isVStore) VstuCfg else VlduCfg

  val uopq          = Reg(new VLSBundle(isVStore)) // the single buffered uop
  val allocated     = RegInit(false.B)             // uopq holds a live uop
  val needCancel    = WireInit(false.B)
  val activeIssue   = Wire(Bool())
  val inActiveIssue = Wire(Bool())
  val splitFinish   = WireInit(false.B)

  // for split
  val splitIdx = RegInit(0.U(flowIdxBits.W))
  val strideOffsetReg = RegInit(0.U(VLEN.W))

  /**
    * Redirect
    */
  val cancelEnq    = io.in.bits.uop.robIdx.needFlush(io.redirect)
  val canEnqueue   = io.in.valid
  val needEnqueue  = canEnqueue && !cancelEnq

  // enqueue
  // NOTE(review): `offset` is not referenced anywhere in the visible source.
  val offset    = PopCount(needEnqueue)
  val canAccept = !allocated || allocated && splitFinish && (activeIssue || inActiveIssue) // if is valid entry, need split finish and send last uop
  io.in.ready  := canAccept
  val doEnqueue = canAccept && needEnqueue

  when(doEnqueue){
    uopq := io.in.bits
  }

  //split uops
  val issueValid       = allocated && !needCancel
  val issueEntry       = uopq
  val issueMbIndex     = issueEntry.mBIndex
  val issueFlowNum     = issueEntry.flowNum
  val issueBaseAddr    = issueEntry.baseAddr
  val issueUopAddr     = issueEntry.uopAddr
  val issueUop         = issueEntry.uop
  val issueUopIdx      = issueUop.vpu.vuopIdx
  val issueInstType    = issueEntry.instType
  val issueUopOffset   = issueEntry.uopOffset
  val issueEew         = issueEntry.eew
  val issueSew         = issueEntry.sew
  val issueLmul        = issueEntry.lmul
  val issueEmul        = issueEntry.emul
  val issueAlignedType = issueEntry.alignedType
  val issuePreIsSplit  = issueEntry.preIsSplit
  val issueByteMask    = issueEntry.byteMask
  val issueUsMask      = issueEntry.usMask
  val issueVLMAXMask   = issueEntry.vlmax - 1.U
  val issueIsWholeReg  = issueEntry.usWholeReg
  val issueVLMAXLog2   = GenVLMAXLog2(issueEntry.lmul, issueSew)
  val issueVlMaxInVd   = issueEntry.indexVlMaxInVd
  val issueUsLowBitsAddr = issueEntry.usLowBitsAddr
  val issueUsAligned128  = issueEntry.usAligned128
  val elemIdx = GenElemIdx(
    instType = issueInstType,
    emul = issueEmul,
    lmul = issueLmul,
    eew = issueEew,
    sew = issueSew,
    uopIdx = issueUopIdx,
    flowIdx = splitIdx
  ) // elemIdx inside an inst, for exception

  val splitIdxOffset = issueEntry.indexedSplitOffset + splitIdx

  val indexFlowInnerIdx = elemIdx & issueVlMaxInVd
  val nfIdx = Mux(issueIsWholeReg, 0.U, elemIdx >> issueVLMAXLog2)
  val fieldOffset = nfIdx << issueAlignedType // field offset inside a segment

  val indexedStride    = IndexAddr( // index for indexed instruction
    index = issueEntry.stride,
    flow_inner_idx = indexFlowInnerIdx,
    eew = issueEew
  )
  // indexed flows take their offset from the index vector, the others from the running stride sum
  val issueStride = Mux(isIndexed(issueInstType), indexedStride, strideOffsetReg)
  val vaddr = issueUopAddr + issueStride
  val mask = genVWmask128(vaddr ,issueAlignedType) // scalar mask for flow
  val flowMask = issueEntry.flowMask
  /*
   * Unit-Stride split to one flow or two flow.
   * for Unit-Stride, if uop's addr is aligned with 128-bits, split it to one flow, otherwise split two
   */
  val usSplitMask      = genUSSplitMask(issueUsMask, splitIdx)
  val usMaskInSingleUop = (genUSSplitMask(issueUsMask, 1.U) === 0.U) // if second splited Mask is zero, means this uop is unnecessary to split
  val usNoSplit        = (issueUsAligned128 || usMaskInSingleUop) &&
                          !issuePreIsSplit &&
                          (splitIdx === 0.U)// unit-stride uop don't need to split into two flow
  val usSplitVaddr     = genUSSplitAddr(issueUopAddr, splitIdx, XLEN)
  val regOffset        = getCheckAddrLowBits(issueUsLowBitsAddr, maxMemByteNum) // offset in 256-bits vd
  XSError((splitIdx > 1.U && usNoSplit) || (splitIdx > 1.U && !issuePreIsSplit) , "Unit-Stride addr split error!\n")

  // unit-stride: any byte of the current half active; otherwise: flow-mask bit of the current flow
  val vecActive = Mux(!issuePreIsSplit, usSplitMask.orR, (flowMask & UIntToOH(splitIdx)).orR)
  // no-unit-stride can trigger misalign
  // (unit-stride uops, i.e. !issuePreIsSplit, are always treated as aligned here)
  val addrAligned = LookupTree(issueEew, List(
    "b00".U   -> true.B,                //b
    "b01".U   -> (vaddr(0)    === 0.U), //h
    "b10".U   -> (vaddr(1, 0) === 0.U), //w
    "b11".U   -> (vaddr(2, 0) === 0.U)  //d
  )) || !issuePreIsSplit

  // data
  io.out.bits match { case x =>
    x.uop                   := issueUop
    x.uop.imm               := 0.U
    x.uop.exceptionVec      := ExceptionNO.selectByFu(issueUop.exceptionVec, fuCfg)
    x.vaddr                 := Mux(!issuePreIsSplit, usSplitVaddr, vaddr)
    x.basevaddr             := issueBaseAddr
    x.alignedType           := issueAlignedType
    x.isvec                 := true.B
    x.mask                  := Mux(!issuePreIsSplit, usSplitMask, mask)
    x.reg_offset            := regOffset //for merge unit-stride data
    x.vecActive             := vecActive // currently, unit-stride's flow always send to pipeline
    x.is_first_ele          := DontCare
    x.usSecondInv           := usNoSplit
    x.elemIdx               := elemIdx
    x.elemIdxInsideVd       := splitIdxOffset // if is Unit-Stride, elemIdx is the index of 2 splited mem request (for merge data)
    x.uop_unit_stride_fof   := DontCare
    x.isFirstIssue          := DontCare
    x.mBIndex               := issueMbIndex
  }

  // redirect
  needCancel := uopq.uop.robIdx.needFlush(io.redirect) && allocated

 /* Execute logic */
  /** Issue to scalar pipeline**/

  // Overridable gate for flows whose address is not element aligned; always open by default.
  lazy val misalignedCanGo = true.B
  val allowIssue = (addrAligned || misalignedCanGo) && io.out.ready
  val issueCount = Mux(usNoSplit, 2.U, (PopCount(inActiveIssue) + PopCount(activeIssue))) // for dont need split unit-stride, issue two flow
  splitFinish := splitIdx >= (issueFlowNum - issueCount)

  // handshake
  activeIssue := issueValid && allowIssue && vecActive // active issue, current use in no unit-stride
  inActiveIssue := issueValid && !vecActive // inactive flows are skipped without waiting for io.out.ready
  when (!issueEntry.uop.robIdx.needFlush(io.redirect)) {
    when (!splitFinish) {
      when (activeIssue || inActiveIssue) {
        // The uop has not been entirly splited yet
        splitIdx := splitIdx + issueCount
        strideOffsetReg := Mux(!issuePreIsSplit, 0.U, strideOffsetReg + issueEntry.stride) // when normal unit-stride, don't use strideOffsetReg
      }
    }.otherwise {
      when (activeIssue || inActiveIssue) {
        // The uop is done spliting
        splitIdx := 0.U(flowIdxBits.W) // initialize flowIdx
        strideOffsetReg := 0.U
      }
    }
  }.otherwise {
    // flushed: restart the split state
    splitIdx := 0.U(flowIdxBits.W) // initialize flowIdx
    strideOffsetReg := 0.U
  }
  // allocated
  when(doEnqueue){ // if enqueue need to been cancelled, it will be false, so this have high priority
    allocated := true.B
  }.elsewhen(needCancel) { // redirect
    allocated := false.B
  }.elsewhen(splitFinish && (activeIssue || inActiveIssue)){ //dequeue
    allocated := false.B
  }

  // out connect
  io.out.valid := issueValid && vecActive && (addrAligned || misalignedCanGo) // TODO: inactive unit-stride uop do not send to pipeline

  XSPerfAccumulate("out_valid",             io.out.valid)
  XSPerfAccumulate("out_fire",              io.out.fire)
  XSPerfAccumulate("out_fire_unitstride",   io.out.fire && !issuePreIsSplit)
  XSPerfAccumulate("unitstride_vlenAlign",  io.out.fire && !issuePreIsSplit && getCheckAddrLowBits(io.out.bits.vaddr, maxMemByteNum) === 0.U)
  // NOTE(review): this "invalid" counter increments when the outgoing mask is NON-zero - confirm the intended polarity.
  XSPerfAccumulate("unitstride_invalid",    io.out.ready && issueValid && !issuePreIsSplit && PopCount(io.out.bits.mask).orR)
}

/**
  * Store flavour of [[VSplitBuffer]]: in addition to the address flow on `io.out` it sends
  * the per-flow store data on `io.vstd`, and only lets misaligned flows go when the store
  * pipe and the store misalign buffer are both reported empty.
  */
class VSSplitBufferImp(implicit p: Parameters) extends VSplitBuffer(isVStore = true){
  override lazy val misalignedCanGo = io.vstdMisalign.get.storePipeEmpty && io.vstdMisalign.get.storeMisalignBufferEmpty

  // split data
  val splitData = genVSData(
        data = issueEntry.data.asUInt,
        elemIdx = splitIdxOffset,
        alignedType = issueAlignedType
      )
  val flowData = genVWdata(splitData, issueAlignedType)
  val usSplitData      = genUSSplitData(issueEntry.data.asUInt, splitIdx, vaddr(3,0))

  // each flow uses the uop's sqIdx advanced by its flow index
  val sqIdx = issueUop.sqIdx + splitIdx
  io.out.bits.uop.sqIdx := sqIdx
  // NOTE(review): `addrAligned` is OR-ed with `!issuePreIsSplit` in VSplitBuffer, so
  // `!addrAligned && !issuePreIsSplit` can never be true and this bit is always driven low -
  // confirm that is intended.
  io.out.bits.uop.exceptionVec(storeAddrMisaligned) := !addrAligned && !issuePreIsSplit && io.out.bits.mask.orR

  // send data to sq
  val vstd = io.vstd.get
  // unit-stride flows always send data; split flows only when active
  vstd.valid := issueValid && (vecActive || !issuePreIsSplit)
  vstd.bits.uop := issueUop
  vstd.bits.uop.sqIdx := sqIdx
  vstd.bits.uop.fuType := FuType.vstu.U
  vstd.bits.data := Mux(!issuePreIsSplit, usSplitData, flowData)
  vstd.bits.debug := DontCare
  vstd.bits.vdIdx.get := DontCare
  vstd.bits.vdIdxInField.get := DontCare
  vstd.bits.isFromLoadUnit   := DontCare
  vstd.bits.mask.get := Mux(!issuePreIsSplit, usSplitMask, mask)

}

/**
  * Split buffer for vector loads (`VSplitBuffer` built with `isVStore = false`).
  *
  * Only the load-specific fields of the outgoing uop are driven here.
  */
class VLSplitBufferImp(implicit p: Parameters) extends VSplitBuffer(isVStore = false) {
  // Tag every outgoing flow with FuType.vldu.
  io.out.bits.uop.fuType := FuType.vldu.U

  // Load-queue index of this flow: base index of the issued uop plus the split index.
  io.out.bits.uop.lqIdx := issueUop.lqIdx + splitIdx

  // Raise the misaligned-load exception only when the address is unaligned, the issue is
  // not pre-split, and at least one bit of the output mask is set.
  io.out.bits.uop.exceptionVec(loadAddrMisaligned) :=
    !(addrAligned || issuePreIsSplit) && io.out.bits.mask.orR
}
4893952421bSweiding liu
/**
  * Split pipeline for vector stores (`VSplitPipeline` built with `isVStore = true`).
  *
  * Supplies the store-side `fuOpType` classifiers by comparing against `VstuType` encodings.
  */
class VSSplitPipelineImp(implicit p: Parameters) extends VSplitPipeline(isVStore = true) {
  /** True when `fuOpType` equals `VstuType.vsr`. */
  override def us_whole_reg(fuOpType: UInt): Bool = {
    fuOpType === VstuType.vsr
  }

  /** True when `fuOpType` equals `VstuType.vsm`. */
  override def us_mask(fuOpType: UInt): Bool = {
    fuOpType === VstuType.vsm
  }

  /** Constantly false: there is no vector fault-only-first store. */
  override def us_fof(fuOpType: UInt): Bool = {
    false.B
  }
}
4953952421bSweiding liu
/**
  * Split pipeline for vector loads (`VSplitPipeline` built with `isVStore = false`).
  *
  * Supplies the load-side `fuOpType` classifiers by comparing against `VlduType` encodings.
  */
class VLSplitPipelineImp(implicit p: Parameters) extends VSplitPipeline(isVStore = false) {
  /** True when `fuOpType` equals `VlduType.vlr`. */
  override def us_whole_reg(fuOpType: UInt): Bool = {
    fuOpType === VlduType.vlr
  }

  /** True when `fuOpType` equals `VlduType.vlm`. */
  override def us_mask(fuOpType: UInt): Bool = {
    fuOpType === VlduType.vlm
  }

  /** True when `fuOpType` equals `VlduType.vleff`. */
  override def us_fof(fuOpType: UInt): Bool = {
    fuOpType === VlduType.vleff
  }
}
5023952421bSweiding liu
/**
  * Top level of the vector-load split path.
  *
  * Structure: `io.in` -> [[VLSplitPipelineImp]] -> skid buffer -> [[VLSplitBufferImp]] -> `io.out`.
  * The pipeline's merge-buffer interface is forwarded unchanged on `io.toMergeBuffer`, and
  * `io.redirect` is fanned out to both sub-modules and to the skid-buffer flush condition.
  */
class VLSplitImp(implicit p: Parameters) extends VLSUModule {
  val io = IO(new VSplitIO(isVStore = false))
  val splitPipeline = Module(new VLSplitPipelineImp())
  val splitBuffer = Module(new VLSplitBufferImp())

  // Refuse the incoming uop while the threshold is valid and names a different lqIdx
  // than the uop currently offered on io.in.
  val mergeBufferNack = io.threshold.get.valid && (io.threshold.get.bits =/= io.in.bits.uop.lqIdx)

  // Split Pipeline
  splitPipeline.io.in <> io.in
  // Gate BOTH halves of the handshake with the nack. Gating only `ready` would still show
  // the pipeline a valid uop together with its own ready, so the pipeline side could
  // observe a transfer that the upstream side (which sees ready low) never performs.
  // NOTE(review): assumes VSplitPipeline treats io.in.valid && io.in.ready as acceptance -- confirm.
  io.in.ready := splitPipeline.io.in.ready && !mergeBufferNack
  splitPipeline.io.in.valid := io.in.valid && !mergeBufferNack
  splitPipeline.io.redirect <> io.redirect
  io.toMergeBuffer <> splitPipeline.io.toMergeBuffer

  // skid buffer
  // Flush condition: while a new entry is being transferred out of the pipeline, test that
  // entry's robIdx against the redirect; otherwise test the entry currently presented to
  // the split buffer.
  skidBuffer(splitPipeline.io.out, splitBuffer.io.in,
    Mux(splitPipeline.io.out.fire,
      splitPipeline.io.out.bits.uop.robIdx.needFlush(io.redirect),
      splitBuffer.io.in.bits.uop.robIdx.needFlush(io.redirect)),
    // Named after this (load) module; the label was previously copied from the store side.
    "VLSplitSkidBuffer")

  // Split Buffer
  splitBuffer.io.redirect <> io.redirect
  io.out <> splitBuffer.io.out
}
5253952421bSweiding liu
/**
  * Top level of the vector-store split path.
  *
  * Structure: `io.in` -> [[VSSplitPipelineImp]] -> skid buffer -> [[VSSplitBufferImp]] -> `io.out`.
  * The split buffer's `vstd` and `vstdMisalign` ports are exported unchanged, as is the
  * pipeline's merge-buffer interface.
  */
class VSSplitImp(implicit p: Parameters) extends VLSUModule {
  val io = IO(new VSplitIO(isVStore = true))
  val splitPipeline = Module(new VSSplitPipelineImp())
  val splitBuffer = Module(new VSSplitBufferImp())

  // Redirect is broadcast to both stages.
  splitPipeline.io.redirect <> io.redirect
  splitBuffer.io.redirect <> io.redirect

  // Split Pipeline: input side and merge-buffer interface.
  splitPipeline.io.in <> io.in
  io.toMergeBuffer <> splitPipeline.io.toMergeBuffer

  // Skid buffer between the two stages.
  // Flush condition: while a new entry is being transferred out of the pipeline, test that
  // entry's robIdx against the redirect; otherwise test the entry currently presented to
  // the split buffer.
  skidBuffer(
    splitPipeline.io.out,
    splitBuffer.io.in,
    Mux(
      splitPipeline.io.out.fire,
      splitPipeline.io.out.bits.uop.robIdx.needFlush(io.redirect),
      splitBuffer.io.in.bits.uop.robIdx.needFlush(io.redirect)
    ),
    "VSSplitSkidBuffer"
  )

  // Split Buffer: flow output plus the store-only side ports.
  io.out <> splitBuffer.io.out
  io.vstd.get <> splitBuffer.io.vstd.get
  io.vstdMisalign.get <> splitBuffer.io.vstdMisalign.get
}
5493952421bSweiding liu
550