1@/******************************************************************************
2@ *
3@ * Copyright (C) 2018 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20
21.text
22.align 4
23
24@/**
25@/*******************************************************************************
26@/*
27@/* @brief
28@/*  Residue calculation and Forward Transform for 4x4 block with 8-bit input
29@/*
30@/* @par Description:
31@/*  Performs residue calculation by subtracting the prediction from the
32@/*  source, followed by the forward transform
33@/*
34@/* @param[in] pu1_src
35@/*  Input 4x4 pixels
36@/*
37@/* @param[in] pu1_pred
38@/*  Prediction data
39@/*
40@/* @param[in] pi4_tmp
41@/*  Temporary buffer of size 4x4
42@/*
43@/* @param[out] pi2_dst
44@/*  Output 4x4 coefficients
45@/*
46@/* @param[in] src_strd
47@/*  Input stride
48@/*
49@/* @param[in] pred_strd
50@/*  Prediction Stride
51@/*
52@/* @param[in] dst_strd
53@/*  Output Stride
54@/*
55@/* @param[in] chr_plane
56@/*  Chroma plane
57@/*
58@/* @returns  Void
59@/*
60@/* @remarks
61@/*  None
62@/*
63@/*******************************************************************************
64@/*/
65
66@/**************Variables Vs Registers*****************************************
67@    r0 => *pu1_src
68@    r1 => *pu1_pred
69@    r2 => *pi4_temp
70@    r3 => *pi2_dst
71@    r4 => src_strd
72@    r5 => pred_strd
73@    r6 => dst_strd
74@    r7 => chroma_plane
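@ Reference sketch (not part of the original source): a minimal C-style outline
@ of what this routine computes for the luma path, assuming the library's
@ UWORD8/WORD16/WORD32/UWORD32 typedefs and abs() from <stdlib.h>; the function
@ and variable names below are illustrative only.
@
@ UWORD32 resi_trans_4x4_sketch(UWORD8 *pu1_src, UWORD8 *pu1_pred,
@                               WORD16 *pi2_dst, WORD32 src_strd,
@                               WORD32 pred_strd, WORD32 dst_strd)
@ {
@     WORD32 resi[4][4], tmp[4][4];
@     UWORD32 sad = 0;
@     for(WORD32 i = 0; i < 4; i++)
@         for(WORD32 j = 0; j < 4; j++)
@         {
@             resi[i][j] = pu1_src[i * src_strd + j] - pu1_pred[i * pred_strd + j];
@             sad += abs(resi[i][j]);
@         }
@     for(WORD32 j = 0; j < 4; j++)       /* first pass, kept at full precision */
@     {
@         WORD32 e0 = resi[0][j] + resi[3][j], e1 = resi[1][j] + resi[2][j];
@         WORD32 o0 = resi[0][j] - resi[3][j], o1 = resi[1][j] - resi[2][j];
@         tmp[0][j] = 64 * (e0 + e1);      tmp[2][j] = 64 * (e0 - e1);
@         tmp[1][j] = 83 * o0 + 36 * o1;   tmp[3][j] = 36 * o0 - 83 * o1;
@     }
@     for(WORD32 i = 0; i < 4; i++)       /* second pass, single rounded shift by 9 */
@     {
@         WORD32 e0 = tmp[i][0] + tmp[i][3], e1 = tmp[i][1] + tmp[i][2];
@         WORD32 o0 = tmp[i][0] - tmp[i][3], o1 = tmp[i][1] - tmp[i][2];
@         pi2_dst[i * dst_strd + 0] = (WORD16)((64 * (e0 + e1) + 256) >> 9);
@         pi2_dst[i * dst_strd + 2] = (WORD16)((64 * (e0 - e1) + 256) >> 9);
@         pi2_dst[i * dst_strd + 1] = (WORD16)((83 * o0 + 36 * o1 + 256) >> 9);
@         pi2_dst[i * dst_strd + 3] = (WORD16)((36 * o0 - 83 * o1 + 256) >> 9);
@     }
@     return sad;
@ }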
75
76    .global ihevc_resi_trans_4x4_a9q
77
78ihevc_resi_trans_4x4_a9q:
79
80    STMFD          sp!, {r4-r7, r14}   @ store all the register components from caller function to memory
81    LDR            r4, [sp,#20]        @ r4 contains src_strd
82    LDR            r5, [sp,#24]        @ r5 contains pred_strd
83    LDR            r6, [sp,#28]        @ r6 contains dst_strd
84    LDR            r7, [sp,#32]        @ r7 chroma plane
85
86    CMP            r7, #-1
87    BEQ            NON_INTERLEAVE_LOAD @if flag == NULL_PLANE, use non-interleaving loads
88
89    VLD1.64        d0, [r0], r4        @ load row 0 src
90    VLD1.64        d4, [r0], r4        @ load row 1 src
91    VLD1.64        d1, [r0], r4        @ load row 2 src
92    VLD1.64        d5, [r0], r4        @ load row 3 src
93    VUZP.8         d0, d4              @ de-interleave pu1_src: plane-0 (U) samples in d0, plane-1 (V) in d4
94    VUZP.8         d1, d5              @ de-interleave pu1_src: plane-0 (U) samples in d1, plane-1 (V) in d5
95
96    VLD1.64        d2, [r1], r5        @ load row 0 pred
97    VLD1.64        d6, [r1], r5        @ load row 1 pred
98    VLD1.64        d3, [r1], r5        @ load row 2 pred
99    VLD1.64        d7, [r1], r5        @ load row 3 pred
100    VUZP.8         d2, d6              @ de-interleave pu1_pred: plane-0 (U) samples in d2, plane-1 (V) in d6
101    VUZP.8         d3, d7              @ de-interleave pu1_pred: plane-0 (U) samples in d3, plane-1 (V) in d7
102
103    CMP            r7, #0
104    BEQ            LOAD_END
105    VSWP.8         d0, d4
106    VSWP.8         d1, d5
107    VSWP.8         d2, d6
108    VSWP.8         d3, d7
109
110    B LOAD_END
111
112NON_INTERLEAVE_LOAD:
113    VLD1.U32     d0[0], [r0], r4       @ load row 0 src
114    VLD1.U32     d0[1], [r0], r4       @ load row 1 src
115    VLD1.U32     d1[0], [r0], r4       @ load row 2 src
116    VLD1.U32     d1[1], [r0], r4       @ load row 3 src
117
118    VLD1.U32     d2[0], [r1], r5       @ load row 0 pred
119    VLD1.U32     d2[1], [r1], r5       @ load row 1 pred
120    VLD1.U32     d3[0], [r1], r5       @ load row 2 pred
121    VLD1.U32     d3[1], [r1], r5       @ load row 3 pred
122
123LOAD_END:
124    @ Finding the residue
125    VSUBL.U8    q2, d0, d2             @ q2 contains 1st 16-bit 8 residues
126    VSUBL.U8    q3, d1, d3             @ q3 contains 2nd 16-bit 8 residues
127
128    @ SAD calculation
129    VABDL.U8    q12, d0, d2            @ q12 contains absolute differences
130    VABAL.U8    q12, d1, d3            @ q12 accumulates absolute differences
131    VADD.U16    d26, d24, d25          @ add d-registers of q12
132    VPADDL.U16  d27, d26               @ d27 contains 2 32-bit values that have to be added
133    VPADDL.U32  d28, d27               @ d28 contains 64-bit SAD, only LSB important
134    VMOV.32     r0, d28[0]             @ SAD stored in r0 for return
135    @ SAD calculation ends
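    @ The reduction above is equivalent to the following sketch (illustrative):
    @     sad = 0; for(k = 0; k < 16; k++) sad += abs(src[k] - pred[k]);
    @ VABDL/VABAL build eight 16-bit partial sums, and VADD/VPADDL fold them
    @ pairwise down to the single 32-bit total returned in r0.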
136
137    @ Forward transform - step 1
138    VMOV.I16    d2, #64                @ generate immediate constant in d2 for even row multiplication
139    VTRN.16     d4, d5                 @ 3-step transpose of residue matrix starts
140    VTRN.16     d6, d7                 @ 2nd step of the 3-step matrix transpose
141    VMOV.I16    d0, #83                @ generate immediate constant in d0 for odd row multiplication
142    VTRN.32     q2, q3                 @ Final step of matrix transpose
143
144    VMOV.I16    d1, #36                @ generate immediate constant in d1 for odd row multiplication
145    VSWP        d6, d7                 @ vector swap to allow even and odd row calculation using Q registers
146    VADD.S16    q10, q2, q3            @ q10 has the even array
147    VSUB.S16    q11, q2, q3            @ q11 has the odd array
148    VMULL.S16   q12, d20, d2           @ e[0]*64
149    VMLAL.S16   q12, d21, d2[0]        @ row 1 of results: e[0]*64 + e[1]*64
150    VMULL.S16   q13, d20, d2           @ e[0]*64
151    VMLSL.S16   q13, d21, d2[0]        @ row 3 of results: e[0]*64 - e[1]*64
152    VMULL.S16   q8, d22, d0            @ o[0]*83
153    VMLAL.S16   q8, d23, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
154    VMULL.S16   q9, d22, d1            @ o[0]*36
155    VMLSL.S16   q9, d23, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83
156
157    @ Forward transform - step 2
158    VMOV.I32    d2, #64                @ generate immediate constant in d2 for even row multiplication
159    VMOV.I32    d0, #83                @ generate immediate constant in d0 for odd row multiplication
160    VTRN.32     q12, q8                @ 4-step transpose of residue matrix starts
161    VTRN.32     q13, q9                @ 2nd step of the 4-step matrix transpose
162
163    VMOV.I32    d1, #36                @ generate immediate constant in d1 for odd row multiplication
164    VSWP        d25, d26               @ 3rd step of the 4-step matrix transpose
165    VSWP        d17, d18               @ 4th step of the 4-step matrix transpose
166    VADD.S32    q2, q12, q9            @ e[0]
167    VADD.S32    q3, q8, q13            @ e[1]
168    VSUB.S32    q10, q12, q9           @ o[0]
169    VSUB.S32    q11, q8, q13           @ o[1]
170
171    VMUL.S32    q12, q2, d2[0]         @ e[0]*64
172    VMLA.S32    q12, q3, d2[0]         @ row 1 of results: e[0]*64 + e[1]*64
173    VMUL.S32    q13, q2, d2[0]         @ e[0]*64
174    VMLS.S32    q13, q3, d2[0]         @ row 3 of results: e[0]*64 - e[1]*64
175    VMUL.S32    q8, q10, d0[0]         @ o[0]*83
176    VMLA.S32    q8, q11, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
177    VMUL.S32    q9, q10, d1[0]         @ o[0]*36
178    VMLS.S32    q9, q11, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83
179
180    VRSHRN.S32  d0, q12, #9            @ (row1 + 256)/512
181    VRSHRN.S32  d1, q8, #9             @ (row2 + 256)/512
182    VRSHRN.S32  d2, q13, #9            @ (row3 + 256)/512
183    VRSHRN.S32  d3, q9, #9             @ (row4 + 256)/512
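    @ Scaling note (sketch): both transform stages above are kept at full 32-bit
    @ precision, so the usual per-stage shifts of the 4x4 forward transform
    @ (1 and 8 for 8-bit input) are folded into this single rounded shift:
    @ coeff = (sum + 256) >> 9, i.e. (sum + 256)/512.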
184
185    LSL         r7, r6, #1             @ r7 = 2*dst_strd, as pi2_dst contains 2-byte integers
186    VST1.U16    d0, [r3], r7           @ store 1st row of result
187    VST1.U16    d1, [r3], r7           @ store 2nd row of result
188    VST1.U16    d2, [r3], r7           @ store 3rd row of result
189    VST1.U16    d3, [r3], r7           @ store 4th row of result
190
191    LDMFD       sp!,{r4-r7,r15}        @ Reload the registers from SP
192
193    @ Function End
194
195@/**
196@*******************************************************************************
197@*
198@* @brief
199@*  This function performs residue calculation and forward  transform type 1
200@*  on input pixels
201@*
202@* @description
203@*  Performs residue calculation by subtracting the prediction from the
204@*  source, followed by the forward transform
205@*
206@* @param[in] pu1_src
207@*  Input 4x4 pixels
208@*
209@* @param[in] pu1_pred
210@*  Prediction data
211@*
212@* @param[in] pi2_tmp
213@*  Temporary buffer of size 4x4
214@*
215@* @param[out] pi2_dst
216@*  Output 4x4 coefficients
217@*
218@* @param[in] src_strd
219@*  Input stride
220@*
221@* @param[in] pred_strd
222@*  Prediction Stride
223@*
224@* @param[in] dst_strd
225@*  Output Stride
226@*
227@* @param[in] chr_plane (unused)
228@*  Chroma plane
229@*
230@* @returns void
231@*
232@* @remarks
233@*  None
234@*
235@*******************************************************************************
236@*/
237@ UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
238@                                     UWORD8 *pu1_pred,
239@                                     WORD32 *pi4_temp,
240@                                     WORD16 *pi2_dst,
241@                                     WORD32 src_strd,
242@                                     WORD32 pred_strd,
243@                                     WORD32 dst_strd,
244@                                     WORD32 chroma_plane);
245@
246@**************Variables Vs Registers*******************************************
247@
248@ r0 - pu1_src
249@ r1 - pu1_pred
250@ r2 - pi4_temp
251@ r3 - pi2_dst
252@
253@ [sp]   - src_strd
254@ [sp+4] - pred_strd
255@ [sp+8] - dst_strd
256@ [sp+12] - chroma_plane
257@
258@*******************************************************************************
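@ Reference sketch (not from the original source): the 4x4 DST kernel that both
@ stages below evaluate, written as a minimal C-style helper with illustrative
@ names; the first pass uses shift = 1 and the second pass shift = 8, matching
@ the VRSHR #1 / VRSHRN #8 pairs in the code.
@
@ static void dst4_1d_sketch(const WORD32 x[4], WORD32 y[4], WORD32 shift)
@ {
@     WORD32 rnd = 1 << (shift - 1);
@     y[0] = (29 * x[0] + 55 * x[1] + 74 * x[2] + 84 * x[3] + rnd) >> shift;
@     y[1] = (74 * (x[0] + x[1] - x[3])                     + rnd) >> shift;
@     y[2] = (84 * x[0] - 29 * x[1] - 74 * x[2] + 55 * x[3] + rnd) >> shift;
@     y[3] = (55 * x[0] - 84 * x[1] + 74 * x[2] - 29 * x[3] + rnd) >> shift;
@ }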
259
260    .global ihevc_resi_trans_4x4_ttype1_a9q
261
262ihevc_resi_trans_4x4_ttype1_a9q:
263
264    PUSH {r4}
265    vpush {d8 - d15}
266
267    LDR r2,[sp,#68]                 @ r2 = src_strd
268    LDR r4,[sp,#72]                 @ r4 = pred_strd
269
270    VLD1.32 d2[0],[r0],r2           @ Row 1 of source in d2[0]
271    VLD1.32 d3[0],[r1],r4           @ Row 1 of prediction in d3[0]
272    VLD1.32 d2[1],[r0],r2           @ Row 2 of source in d2[1]
273    VLD1.32 d3[1],[r1],r4           @ Row 2 of prediction in d3[1]
274
275    VLD1.32 d8[0],[r0],r2           @ Row 3 of source in d8[0]
276    VABDL.U8 q0,d2,d3               @ Absolute differences of rows 1 and 2 in q0
277                                    @ R2:[d11[3] d11[2] d11[1] d11[0]] => Row 2 of residue
278    VLD1.32 d9[0],[r1],r4           @ Row 3 of prediction in d9[0]
279    VSUBL.U8 q5,d2,d3               @ R1:[d10[3] d10[2] d10[1] d10[0]] => Row 1 of residue
280    VLD1.32 d8[1],[r0]              @ Row 4 of source in d8[1]
281    VTRN.16 d10,d11                 @ Transpose step 1
282    VLD1.32 d9[1],[r1]              @ Row 4 of prediction in d9[1]
283
284    VSUBL.U8 q6,d8,d9               @ R3:[d12[3] d12[2] d12[1] d12[0]] => Row 3 of residue
285                                    @ R4:[d13[3] d13[2] d13[1] d13[0]] => Row 4 of residue
286    VABAL.U8 q0,d8,d9               @ Absolute differences of rows 3 and 4 accumulated in q0
287    VTRN.16 d12,d13                 @ Transpose step 2
288    VTRN.32 q5,q6                   @ Transpose step 3, Residue block transposed
289                                    @ Columns are in C1:d10, C2:d11, C3:d12 and C4:d13
290    VADD.S16 d23,d11,d13            @ d23 = C2 + C4
291    VMOV.I32 d6,#55                 @ Constant used for multiplication
292    VADD.S16 d22,d10,d13            @ d22 = C1 + C4
293    VADD.U16 d0,d1,d0               @ Accumulating SAD step 1
294    VMOV.I32 d7,#84                 @ Constant used for multiplication
295    VMULL.S16 q7,d23,d6[0]          @ q7  = 55*C2 + 55*C4
296    VMOV.I32 d4,#74                 @ Constant used for multiplication
297    VMULL.S16 q9,d22,d7[0]          @ q9  = 84*C1 + 84*C4
298    VADD.S16 d16,d10,d11            @ d16 = C1 + C2
299    VMUL.S16 d12,d12,d4[0]          @ d12 = 74*C3
300    VMOV.I32 d5,#29                 @ Constant used for multiplication
301    VPADDL.U16 d0,d0                @ Accumulating SAD step 2
302    VSUB.S16 d16,d16,d13            @ d16 = C1 + C2 - C4
303    VMLAL.S16 q7,d22,d5[0]          @ q7  = 29*C1 + 55*C2 + 84*C4
304    VMLSL.S16 q9,d23,d5[0]          @ q9  = 84*C1 - 29*C2 + 55*C4
305    VMULL.S16 q8,d16,d4[0]          @ q8  = 74*C1 + 74*C2 - 74*C4
306    VPADDL.U32 d0,d0                @ Accumulating SAD step 3, SAD in d0
307    VSUB.S32 q10,q9,q7              @ q10 = q9 - q7 = 55*C1 - 84*C2 - 29*C4
308    VMOV.32 r0,d0[0]                @ Return SAD value
309    VRSHR.S32 q8,q8,#1              @ Truncating the 1 bit in q8
310
311    VADDW.S16 q7,q7,d12             @ q7  = 29*C1 + 55*C2 + 74*C3 + 84*C4
312    VSUBW.S16 q9,q9,d12             @ q9  = 84*C1 - 29*C2 - 74*C3 + 55*C4
313    VADDW.S16 q10,q10,d12           @ q10 = 55*C1 - 84*C2 + 74*C3 - 29*C4
314
315    VRSHR.S32 q7,q7,#1              @ Truncating the 1 bit in q7
316    VRSHR.S32 q9,q9,#1              @ Truncating the 1 bit in q9
317    VRSHR.S32 q10,q10,#1            @ Truncating the 1 bit in q10
318                                    @ Transform stage 1 is in P1:q7, P2:q8, P3:q9 and P4:q10
319    VTRN.32 q7,q8
320    VTRN.32 q9,q10
321    VSWP d15,d18
322    VSWP d17,d20                    @ Transform stage 1 block transposed
323                                    @ Corresponding columns are in S1:q7, S2:q8, S3:q9 and S4:q10
324    VADD.S32 q13,q7,q8              @ q13 = S1 + S2
325    VADD.S32 q1,q7,q10              @ q1 = S1 + S4
326    VADD.S32 q4,q8,q10              @ q4 = S2 + S4
327    VSUB.S32 q13,q13,q10            @ q13 = S1 + S2 - S4
328    VMUL.S32 q12,q1,d5[0]           @ q12 = 29*S1 + 29*S4
329    VMUL.S32 q14,q1,d7[0]           @ q14 = 84*S1 + 84*S4
330    VMUL.S32 q13,q13,d4[0]          @ q13 = 74*S1 + 74*S2 - 74*S4
331
332    VMLA.S32 q12,q4,d6[0]           @ q12 = 29*S1 + 55*S2 + 84*S4
333    VMLS.S32 q14,q4,d5[0]           @ q14 = 84*S1 - 29*S2 + 55*S4
334    VMUL.S32 q9,q9,d4[0]            @ q9 = 74*S3
335
336    LDR r4,[sp,#76]                 @ r4 = dst_strd
337    LSL r4,r4,#1                    @ r4 = 2*dst_strd
338
339    VRSHRN.S32 d26,q13,#8
340    VSUB.S32 q15,q14,q12            @ q15 = q14 - q12 = 55*S1 - 84*S2 - 29*S4
341
342    VADD.S32 q12,q12,q9             @ q12 = 29*S1 + 55*S2 + 74*S3 + 84*S4
343    VSUB.S32 q14,q14,q9             @ q14 = 84*S1 - 29*S2 - 74*S3 + 55*S4
344    VADD.S32 q15,q15,q9             @ q15 = 55*S1 - 84*S2 + 74*S3 - 29*S4
345
346    VRSHRN.S32 d24,q12,#8
347    VRSHRN.S32 d28,q14,#8
348    VRSHRN.S32 d30,q15,#8           @ Truncating the last 8 bits
349                                    @ Transform stage 2 is in U1:d24, U2:d26, U3:d28 and U4:d30
350    VST1.64 d24,[r3],r4             @ Storing row 1 of transform stage 2
351    VST1.64 d26,[r3],r4             @ Storing row 2 of transform stage 2
352    VST1.64 d28,[r3],r4             @ Storing row 3 of transform stage 2
353    VST1.64 d30,[r3]                @ Storing row 4 of transform stage 2
354
355    vpop {d8 - d15}
356    POP {r4}
357    MOV pc,lr
358
359@/**
360@*******************************************************************************
361@*
362@* @brief
363@*  This function performs residue calculation and DCT integer forward transform
364@*  on 8x8 block
365@*
366@* @description
367@*  Performs residue calculation by subtracting the prediction from the
368@*  source, followed by the DCT integer forward transform
369@*
370@* @param[in] pu1_src
371@*  Input 8x8 pixels
372@*
373@* @param[in] pu1_pred
374@*  Prediction data
375@*
376@* @param[in] pi2_tmp
377@*  Temporary buffer of size 8x8
378@*
379@* @param[out] pi2_dst
380@*  Output 8x8 coefficients
381@*
382@* @param[in] src_strd
383@*  Input stride
384@*
385@* @param[in] pred_strd
386@*  Prediction Stride
387@*
388@* @param[in] dst_strd
389@*  Output Stride
390@*
391@* @param[in] chr_plane
392@*  Chroma plane
393@*
394@* @returns void
395@*
396@* @remarks
397@*  None
398@*
399@*******************************************************************************
400@*/
401@ UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
402@                              UWORD8 *pu1_pred,
403@                              WORD32 *pi4_temp,
404@                              WORD16 *pi2_dst,
405@                              WORD32 src_strd,
406@                              WORD32 pred_strd,
407@                              WORD32 dst_strd,
408@                              WORD32 chroma_plane);
409@
410@**************Variables Vs Registers*******************************************
411@
412@ r0 - pu1_src
413@ r1 - pu1_pred
414@ r2 - pi4_temp
415@ r3 - pi2_dst
416@
417@ [sp]   - src_strd
418@ [sp+4] - pred_strd
419@ [sp+8] - dst_strd
420@ [sp+12] - chroma_plane
421@
422@*******************************************************************************
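
@ Reference sketch (not from the original source): the 8-point butterfly
@ evaluated by both passes below, as a minimal C-style helper with
@ illustrative names.  In the code, pass 1 keeps full precision and pass 2
@ applies the rounded shift per output.
@
@ static void dct8_1d_sketch(const WORD32 x[8], WORD32 y[8])
@ {
@     WORD32 e[4], o[4], ee[2], eo[2], k;
@     for(k = 0; k < 4; k++)
@     {
@         e[k] = x[k] + x[7 - k];
@         o[k] = x[k] - x[7 - k];
@     }
@     ee[0] = e[0] + e[3];  eo[0] = e[0] - e[3];
@     ee[1] = e[1] + e[2];  eo[1] = e[1] - e[2];
@     y[0] = 64 * (ee[0] + ee[1]);
@     y[4] = 64 * (ee[0] - ee[1]);
@     y[2] = 83 * eo[0] + 36 * eo[1];
@     y[6] = 36 * eo[0] - 83 * eo[1];
@     y[1] = 89 * o[0] + 75 * o[1] + 50 * o[2] + 18 * o[3];
@     y[3] = 75 * o[0] - 18 * o[1] - 89 * o[2] - 50 * o[3];
@     y[5] = 50 * o[0] - 89 * o[1] + 18 * o[2] + 75 * o[3];
@     y[7] = 18 * o[0] - 50 * o[1] + 75 * o[2] - 89 * o[3];
@ }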
423
424    .global ihevc_resi_trans_8x8_a9q
425
426ihevc_resi_trans_8x8_a9q:
427
428    PUSH {r4,r5}
429    vpush {d8 - d15}
430
431    @ Loading Prediction and Source blocks of size 8x8
432
433    LDR r4,[sp,#84]                 @ r4 = chroma flag
434
435    CMP r4,#-1                      @ NULL PLANE
436    BEQ LUMA_LOAD
437
438    CMP r4,#1                       @ V PLANE
439    BEQ CHROMA_V_LOAD
440                                    @ handling U PLANE
441    LDR r5,[sp,#72]                 @ r5 = src_strd
442    LDR r4,[sp,#76]                 @ r4 = pred_strd
443
444    VLD2.8 {d0,d2},[r1],r4          @ Row 1 of prediction in d0
445    VLD2.8 {d1,d3},[r0],r5          @ Row 1 of source in d1
446
447    VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
448    VLD2.8 {d2,d4},[r1],r4          @ Row 2 of prediction in d2
449    VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
450    VLD2.8 {d3,d5},[r0],r5          @ Row 2 of source in d3
451
452    VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
453    VLD2.8 {d4,d6},[r1],r4          @ Row 3 of prediction in d4
454    VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
455    VLD2.8 {d5,d7},[r0],r5          @ Row 3 of source in d5
456
457    VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
458    VLD2.8 {d6,d8},[r1],r4          @ Row 4 of prediction in d6
459    VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
460    VLD2.8 {d7,d9},[r0],r5          @ Row 4 of source in d7
461
462    VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
463    VLD2.8 {d8,d10},[r1],r4         @ Row 5 of prediction in d8
464    VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
465    VLD2.8 {d9,d11},[r0],r5         @ Row 5 of source in d9
466
467    VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
468    VLD2.8 {d10,d12},[r1],r4        @ Row 6 of prediction in d10
469    VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
470    VLD2.8 {d11,d13},[r0],r5        @ Row 6 of source in d11
471
472    VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
473    VLD2.8 {d12,d14},[r1],r4        @ Row 7 of prediction in d12
474    VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
475    VLD2.8 {d13,d15},[r0],r5        @ Row 7 of source in d13
476
477    VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
478    VLD2.8 {d14,d16},[r1]           @ Row 8 of prediction in d14
479    VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
480    VLD2.8 {d15,d17},[r0]           @ Row 8 of source in d15
481
482    B LUMA_LOAD_END
483
484CHROMA_V_LOAD:
485    LDR r5,[sp,#72]                 @ r5 = src_strd
486    LDR r4,[sp,#76]                 @ r4 = pred_strd
487
488    VLD2.8 {d0,d2},[r1],r4          @ Row 1 of prediction in d2
489    VLD2.8 {d1,d3},[r0],r5          @ Row 1 of source in d3
490
491    VABDL.U8 q15,d3,d2              @ Row 1 of absolute difference in q15
492    VLD2.8 {d4,d6},[r1],r4          @ Row 2 of prediction in d6
493    VSUBL.U8 q0,d3,d2               @ Row 1 of residue in q0
494    VLD2.8 {d5,d7},[r0],r5          @ Row 2 of source in d7
495
496    VABDL.U8 q9,d7,d6               @ Row 2 of absolute difference in q9
497    VLD2.8 {d8,d10},[r1],r4         @ Row 3 of prediction in d10
498    VSUBL.U8 q1,d7,d6               @ Row 2 of residue in q1
499    VLD2.8 {d9,d11},[r0],r5         @ Row 3 of source in d11
500
501    VABAL.U8 q15,d11,d10            @ Row 3 of absolute difference accumulated in q15
502    VLD2.8 {d6,d8},[r1],r4          @ Row 4 of prediction in d8
503    VSUBL.U8 q2,d11,d10             @ Row 3 of residue in q2
504    VLD2.8 {d7,d9},[r0],r5          @ Row 4 of source in d9
505
506    VABAL.U8 q9,d9,d8               @ Row 4 of absolute difference accumulated in q9
507    VLD2.8 {d10,d12},[r1],r4        @ Row 5 of prediction in d12
508    VSUBL.U8 q3,d9,d8               @ Row 4 of residue in q3
509    VLD2.8 {d11,d13},[r0],r5        @ Row 5 of source in d13
510
511    VABDL.U8 q10,d13,d12            @ Row 5 of absolute difference in q10
512    VLD2.8 {d14,d16},[r1],r4        @ Row 6 of prediction in d16
513    VSUBL.U8 q4,d13,d12             @ Row 5 of residue in q4
514    VLD2.8 {d15,d17},[r0],r5        @ Row 6 of source in d17
515
516    VABAL.U8 q15,d17,d16            @ Row 6 of absolute difference accumulated in q15
517    VLD2.8 {d12,d14},[r1],r4        @ Row 7 of prediction in d14
518    VSUBL.U8 q5,d17,d16             @ Row 6 of residue in q5
519    VLD2.8 {d13,d15},[r0],r5        @ Row 7 of source in d15
520
521    VABAL.U8 q9,d15,d14             @ Row 7 of absolute difference accumulated in q9
522    VSUBL.U8 q6,d15,d14             @ Row 7 of residue in q6
523
524    VLD2.8 {d14,d16},[r1]           @ Row 8 of prediction in d14
525    VLD2.8 {d15,d17},[r0]           @ Row 8 of source in d15
526    VSWP.8 d14,d16
527    VSWP.8 d15,d17
528
529    B LUMA_LOAD_END
530
531LUMA_LOAD:
532
533    LDR r5,[sp,#72]                 @ r5 = src_strd
534    LDR r4,[sp,#76]                 @ r4 = pred_strd
535
536    VLD1.64 d0,[r1],r4              @ Row 1 of prediction in d0
537    VLD1.64 d1,[r0],r5              @ Row 1 of source in d1
538
539    VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
540    VLD1.64 d2,[r1],r4              @ Row 2 of prediction in d2
541    VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
542    VLD1.64 d3,[r0],r5              @ Row 2 of source in d3
543
544    VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
545    VLD1.64 d4,[r1],r4              @ Row 3 of prediction in d4
546    VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
547    VLD1.64 d5,[r0],r5              @ Row 3 of source in d5
548
549    VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
550    VLD1.64 d6,[r1],r4              @ Row 4 of prediction in d6
551    VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
552    VLD1.64 d7,[r0],r5              @ Row 4 of source in d7
553
554    VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
555    VLD1.64 d8,[r1],r4              @ Row 5 of prediction in d8
556    VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
557    VLD1.64 d9,[r0],r5              @ Row 5 of source in d9
558
559    VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
560    VLD1.64 d10,[r1],r4             @ Row 6 of prediction in d10
561    VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
562    VLD1.64 d11,[r0],r5             @ Row 6 of source in d11
563
564    VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
565    VLD1.64 d12,[r1],r4             @ Row 7 of prediction in d12
566    VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
567    VLD1.64 d13,[r0],r5             @ Row 7 of source in d13
568
569    VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
570    VLD1.64 d14,[r1]                @ Row 8 of prediction in d14
571    VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
572    VLD1.64 d15,[r0]                @ Row 8 of source in d15
573
574LUMA_LOAD_END:
575
576    @ Transform stage 1
577    @ Transposing residue matrix
578
579    VABAL.U8 q10,d15,d14            @ Row 8 of absolute difference accumulated in q10
580    VTRN.16 q0,q1                   @ Transpose residue matrix step (1a)
581    VSUBL.U8 q7,d15,d14             @ Row 8 of residue in q7
582    VTRN.16 q2,q3                   @ Transpose residue matrix step (1b)
583
584    VTRN.16 q4,q5                   @ Transpose residue matrix step (1c)
585    VTRN.16 q6,q7                   @ Transpose residue matrix step (1d)
586    VTRN.32 q0,q2                   @ Transpose residue matrix step (2a)
587    VTRN.32 q1,q3                   @ Transpose residue matrix step (2b)
588
589    VADD.U16 q8,q15,q9              @ SAD calculation (1)
590    VTRN.32 q4,q6                   @ Transpose residue matrix step (2c)
591    VTRN.32 q5,q7                   @ Transpose residue matrix step (2d)
592
593    VADD.U16 q8,q8,q10              @ SAD calculation (2)
594    VSWP d1,d8                      @ Transpose residue matrix step (3a)
595    VSWP d3,d10                     @ Transpose residue matrix step (3b)
596
597    VADD.U16 d16,d16,d17            @ SAD calculation (3)
598    VSWP d7,d14                     @ Transpose residue matrix step (3c)
599    VSWP d5,d12                     @ Transpose residue matrix step (3d)
600                                    @ Columns of residue C0-C7 (8x8 matrix) in q0-q7
601    VPADDL.U16 d16,d16              @ SAD calculation (4)
602
603    @ Evaluating first step in Butterfly diagram
604
605    VADD.S16 q10,q0,q7              @ q10 = C0 + C7
606    VADD.S16 q11,q1,q6              @ q11 = C1 + C6
607    VPADDL.U32 d16,d16              @ SAD calculation (5)
608    VADD.S16 q12,q2,q5              @ q12 = C2 + C5
609    VADD.S16 q13,q3,q4              @ q13 = C3 + C4
610
611    VSUB.S16 q4,q3,q4               @ q4  = C3 - C4
612    VSUB.S16 q5,q2,q5               @ q5  = C2 - C5
613    VSUB.S16 q6,q1,q6               @ q6  = C1 - C6
614    VSUB.S16 q7,q0,q7               @ q7  = C0 - C7
615
616    @ Calculating F0, F2, F4 and F6
617
618    VADD.S16 q1,q11,q12             @ q1  = C1 + C2 + C5 + C6
619    VADD.S16 q2,q10,q13             @ q2  = C0 + C3 + C4 + C7
620
621    MOV r4,#50
622    LSL r4,r4,#16
623    ADD r4,r4,#18
624    MOV r5,#89
625    LSL r5,r5,#16
626    ADD r5,r5,#75
627    VMOV d0,r4,r5                   @ 16-bit aligned, d0[3] = 89, d0[2] = 75, d0[1] = 50, d0[0]=18
628
629    MOV r4,#83
630    LSL r4,r4,#16
631    ADD r4,r4,#36
632    VMOV d1,r4,r4                   @ 16-bit aligned, d1[3] = 83, d1[2] = 36, d1[1] = 83, d1[0]=36
633
634    VSUB.S16 q10,q10,q13            @ q10 = C0 - C3 - C4 + C7
635    VSUB.S16 q11,q11,q12            @ q11 = C1 - C2 - C5 + C6
636    VMOV.32 r0,d16[0]               @ SAD calculation (6) : Return value = SAD
637
638    VSUB.S16 q3,q2,q1               @ q3 = C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7
639    VADD.S16 q2,q2,q1               @ q2 = C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7
640
641    VMULL.S16 q14,d20,d1[1]         @ q14 = [0] of 83*(C0 - C3 - C4 + C7)
642    VMULL.S16 q15,d21,d1[1]         @ q15 = [1] of 83*(C0 - C3 - C4 + C7)
643    VMULL.S16 q9,d20,d1[0]          @ q9  = [0] of 36*(C0 - C3 - C4 + C7)
644    VMULL.S16 q10,d21,d1[0]         @ q10 = [1] of 36*(C0 - C3 - C4 + C7)
645
646    VMLAL.S16 q14,d22,d1[0]         @ q14 = F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
647    VSHLL.S16 q13,d6,#6             @ q13 = F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
648    VMLAL.S16 q15,d23,d1[0]         @ q15 = F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
649    VSHLL.S16 q3,d7,#6              @ q3  = F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
650    VMLSL.S16 q9,d22,d1[1]          @ q9  = F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
651    VSHLL.S16 q12,d4,#6             @ q12 = F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
652    VMLSL.S16 q10,d23,d1[1]         @ q10 = F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
653    VSHLL.S16 q2,d5,#6              @ q2  = F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
654
655    @ Calculating F1, F3, F5 and F7
656
657    MOV r4,#48
658    VST1.64 {d24,d25},[r2]!         @ Row 1 of transform stage 1 F0[0] stored
659    VST1.64 {d4,d5},[r2],r4         @ Row 1 of transform stage 1 F0[1] stored
660    VST1.64 {d28,d29},[r2]!         @ Row 3 of transform stage 1 F2[0] stored
661    VST1.64 {d30,d31},[r2],r4       @ Row 3 of transform stage 1 F2[1] stored
662
663    VST1.64 {d26,d27},[r2]!         @ Row 5 of transform stage 1 F4[0] stored
664    VMULL.S16 q1,d14,d0[3]          @ q1  = [0] of 89*(C0 - C7)
665    VMULL.S16 q8,d15,d0[3]          @ q8  = [1] of 89*(C0 - C7)
666    VST1.64 {d6,d7},[r2],r4         @ Row 5 of transform stage 1 F4[1] stored
667    VMULL.S16 q11,d14,d0[2]         @ q11 = [0] of 75*(C0 - C7)
668    VMULL.S16 q13,d15,d0[2]         @ q13 = [1] of 75*(C0 - C7)
669    VST1.64 {d18,d19},[r2]!         @ Row 7 of transform stage 1 F6[0] stored
670    VMULL.S16 q3,d14,d0[1]          @ q3  = [0] of 50*(C0 - C7)
671    VMULL.S16 q9,d15,d0[1]          @ q9  = [1] of 50*(C0 - C7)
672    VST1.64 {d20,d21},[r2]          @ Row 7 of transform stage 1 F6[1] stored
673    VMULL.S16 q10,d14,d0[0]         @ q10 = [0] of 18*(C0 - C7)
674    VMULL.S16 q7,d15,d0[0]          @ q7  = [1] of 18*(C0 - C7)
675
676    VMLAL.S16 q1,d12,d0[2]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6)
677    VMLAL.S16 q8,d13,d0[2]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6)
678    VMLSL.S16 q11,d12,d0[0]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6)
679    VMLSL.S16 q13,d13,d0[0]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6)
680    VMLSL.S16 q3,d12,d0[3]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6)
681    VMLSL.S16 q9,d13,d0[3]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6)
682    VMLSL.S16 q10,d12,d0[1]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6)
683    VMLSL.S16 q7,d13,d0[1]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6)
684
685    VMLAL.S16 q1,d10,d0[1]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
686    VMLAL.S16 q8,d11,d0[1]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
687    VMLSL.S16 q11,d10,d0[3]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
688    VMLSL.S16 q13,d11,d0[3]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
689    VMLAL.S16 q3,d10,d0[0]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
690    VMLAL.S16 q9,d11,d0[0]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
691    VMLAL.S16 q10,d10,d0[2]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
692    VMLAL.S16 q7,d11,d0[2]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
693
694    VMLAL.S16 q1,d8,d0[0]           @ q1  = F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
695    VMLAL.S16 q8,d9,d0[0]           @ q8  = F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
696    VMLSL.S16 q11,d8,d0[1]          @ q11 = F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
697    VMLSL.S16 q13,d9,d0[1]          @ q13 = F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
698    SUB r2,r2,#176                  @ r2 now points to the second row
699    VMLAL.S16 q3,d8,d0[2]           @ q3  = F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
700    VMLAL.S16 q9,d9,d0[2]           @ q9  = F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
701    VST1.64 {d2,d3},[r2]!           @ Row 2 of transform stage 1 F1[0] stored
702    VMLSL.S16 q10,d8,d0[3]          @ q10 = F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
703    VMLSL.S16 q7,d9,d0[3]           @ q7  = F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
704
705    VST1.64 {d16,d17},[r2],r4       @ Row 2 of transform stage 1 F1[1] stored
706    VST1.64 {d22,d23},[r2]!         @ Row 4 of transform stage 1 F3[0] stored
707    VST1.64 {d26,d27},[r2],r4       @ Row 4 of transform stage 1 F3[1] stored
708    VST1.64 {d6,d7},[r2]!           @ Row 6 of transform stage 1 F5[0] stored
709    VST1.64 {d18,d19},[r2],r4       @ Row 6 of transform stage 1 F5[1] stored
710    VST1.64 {d20,d21},[r2]!         @ Row 8 of transform stage 1 F7[0] stored
711    VST1.64 {d14,d15},[r2]          @ Row 8 of transform stage 1 F7[1] stored
712
713    @ Transform stage 2 (for rows 1-4 of transform stage 1)
714    @ Transposing the 4 rows (F0, F1, F2, F3)
715    @ F0 = {q2,q12},  F1 = {q8,q1}, F2 = {q15,q14} and F3 = {q13,q11}
716
717    VTRN.32 q12,q1                  @ Transposing first half of transform stage 1 (1a)
718    VTRN.32 q14,q11                 @ Transposing first half of transform stage 1 (1b)
719    VSWP d25,d28                    @ Transposing first half of transform stage 1 (2a)
720    VSWP d22,d3                     @ Transposing first half of transform stage 1 (2b)
721
722    VTRN.32 q2,q8                   @ Transposing first half of transform stage 1 (3a)
723    VTRN.32 q15,q13                 @ Transposing first half of transform stage 1 (3b)
724    VSWP d5,d30                     @ Transposing first half of transform stage 1 (4a)
725    VSWP d26,d17                    @ Transposing first half of transform stage 1 (4b)
726                                    @ B0:q12, B1:q1, B2:q14, B3:q11, B4:q2, B5:q8, B6:q15 and B7:q13
727
728    @ Evaluating first step in Butterfly diagram
729
730    VADD.S32 q0,q12,q13             @ q0  = B0 + B7
731    VADD.S32 q5,q11,q2              @ q5  = B3 + B4
732    VADD.S32 q3,q1,q15              @ q3  = B1 + B6
733    VADD.S32 q4,q14,q8              @ q4  = B2 + B5
734
735    VSUB.S32 q7,q14,q8              @ q7  = B2 - B5
736    VSUB.S32 q8,q1,q15              @ q8  = B1 - B6
737    VSUB.S32 q6,q11,q2              @ q6  = B3 - B4
738    VSUB.S32 q9,q12,q13             @ q9  = B0 - B7
739
740    @ Calculating G0, G2, G4 and G6
741
742    MOV r4,#18
743    MOV r5,#50
744    VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
745    VSUB.S32 q2,q0,q5               @ q2  = B0 - B3 - B4 + B7
746
747    MOV r4,#75
748    MOV r5,#89
749    VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
750    VADD.S32 q10,q0,q5              @ q10 = B0 + B3 + B4 + B7
751
752    MOV r4,#36
753    MOV r5,#83
754    VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
755    VSUB.S32 q11,q3,q4              @ q11 = B1 - B2 - B5 + B6
756    VADD.S32 q3,q3,q4               @ q3  = B1 + B2 + B5 + B6
757
758    VMUL.S32 q12,q2,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
759    VMUL.S32 q2,q2,d0[0]            @ q2  = 36*(B0 - B3 - B4 + B7)
760    VMUL.S32 q5,q9,d3[1]            @ q5 = 89*(B0 - B7)
761    VADD.S32 q14,q10,q3             @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
762    VMUL.S32 q4,q9,d3[0]            @ q4 = 75*(B0 - B7)
763    VSUB.S32 q15,q10,q3             @ q15 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
764@    VSHL.S32 q14,q14,#6             ; q14 = G0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
765@    VSHL.S32 q15,q15,#6             ; q15 = G4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
766
767    VMLA.S32 q12,q11,d0[0]          @ q12 = G2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
768    VRSHRN.I32 d28,q14,#5           @ Rounding G0: the <<6 above is skipped, so >>5 stands in for the usual >>11
769    VMLS.S32 q2,q11,d0[1]           @ q2  = G6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
770    VRSHRN.I32 d30,q15,#5           @ Rounding G4: the <<6 above is skipped, so >>5 stands in for the usual >>11
771
772    LDR r4,[sp,#80]                 @ r4 = dst_strd
773    LSL r4,r4,#2                    @ r4 = 2*dst_strd*2
774
775    VMUL.S32 q3,q9,d2[1]            @ q3 = 50*(B0 - B7)
776    VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in G2
777    VMUL.S32 q9,q9,d2[0]            @ q9 = 18*(B0 - B7)
778    VRSHRN.I32 d4,q2,#11            @ Truncating last 11 bits in G6
779
780    VMLA.S32 q5,q8,d3[0]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6)
781    VST1.64 d28,[r3],r4             @ First half-row of row 1 of transform stage 2 (G0) stored
782    VMLS.S32 q4,q8,d2[0]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6)
783
784    VMLS.S32 q3,q8,d3[1]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6)
785    VST1.64 d24,[r3],r4             @ First half-row of row 3 of transform stage 2 (G2) stored
786    VMLS.S32 q9,q8,d2[1]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6)
787
788    VMLA.S32 q5,q7,d2[1]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
789    VST1.64 d30,[r3],r4             @ First half-row of row 5 of transform stage 2 (G4) stored
790    VMLS.S32 q4,q7,d3[1]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
791
792    VMLA.S32 q3,q7,d2[0]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
793    VST1.64 d4,[r3]                 @ First half-row of row 7 of transform stage 2 (G6) stored
794    VMLA.S32 q9,q7,d3[0]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
795
796    VMLA.S32 q5,q6,d2[0]            @ q5 = G1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
797    VMLS.S32 q4,q6,d2[1]            @ q4 = G3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
798    VMLA.S32 q3,q6,d3[0]            @ q3 = G5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
799    VMLS.S32 q9,q6,d3[1]            @ q9 = G7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
800
801    SUB r3,r3,r4,LSL #1
802    SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd*2
803                                    @ r3 is moved from row 7 to row 2
804    VRSHRN.I32 d10,q5,#11           @ Truncating last 11 bits in G1
805    VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in G3
806    VRSHRN.I32 d6,q3,#11            @ Truncating last 11 bits in G5
807    VST1.64 d10,[r3],r4             @ First half-row of row 2 of transform stage 2 (G1) stored
808    VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in G7
809
810    VST1.64 d8,[r3],r4              @ First half-row of row 4 of transform stage 2 (G3) stored
811    VST1.64 d6,[r3],r4              @ First half-row of row 6 of transform stage 2 (G5) stored
812    VST1.64 d18,[r3]!               @ First half-row of row 8 of transform stage 2 (G7) stored
813
814    @ Transform stage 2 (for rows 5-8 of transform stage 1)
815    @ Loading the 4 rows (F4, F5, F6, F7)
816
817    SUB r2,r2,#112                  @ r2 jumps from row 8 to row 5 in temporary memory
818    VLD1.64 {d20,d21},[r2]!         @ q10 = F4[0]
819    VLD1.64 {d22,d23},[r2]!         @ q11 = F4[1]
820    VLD1.64 {d8,d9},[r2]!           @ q4  = F5[0]
821    @ Transposing the 4 rows
822    @ F4 = {q11,q10}, F5 = {q5,q4}, F6 = {q3,q2} and F7 = {q13,q12}
823
824    VTRN.32 q10,q4                  @ Transposing second half of transform stage 1 (1a)
825    VLD1.64 {d10,d11},[r2]!         @ q5  = F5[1]
826    VLD1.64 {d4,d5},[r2]!           @ q2  = F6[0]
827    VLD1.64 {d6,d7},[r2]!           @ q3  = F6[1]
828    VLD1.64 {d24,d25},[r2]!         @ q12 = F7[0]
829    VTRN.32 q2,q12                  @ Transposing second half of transform stage 1 (1b)
830    VLD1.64 {d26,d27},[r2]          @ q13 = F7[1]
831
832    VSWP d21,d4                     @ Transposing second half of transform stage 1 (2a)
833    VSWP d24,d9                     @ Transposing second half of transform stage 1 (2b)
834
835    VTRN.32 q11,q5                  @ Transposing second half of transform stage 1 (3a)
836    VTRN.32 q3,q13                  @ Transposing second half of transform stage 1 (3b)
837    VSWP d26,d11                    @ Transposing second half of transform stage 1 (4b)
838    VSWP d23,d6                     @ Transposing second half of transform stage 1 (4a)
839                                    @ B0:q10, B1:q4, B2:q2, B3:q12, B4:q11, B5:q5, B6:q3 and B7:q13
840
841    @ Evaluating first step in Butterfly diagram
842
843    VADD.S32 q0,q10,q13             @ q0  = B0 + B7
844    VADD.S32 q15,q12,q11            @ q15 = B3 + B4
845    VADD.S32 q1,q4,q3               @ q1  = B1 + B6
846    VADD.S32 q14,q2,q5              @ q14 = B2 + B5
847
848    VSUB.S32 q9,q10,q13             @ q9  = B0 - B7
849    VSUB.S32 q6,q12,q11             @ q6  = B3 - B4
850    VSUB.S32 q7,q2,q5               @ q7  = B2 - B5
851    VSUB.S32 q8,q4,q3               @ q8  = B1 - B6
852
853    @ Calculating H0, H2, H4 and H6
854
855    VADD.S32 q3,q1,q14              @ q3 = B1 + B2 + B5 + B6
856    VSUB.S32 q5,q1,q14              @ q5 = B1 - B2 - B5 + B6
857
858    MOV r4,#18
859    MOV r5,#50
860    VSUB.S32 q4,q0,q15              @ q4 = B0 - B3 - B4 + B7
861    VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
862
863    MOV r4,#75
864    MOV r5,#89
865    VADD.S32 q2,q0,q15              @ q2 = B0 + B3 + B4 + B7
866    VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
867
868    MOV r4,#36
869    MOV r5,#83
870
871    @ Calculating H1, H3, H5 and H7
872
873    VMUL.S32 q10,q9,d3[1]           @ q10 = 89*(B0 - B7)
874    VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
875
876    VMUL.S32 q13,q9,d3[0]           @ q13 = 75*(B0 - B7)
877
878    VMUL.S32 q12,q4,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
879    VADD.S32 q14,q2,q3              @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
880    VMUL.S32 q4,q4,d0[0]            @ q4  = 36*(B0 - B3 - B4 + B7)
881    VSUB.S32 q2,q2,q3               @ q2  = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
882
883
884    VMLA.S32 q12,q5,d0[0]           @ q12 = H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
885@    VSHL.S32 q14,q14,#6             ; q14 = H0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
886    VMLS.S32 q4,q5,d0[1]            @ q4 = H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
887@    VSHL.S32 q2,q15,#6              ; q2 = H4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
888
889    VMUL.S32 q11,q9,d2[1]           @ q11 = 50*(B0 - B7)
890    VRSHRN.I32 d28,q14,#5           @ Rounding H0: the <<6 is skipped, so >>5 stands in for the usual >>11
891    VMUL.S32 q9,q9,d2[0]            @ q9  = 18*(B0 - B7)
892    VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in H2
893
894    VMLA.S32 q10,q8,d3[0]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6)
895    VRSHRN.I32 d4,q2,#5             @ Rounding H4: the <<6 is skipped, so >>5 stands in for the usual >>11
896    VMLS.S32 q13,q8,d2[0]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6)
897    VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in H6
898
899    LDR r4,[sp,#80]                 @ r4 = dst_strd
900    LSL r4,r4,#2                    @ r4 = 2*dst_strd*2
901
902    SUB r3,r3,r4,LSL #2
903    ADD r3,r3,r4,ASR #1             @ r3 = r3 - 7*dst_strd*2
904                                    @ r3 is moved from row 8 to row 1
905    VMLS.S32 q11,q8,d3[1]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6)
906    VST1.64 d28,[r3],r4             @ Second half-row of row 1 of transform stage 2 (H0) stored
907    VMLS.S32 q9,q8,d2[1]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6)
908
909    VMLA.S32 q10,q7,d2[1]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
910    VST1.64 d24,[r3],r4             @ Second half-row of row 3 of transform stage 2 (H2) stored
911    VMLS.S32 q13,q7,d3[1]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
912
913    VMLA.S32 q11,q7,d2[0]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
914    VST1.64 d4,[r3],r4              @ Second half-row of row 5 of transform stage 2 (H4) stored
915    VMLA.S32 q9,q7,d3[0]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
916
917    VMLA.S32 q10,q6,d2[0]           @ q10 = H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
918    VST1.64 d8,[r3]                 @ Second half-row of row 7 of transform stage 2 (H6) stored
919    VMLS.S32 q13,q6,d2[1]           @ q13 = H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
920
921    VMLA.S32 q11,q6,d3[0]           @ q11 = H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
922    VMLS.S32 q9,q6,d3[1]            @ q9  = H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
923
924    SUB r3,r3,r4,LSL #1
925    SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd*2
926                                    @ r3 is moved from row 7 to row 2
927    VRSHRN.I32 d20,q10,#11          @ Truncating last 11 bits in H1
928    VRSHRN.I32 d26,q13,#11          @ Truncating last 11 bits in H3
929    VRSHRN.I32 d22,q11,#11          @ Truncating last 11 bits in H5
930    VST1.64 d20,[r3],r4             @ Second half-row of row 2 of transform stage 2 (H1) stored
931    VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in H7
932
933    VST1.64 d26,[r3],r4             @ Second half-row of row 4 of transform stage 2 (H3) stored
934    VST1.64 d22,[r3],r4             @ Second half-row of row 6 of transform stage 2 (H5) stored
935    VST1.64 d18,[r3]                @ Second half-row of row 8 of transform stage 2 (H7) stored
936
937    vpop {d8 - d15}
938    POP {r4,r5}
939    MOV pc,lr
940
941@/**
942@*******************************************************************************
943@*
944@* @brief
945@*  This function performs residue calculation and forward transform on
946@*  input pixels
947@*
948@* @par Description:
949@*  Performs residue calculation by subtracting the prediction from the
950@*  source, followed by the forward transform
951@*
952@* @param[in] pu1_src
953@*  Input 16x16 pixels
954@*
955@* @param[in] pu1_pred
956@*  Prediction data
957@*
958@* @param[in] pi2_tmp
959@*  Temporary buffer of size 16x16
960@*
961@* @param[out] pi2_dst
962@*  Output 16x16 coefficients
963@*
964@* @param[in] src_strd
965@*  Input stride
966@*
967@* @param[in] pred_strd
968@*  Prediction Stride
969@*
970@* @param[in] dst_strd
971@*  Output Stride
972@*
973@* @param[in] chr_plane
974@*  Chroma plane
975@*
976@* @returns  Void
977@*
978@* @remarks
979@*  None
980@*
981@*******************************************************************************
982@*/
983
984.extern g_ai2_ihevc_trans_16
985.extern g_ai4_ihevc_trans_16
986
987g_ai2_ihevc_trans_16_addr_1:
988.long g_ai2_ihevc_trans_16 - ulbl1 - 8
989
990g_ai2_ihevc_trans_16_addr_2:
991.long g_ai2_ihevc_trans_16 - ulbl2 - 8
992
993g_ai4_ihevc_trans_16_addr:
994.long g_ai4_ihevc_trans_16 - ulbl3 - 8
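
@ Note: the '- ulblN - 8' terms make these literals PC-relative; in ARM state
@ PC reads as the current instruction address plus 8, so the 'ADD R9, R9, PC'
@ at each ulblN label recovers the absolute address of the coefficient table
@ in a position-independent way.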
995
996    .global ihevc_resi_trans_16x16_a9q
997
998ihevc_resi_trans_16x16_a9q:
999
1000.equ TMP_STRIDE        ,  64            @16*4, Stride of tmp register
1001.equ SHIFT             ,  13            @shift = 13; // log2(iWidth) - 1 + g_uiBitIncrement
1002.equ RADD              ,  4096          @1 << (shift - 1);
1003
1004.equ COFF_STD_2B       ,  32            @Stride for g_ai2_ihevc_trans_16 in bytes
1005.equ COFF_STD_W        ,  32            @Stride for g_ai4_ihevc_trans_16 in bytes
1006
1007@ LOAD the function
1008    STMFD          SP!,{r4-r12,LR}      @stack store values of the arguments
1009    vpush          {d8 - d15}
1010    SUB            SP,SP,#32
1011
1012    LDR             R4,[SP,#136]         @get src_strd
1013    LDR             R5,[SP,#140]         @get pred_strd
1014    LDR             R6,[SP,#144]         @get dst_strd
1015    LDR             R14,[SP,#148]        @get chroma_plane
1016
1017    MOV R8,#0                           @Set loop counter
1018    LDR R9,g_ai2_ihevc_trans_16_addr_1    @get 16 bit transform matrix
1019ulbl1:
1020    ADD R9, R9, PC
1021    @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] values of g_ai2_ihevc_trans_16
1022    @and write to stack
1023    MOV R12,#COFF_STD_2B
1024    LSL R12,#2
1025
1026    VLD1.S32 D30[0],[R9],R12
1027    VLD1.S32 D30[1],[R9],R12
1028    VLD1.S32 D31[0],[R9],R12
1029    VLD1.S32 D31[1],[R9],R12
1030
1031    VTRN.S32 D30,D31
1032    VTRN.S16 D30,D31
1033    VST1.S16 {d30,d31},[SP]
1034
1035    LDR R9,g_ai2_ihevc_trans_16_addr_2      @get back 16 bit transform matrix
1036ulbl2:
1037    ADD R9, R9, PC
1038
1039    MOV R7,#TMP_STRIDE
1040
1041    VMOV.S32 Q14,#0
1042
1043@R0         pu1_src
1044@R1         pu1_pred
1045@R2         pi4_tmp
1046@R3         pi2_dst
1047@R4         src_strd
1048@R5         pred_strd
1049@R6         dst_strd
1050@R7         tmp_dst Nx4 block stride
1051@R8         loop cntr
1052@R9         g_ai2_ihevc_trans_16
1053@R10        tmp_dst Nx4 block offset
1054@R11        tmp register
1055@R12        ------
1056@R14        chroma_plane
1057@q14        shift 32 bit
1058@q15        add 32 bit
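@
@ Decomposition sketch (illustrative, one 16-sample column x[], output y[]):
@     for(k = 0; k < 8; k++) { e[k] = x[k] + x[15-k]; o[k] = x[k] - x[15-k]; }
@     for(k = 0; k < 4; k++) { ee[k] = e[k] + e[7-k]; eo[k] = e[k] - e[7-k]; }
@     eee[0] = ee[0] + ee[3];  eeo[0] = ee[0] - ee[3];
@     eee[1] = ee[1] + ee[2];  eeo[1] = ee[1] - ee[2];
@ Outputs 0 and 8 use rows [0]/[8] of g_ai2_ihevc_trans_16 with eee[0..1],
@ outputs 4 and 12 use rows [4]/[12] with eeo[0..1], outputs 2/6/10/14 use the
@ corresponding rows with eo[0..3], and the odd outputs use the odd rows with
@ o[0..7].  The loop below evaluates this for two rows of the block at a time.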
1059
1060CORE_LOOP_16X16_HORIZ:
1061
1062    CMP R14,#-1
1063    BGT INTERLEAVED_LOAD_S1
1064
1065    VLD1.U8 {d0,d1},[R0],R4             @LOAD 1-16 src row 1
1066    VLD1.U8 {d2,d3},[R1],R5             @LOAD 1-16 pred row 1
1067    VLD1.U8 {d4,d5},[R0],R4             @LOAD 1-16 src row 2
1068    VLD1.U8 {d6,d7},[R1],R5             @LOAD 1-16 pred row 2
1069    B    LOAD_DONE
1070
1071INTERLEAVED_LOAD_S1:
1072    CMP R14,#1
1073    BEQ INTERLEAVED_LOAD_S2
1074    VLD2.U8 {Q0,Q1},[R0],R4             @LOAD 1-16 src row 1
1075    VLD2.U8 {Q1,Q2},[R1],R5             @LOAD 1-16 pred row 1
1076    VLD2.U8 {Q2,Q3},[R0],R4             @LOAD 1-16 src row 2
1077    VLD2.U8 {Q3,Q4},[R1],R5             @LOAD 1-16 pred row 2
1078    B LOAD_DONE
1079
1080INTERLEAVED_LOAD_S2:
1081    VLD2.U8 {Q0,Q1},[R0],R4             @LOAD 1-16 src row 1
1082    VSWP.U8 Q0,Q1
1083    VLD2.U8 {Q1,Q2},[R1],R5             @LOAD 1-16 pred row 1
1084    VSWP.U8 Q1,Q2
1085    VLD2.U8 {Q2,Q3},[R0],R4             @LOAD 1-16 src row 2
1086    VSWP.U8 Q2,Q3
1087    VLD2.U8 {Q3,Q4},[R1],R5             @LOAD 1-16 pred row 2
1088    VSWP.U8 Q3,Q4
1089
1090LOAD_DONE:
1091
1092    VSUBL.U8 Q4,D0,D2                   @Get residue 1-8 row 1
1093    VSUBL.U8 Q5,D1,D3                   @Get residue 9-16 row 1
1094    VSUBL.U8 Q6,D4,D6                   @Get residue 1-8 row 2
1095    VSUBL.U8 Q7,D5,D7                   @Get residue 9-16 row 2
1096
1097    @Get blk sads
1098    VABDL.U8 Q15,D0,D2
1099    VABAL.U8 Q15,D1,D3
1100    VABAL.U8 Q15,D4,D6
1101    VABAL.U8 Q15,D5,D7
1102    VADDW.S16 Q14,Q14,D30
1103    VADDW.S16 Q14,Q14,D31
1104
1105    VREV64.S16 Q5,Q5                    @Rev row 1
1106    VREV64.S16 Q7,Q7                    @Rev row 2
1107    VSWP D10,D11
1108    VSWP D14,D15
1109
1110    VADD.S16 Q8 ,Q4,Q5                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 1
1111    VSUB.S16 Q9 ,Q4,Q5                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 1
1112    VADD.S16 Q10,Q6,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 2
1113    VSUB.S16 Q11,Q6,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 2
1114
1115    VREV64.S16    D24,D17               @rev e[k] k-> 4-7 row 1
1116    VREV64.S16    D25,D21               @rev e[k] k-> 4-7 row 2
1117    VMOV.S16    D17,D20
1118
1119    @arrangement OF DATA
1120    @Q8     A1 A2 A3 A4 B1 B2 B3 B4
1121    @Q12    A8 A7 A6 A5 B8 B7 B6 B5
1122
1123    VADD.S16 Q13,Q8,Q12                 @ee[k] = e[k] + e[7 - k] row 1 & 2
1124    VSUB.S16 Q0,Q8,Q12                  @eo[k] = e[k] - e[7 - k] row 1 & 2
1125
1126    @D26 R1ee[0] R1ee[1] R1ee[2] R1ee[3]
1127    @D27 R2ee[0] R2ee[1] R2ee[2] R2ee[3]
1128    VTRN.S32 D26,D27                    @1-cycle stall before it?
1129    @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
1130    @D27 R1ee[2] R1ee[3] R2ee[2] R2ee[3]
1131    VREV32.16 D2,D27                    @1-cycle stall before it?
1132    @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
1133    @D2 R1ee[3] R1ee[2] R2ee[3] R2ee[2]
1134    VMOV.S16 D27,D26
1135    VNEG.S16 D3,D2
1136    @Q13 R1ee[0] R1ee[1] R2ee[0] R2ee[1]  R1ee[0]  R1ee[1]  R2ee[0]  R2ee[1]
1137    @Q1  R1ee[3] R1ee[2] R2ee[3] R2ee[2] -R1ee[3] -R1ee[2] -R2ee[3] -R2ee[2]
1138
1139    @D8 : [0 0] [4 0] [8 0] [12 0]
1140    @D9 : [0 1] [4 1] [8 1] [12 1]
1141    VLD1.S16 {d8,d9},[SP]               @[0 0] [4 0] [8 0] [12 0] [0 1] [4 1] [8 1] [12 1]
1142    VADD.S16 Q1,Q13,Q1                  @ 1-cycle stall before it?
1143    @Q15 R1eee[0] R1eee[1] R2eee[0] R2eee[1] R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
1144
1145    @Q1  R1eee[0] R1eee[1] R2eee[0] R2eee[1]
1146    @    R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
1147    VTRN.S16 D2,D3                      @2-cycle stall before it?
1148    @Q1  R1eee[0] R1eeo[0] R2eee[0] R2eeo[0]
1149    @     R1eee[1] R1eeo[1] R2eee[1] R2eeo[1]
1150
1151    VDUP.S32 D4,D2[0]    @R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]    ;1-cycle stall?
1152    VDUP.S32 D5,D2[1]    @R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
1153    VDUP.S32 D6,D3[0]    @R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
1154    VDUP.S32 D7,D3[1]    @R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
1155
1156    @---------------Process EO--------------------
1157    @ Early start to avoid stalls
1158    MOV R12,#COFF_STD_2B                @Get stride of coeffs
1159
1160    VMULL.S16 Q5,D4,D8                  @   g_ai2_ihevc_trans_16 * R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]
1161    VMLAL.S16 Q5,D6,D9                  @ + g_ai2_ihevc_trans_16 * R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
1162    VMULL.S16 Q6,D5,D8                  @   g_ai2_ihevc_trans_16 * R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
1163    VMLAL.S16 Q6,D7,D9                  @ + g_ai2_ihevc_trans_16 * R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
1164
1165    ADD R11,R9,R12,LSL #1               @Load address of g_ai2_ihevc_trans_16[2]
1166    LSL R12,R12,#2
1167
1168    VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[2][0-4]]
1169
1170    VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[6][0-4]
1171    VMULL.S16 Q1,D26,D0                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R1
1172
1173    VMULL.S16 Q2,D26,D1                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R2
1174
1175    VZIP.S32 Q5,Q6                      @3-cycle instruction
1176    VMULL.S16 Q3,D27,D0                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R1
1177
1178
1179    VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[10][0-4]
1180    VMULL.S16 Q4,D27,D1                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R2
1181
1182    @These values must go to 0 4 8 12 columns hence we need stride *4
1183    LSL R10,R7,#2
1184
1185    VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[14][0-4]
1186
1187    VST1.32 D10,[R2],R10
1188    VMULL.S16 Q8,D27,D1                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R2
1189
1190    VST1.32 D11,[R2],R10
1191    VMULL.S16 Q7,D27,D0                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R1
1192
1193    VST1.32 D12,[R2],R10
1194    VMULL.S16 Q5,D26,D0                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R1
1195
1196    VST1.32 D13,[R2],R10
1197    VMULL.S16 Q6,D26,D1                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R2
1198
1199    SUB R2,R2,R10,LSL #2
1200
1201    @transpose the 4x4 matrix row1
1202    VTRN.32 Q1, Q3                      @R1 transpose1 -- 2 cycles
1203
1204    @transpose the 4x4 matrix row2
1205    VTRN.32 Q2,Q4                       @R2 transpose1 -- 2 cycles
1206
1207    VTRN.32 Q5, Q7                      @R1 transpose1 -- 2 cycles
1208
1209    VTRN.32 Q6,Q8                       @R2 transpose1 -- 2 cycles
1210
1211    VSWP    D10,D3                      @R1 transpose2
1212    VSWP    D14,D7                      @R1 transpose2
1213
1214    VSWP    D12,D5                      @R2 transpose2
1215    VSWP    D16,D9                      @R2 transpose2
1216
1217    VADD.S32 Q5,Q5,Q1                   @R1 add
1218    VADD.S32 Q3,Q3,Q7                   @R1 add
1219
1220    VADD.S32 Q2,Q2,Q4                   @R2 add
1221    VADD.S32 Q6,Q6,Q8                   @R2 add
1222
1223    VADD.S32 Q5,Q5,Q3                   @R1 add
1224
1225    VADD.S32 Q4,Q6,Q2                   @R2 add
1226
1227    @-----------------------Processing O ----------------------------
1228    @ Early start to avoid stalls
1229    MOV R12,#COFF_STD_2B                @Get coeffs stride
1230    LSL R12,R12,#1
1231    ADD R11,R9,#COFF_STD_2B             @Get address of g_ai2_ihevc_trans_16[1]
1232
1233    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7] -- 2 cycles
1234
1235    VZIP.S32 Q5,Q4                      @ 3 cycle instruction
1236    VMULL.S16 Q6,D18,D4                 @o[0][0-3]*  R1
1237
1238
1239    VMLAL.S16 Q6,D19,D5                 @o[0][4-7]*  R1     ; follows MULL instruction: Multiplier accumulator forwarding
1240    @write to memory
1241    @this should go to 2 6 10 14
1242    LSL R10,R7,#2
1243    ADD R2,R2,R7,LSL #1                 @move to third row
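@ R10 equals four pi4_tmp row strides, so after stepping down two rows the
@ four stores below land on rows 2, 6, 10 and 14 of the current column pair.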
1244    VST1.32 D10,[R2],R10
1245    VMULL.S16 Q7,D22,D4                 @o[0][0-3]*  R2
1246
1247    VST1.32 D11,[R2],R10
1248    VMLAL.S16 Q7,D23,D5                 @o[0][4-7]*  R2
1249
1250    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
1251
1252    VST1.32 D8,[R2],R10
1253    VMULL.S16 Q8,D18,D4                 @o[1][0-3]*  R1
1254
1255    VST1.32 D9,[R2],R10
1256    VMLAL.S16 Q8,D19,D5                 @o[1][4-7]*  R1
1257    SUB R2,R2,R10,LSL #2
1258    SUB R2,R2,R7,LSL #1
1259
1260    @--------------------Done processing EO -------------------------
1261
1262    @ -----------------Processing O continues------------------------
1263
1264    VMULL.S16 Q10,D22,D4                @o[1][0-3]*  R2
1265    VMLAL.S16 Q10,D23,D5                @o[1][4-7]*  R2
1266
1267    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
1268
1269    VLD1.S16 {d6,d7},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
1270    VMULL.S16 Q12,D18,D4                @o[2][0-3]*  R1
1271
1272    VMLAL.S16 Q12,D19,D5                @o[2][4-7]*  R1
1273    VMULL.S16 Q0,D18,D6                 @o[3][0-3]*  R1
1274    VMLAL.S16 Q0,D19,D7                 @o[3][4-7]*  R1
1275
1276    VMULL.S16 Q13,D22,D4                @o[2][0-3]*  R2
1277    VMLAL.S16 Q13,D23,D5                @o[2][4-7]*  R2
1278    VMULL.S16 Q1,D22,D6                 @o[3][0-3]*  R2
1279    VMLAL.S16 Q1,D23,D7                 @o[3][4-7]*  R2
1280
1281    @transpose the 4x4 matrix R1
1282    VTRN.32 Q6, Q8                      @ 2-cycle instruction
1283
1284    VTRN.32 Q12,Q0                      @ 2-cycle instruction
1285
1286    @transpose the 4x4 matrix R2
1287    VTRN.32 Q7,Q10                      @ 2-cycle instruction
1288
1289    VTRN.32 Q13,Q1                      @ 2-cycle instruction
1290
1291    VSWP    D24,D13
1292    VSWP    D0, D17
1293
1294    VSWP     D26,D15
1295    VSWP    D2,D21
1296
1297    VADD.S32 Q8 ,Q8 ,Q6
1298    VADD.S32 Q12,Q12,Q0
1299
1300    VADD.S32 Q10,Q10,Q7
1301    VADD.S32 Q13,Q13,Q1
1302
1303    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[9][0-7]
1304    VADD.S32 Q12 ,Q12 ,Q8
1305
1306    VADD.S32 Q13,Q13,Q10
1307    VMULL.S16 Q3,D18,D4                 @o[4][0-3]*  R1
1308    VMLAL.S16 Q3,D19,D5                 @o[4][4-7]*  R1
1309
1310    VZIP.S32 Q12,Q13
1311    VMULL.S16 Q4,D22,D4                 @o[4][0-3]*  R2
1312
1313
1314    VMLAL.S16 Q4,D23,D5                 @o[4][4-7]*  R2
1315    @write to memory
1316    @this should go to 1 3 5 7
1317    ADD R2,R2,R7
1318    LSL R7,R7,#1
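@ One row down with a two-row store step: the next four stores hit rows
@ 1, 3, 5 and 7; rows 9, 11, 13 and 15 are written further below before the
@ stride and the base pointer are restored.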
1319    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[11][0-7]
1320
1321    VST1.32 D24,[R2],R7
1322    VMULL.S16 Q5,D18,D4                 @o[5][0-3]*  R1
1323
1324    VST1.32 D25,[R2],R7
1325    VMLAL.S16 Q5,D19,D5                 @o[5][4-7]*  R1
1326
1327    VST1.32 D26,[R2],R7
1328    VMULL.S16 Q6,D22,D4                 @o[5][0-3]*  R2
1329
1330    VST1.32 D27,[R2],R7
1331    VMLAL.S16 Q6,D23,D5                 @o[5][4-7]*  R2
1332
1333    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[13][0-7]
1334
1335    VLD1.S16 {d2,d3},[R11],R12          @g_ai2_ihevc_trans_16[15][0-7]
1336    VMULL.S16 Q7,D18,D4                 @o[6][0-3]*  R1
1337
1338    VMLAL.S16 Q7,D19,D5                 @o[6][4-7]*  R1
1339    VMULL.S16 Q10,D18,D2                @o[7][0-3]*  R1
1340    VMLAL.S16 Q10,D19,D3                @o[7][4-7]*  R1
1341
1342    VMULL.S16 Q8,D22,D4                 @o[6][0-3]*  R2
1343    VMLAL.S16 Q8,D23,D5                 @o[6][4-7]*  R2
1344    VMULL.S16 Q12,D22,D2                @o[7][0-3]*  R2
1345    VMLAL.S16 Q12,D23,D3                @o[7][4-7]*  R2
1346
1347
1348    @transpose the 4x4 matrix R1
1349    VTRN.32 Q3 ,Q5                      @ 2-cycle instruction
1350
1351    VTRN.32 Q7 ,Q10                     @ transpose step 2 R1 , 2-cycle instruction
1352
1353    @transpose the 4x4 matrix R2
1354    VTRN.32 Q4 ,Q6                      @ 2-cycle instruction
1355
1356    VTRN.32 Q8 ,Q12                     @ transpose step 2 R2 , 2-cycle instruction
1357
1358    VSWP    D14,D7                      @ transpose step 3, R1
1359    VSWP    D20,D11                     @ transpose step 4, R1
1360    VSWP    D16,D9                      @ transpose step 3, R2
1361    VSWP    D24,D13                     @ transpose step 4, R2
1362
1363    VADD.S32 Q5 ,Q5 ,Q3
1364    VADD.S32 Q10,Q10,Q7
1365    VADD.S32 Q6 ,Q6 ,Q4
1366    VADD.S32 Q12,Q12,Q8
1367    VADD.S32 Q10,Q10,Q5
1368    VADD.S32 Q12,Q12,Q6
1369
1370    @ 2-cycle stall
1371    VZIP.S32 Q10,Q12                    @ 3-cycle instruction
1372
1373    @ 2-cycle stall
1374    @this should go to 9 11 13 15
1375    VST1.32 D20,[R2],R7
1376
1377    VST1.32 D21,[R2],R7
1378
1379    VST1.32 D24,[R2],R7
1380
1381    VST1.32 D25,[R2],R7
1382
1383    SUB R2,R2,R7,LSL #3
1384    LSR R7,R7,#1
1385    SUB R2,R2,R7
1386
1387    ADD R2,R2,#8                        @MOVE TO next column pair of pi4_tmp
1388
1389    ADD R8,R8,#2                        @increment loop cntr by 2 (two rows per iteration)
1390    CMP R8,#16                          @check loop cntr
1391    BNE CORE_LOOP_16X16_HORIZ           @loop back until all 16 rows are done
1392
1393
1394@*****************Vertical transform************************************
1395
1396@Initialization for vert transform
1397@pi4_tmp will be the new src
1398@tmp stride will be new src stride
1399@dst will be new pi4_tmp
1400@dst stride will be new tmp stride
1401@trans table will be of 32 bit
1402
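@ In effect the 16x16 forward transform is two applications of the same
@ 16-point partial butterfly: the horizontal pass above transformed each
@ residue row and stored the 32-bit results transposed in pi4_tmp, so this
@ pass can again walk pi4_tmp row-wise (i.e. column-wise with respect to the
@ first-pass output), apply the butterfly with the 32-bit coefficient table,
@ and write rounded 16-bit coefficients to pi2_dst.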
1403    LDR R9,g_ai4_ihevc_trans_16_addr        @get 32 bit transform matrix
1404ulbl3:
1405    ADD R9, R9, PC
1406
1407    SUB R0,R2,#64                       @set pi4_tmp as src [-64 to move back to origin]
1408    MOV R2,R3                           @set dst as tmp
1409    MOV R4,#TMP_STRIDE                  @set tmp stride as src stride
1410    LSL R7,R6,#1                        @Set dst stride as tmp stride
1411    SUB R4,#48                          @Adjust stride for the 3 previous post-increment loads (3*16 bytes)
1412
1413    @Block SAD
1414    VADD.S32 D28,D28,D29
1415    VPADD.S32 D28,D28,D29
1416    VMOV.S32 R3,D28[0]
1417    @ SAD calculation ends -- final value in R3.
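@ Rough equivalent of the reduction above (Q14 = D28:D29 holds the four
@ partial SAD lanes accumulated during the horizontal pass; names are
@ illustrative only):
@
@   sad = lane[0] + lane[1] + lane[2] + lane[3];   /* moved to R3 here and
@                                                     into R0 before return */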
1418
1419    @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1]
1420    @values of g_ai4_ihevc_trans_16 and write to stack
1421    MOV R12,#COFF_STD_W
1422    LSL R12,R12,#2
1423    VLD1.S32 D28,[R9],R12
1424    VLD1.S32 D29,[R9],R12
1425    VLD1.S32 D30,[R9],R12
1426    VLD1.S32 D31,[R9],R12
1427    SUB R9,R9,R12,LSL #2
1428
1429    VREV64.32 Q15,Q15
1430    VTRN.S32 Q14,Q15
1431    VST1.S32 {Q14-Q15},[SP]
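@ The eight coefficient words stored at [SP] here are reloaded in every
@ iteration of CORE_LOOP_16X16_VERT (see the VLD1 from [SP] below) to build
@ the eee/eeo multipliers without re-reading the transform table.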
1432
1433    VMOV.U32 Q14,#RADD                  @get the round factor to q14
1434    VMOV.U32 Q15,#SHIFT                 @Get the shift to neon
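@ Every output of this pass is rounded and narrowed to 16 bits as
@   out = (acc + RADD) >> SHIFT
@ i.e. a VADD with Q14 followed by a VSHRN with #SHIFT.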
1435
1436    MOV R8,#0                           @INIT LOOP
1437
1438CORE_LOOP_16X16_VERT:
1439
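@ Each iteration transforms two rows of pi4_tmp (two columns of the final
@ output). The decomposition below mirrors the comments on the instructions;
@ rough sketch per row (illustrative pseudo code, rounding applied later):
@
@   for (k = 0; k < 8; k++) {
@       e[k] = in[k] + in[15 - k];
@       o[k] = in[k] - in[15 - k];
@   }
@   for (k = 0; k < 4; k++) {
@       ee[k] = e[k] + e[7 - k];
@       eo[k] = e[k] - e[7 - k];
@   }
@   eee[0] = ee[0] + ee[3];    eeo[0] = ee[0] - ee[3];
@   eee[1] = ee[1] + ee[2];    eeo[1] = ee[1] - ee[2];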
1440    VLD1.S32 {D0,D1},[R0]!              @LOAD 1-4 pi4_tmp R1
1441    VLD1.S32 {D2,D3},[R0]!              @LOAD 5-8 pi4_tmp R1
1442    VLD1.S32 {D4,D5},[R0]!              @LOAD 9-12 pi4_tmp R1
1443    VLD1.S32 {D6,D7},[R0],R4            @LOAD 13-16 pi4_tmp R1
1444
1445    VLD1.S32 {D8,D9},[R0]!              @LOAD 1-4 pi4_tmp R2
1446    VLD1.S32 {D10,D11},[R0]!            @LOAD 5-8 pi4_tmp R2
1447    VLD1.S32 {D12,D13},[R0]!            @LOAD 9-12 pi4_tmp R2
1448    VLD1.S32 {D14,D15},[R0],R4          @LOAD 13-16 pi4_tmp R2
1449
1450    VREV64.S32 Q2,Q2                    @Rev 9-12 R1
1451    VREV64.S32 Q3,Q3                    @Rev 13-16 R1
1452    VREV64.S32 Q6,Q6                    @Rev 9-12 R2
1453    VREV64.S32 Q7,Q7                    @Rev 13-16 R2
1454
1455    VSWP D6,D7
1456    VSWP D4,D5
1457    VADD.S32 Q8 ,Q0,Q3                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R1
1458    VSWP D12,D13                        @ dual issued with prev. instruction
1459    VADD.S32 Q9 ,Q1,Q2                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R1
1460    VSWP D14,D15                        @ dual issued with prev. instruction
1461    VSUB.S32 Q10,Q0,Q3                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R1
1462    VSUB.S32 Q11,Q1,Q2                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R1
1463
1464    VADD.S32 Q12,Q4,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R2
1465    VREV64.S32    Q9 ,Q9                @rev e[k] k-> 4-7 R1, dual issued with prev. instruction
1466    VADD.S32 Q13,Q5,Q6                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R2
1467    VSUB.S32 Q0 ,Q4,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R2
1468    VSWP D18,D19                        @ dual issued with prev. instruction
1469    VSUB.S32 Q1 ,Q5,Q6                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R2
1470    VREV64.S32    Q13,Q13               @rev e[k] k-> 4-7 R2, dual issued with prev. instruction
1471
1472    VADD.S32 Q2,Q8,Q9                   @ee[k] = e[k] + e[7 - k] row R1
1473    VSUB.S32 Q3,Q8,Q9                   @eo[k] = e[k] - e[7 - k] row R1
1474    VSWP D26,D27
1475
1476
1477    VADD.S32 Q4,Q12,Q13                 @ee[k] = e[k] + e[7 - k] row R2
1478    VSUB.S32 Q5,Q12,Q13                 @eo[k] = e[k] - e[7 - k] row R2
1479    VREV64.S32 D5,D5                    @rev ee[k] k-> 2-3 R1, dual issued with prev. instruction
1480
1481    VADD.S32 D12,D4,D5                  @eee[0] eee[1]    R1
1482    VSUB.S32 D13,D4,D5                  @eeo[0] eeo[1]    R1
1483    VREV64.S32 D9,D9                    @rev ee[k] k-> 2-3 R2, dual issued with prev. instruction
1484
1485
1486    VADD.S32 D14,D8,D9                  @eee[0] eee[1]    R2
1487    VSUB.S32 D15,D8,D9                  @eeo[0] eeo[1]    R2
1488
1489    VLD1.S32 {Q12,Q13},[SP]             @Load g_ai2_ihevc_trans_16[xx]->  Q12 : [0 0] [8 0] [4 0] [12 0]  Q13 : [0 1] [8 1] [4 1] [12 1]
1490    VREV64.S32 Q8,Q6                    @Q6 : eee[0] eee[1] eeo[0] eeo[1] R1   ->     ;Q8 : eee[1] eee[0] eeo[1] eeo[0] R1
1491
1492    VREV64.S32 Q9,Q7                    @Q7 : eee[0] eee[1] eeo[0] eeo[1] R2     ->    ;Q9 : eee[1] eee[0] eeo[1] eeo[0] R2
1493
1494
1495    VMUL.S32 Q4,Q6,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R1
1496    VMLA.S32 Q4,Q8,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0]    R1
1497
1498    VMUL.S32 Q6,Q7,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R2
1499    VMLA.S32 Q6,Q9,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R2
1500
1501                                        @Q3    :R1E00 R1E01 R1E02 R1E03
1502                                        @Q5    :R2E00 R2E01 R2E02 R2E03
1503    VSWP D7,D10                         @ dual issued with prev. instruction
1504                                        @Q3    :R1E00 R1E01 R2E00 R2E01
1505                                        @Q5    :R1E02 R1E03 R2E02 R2E03
1506    VSWP D7,D11
1507                                        @Q3    :R1E00 R1E01 R2E02 R2E03
1508                                        @Q5    :R1E02 R1E03 R2E00 R2E01
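@ After the two swaps each Q register mixes the eo halves of both rows, so a
@ single VMUL/VMLA against one (partly swapped) coefficient row followed by a
@ VPADD yields the corresponding output for row 1 in one lane and for row 2
@ in the other lane of a D register.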
1509
1510    MOV R12,#COFF_STD_W
1511    ADD R11,R9,R12,LSL #1               @Get address of row 2 of the transform table
1512    LSL R12,R12,#2
1513
1514    VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[2][0-4] -> 2G0 2G1 2G2 2G3, 2-cycle instr.
1515
1516    VADD.S32  Q4,Q4,Q14                 @ROUND  R1
1517    VMUL.S32  Q12,Q3,Q7                 @2G0 2G1 2G2 2G3 * R1E00 R1E01 R2E02 R2E03, 4-cycle instruction
1518    VSWP      D14,D15                   @2G0 2G1 2G2 2G3 -> 2G2 2G3 2G0 2G1, dual issued with prev. instruction
1519
1520    VADD.S32 Q6,Q6,Q14                  @ROUND  R2
1521
1522    VSHRN.S32 D8,Q4,#SHIFT              @NARROW R1
1523
1524    VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[6][0-4]
1525    VSHRN.S32 D9,Q6,#SHIFT              @NARROW R2, dual issued in 2nd cycle
1526
1527    VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4], 4-cycle instruction
1528    VSWP      D16,D17                   @dual issued with prev. instr.
1529
1530    VZIP.S16 D8,D9                      @INTERLEAVE R1 R2 R1 R2 R1 R2 to write
1531    VMLA.S32  Q12,Q5,Q7                 @2G2 2G3 2G0 2G1 * R1E02 R1E03 R2E00 R2E01, 4-cycle instruction
1532
1533
1534    @WRITE INTO MEM the values or wait to be shuffled
1535    @These values must go to columns 0 4 8 12
1536    LSL R10,R7,#2
1537    VST1.S32 D8[0],[R2],R10
1538
1539    VST1.S32 D9[0],[R2],R10
1540
1541    VST1.S32 D8[1],[R2],R10
1542    VPADD.S32 D18,D24,D25               @D18[0] -> 2G0*R1E00+2G1*R1E01 2G2*R2E02+2G3*R2E03
1543                                        @D18[1] -> 2G2*R1E02+2G3*R1E03 2G0*R2E00+2G1*R2E01
1544
1545    VST1.S32 D9[1],[R2],R10
1546    VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]
1547    LSL R10,R10,#2
1548    SUB R2,R2,R10
1549
1550    VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[10][0-4]
1551
1552    VMUL.S32  Q6,Q3,Q7                  @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4]
1553    VSWP      D14,D15                   @ dual issued with prev. instruction
1554    VPADD.S32 D19,D4,D5
1555
1556    VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[14][0-4]
1557    VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4]
1558    VSWP      D16,D17
1559
1560    VMLA.S32  Q6,Q5,Q7                  @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4]
1561    VADD.S32 Q9,Q9,Q14                  @Round by RADD R1
1562    VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4]
1563    VSHRN.S32 D8,Q9,#SHIFT              @Shift by SHIFT
1564    VPADD.S32 D24,D12,D13
1565    @---------------Processing O, Row 1 and Row 2--------------------------------------
1566    @ Early start to avoid stalls
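@ Odd outputs (rows 1, 3, ..., 15) of both columns: 32-bit dot products of
@ o[0..7] with the odd coefficient rows, reduced with VPADD/VADD, rounded
@ with RADD and narrowed by SHIFT before the strided stores below.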
1567    MOV R12,#COFF_STD_W
1568    ADD R11,R9,R12                      @Get 1ST row
1569    LSL R12,R12,#1
1570
1571    LSL R10,R7,#2
1572    ADD R2,R2,R7,LSL #1                 @move to third row
1573    @this should go to 2  6 10 14
1574    VST1.S32 D8[0],[R2],R10
1575
1576    VST1.S32 D8[1],[R2],R10
1577    VPADD.S32 D25,D4,D5                 @ dual issued with prev. instruction in 2nd cycle
1578
1579    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7]
1580    VADD.S32 Q12,Q12,Q14                @Round by RADD R2, dual issued with prev. instruction in 2nd cycle
1581    VMUL.S32 Q6,Q2,Q0                   @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R2
1582    VMLA.S32 Q6,Q3,Q1                   @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R2
1583    VSHRN.S32 D9,Q12,#SHIFT             @Shift by SHIFT
1584
1585    VMUL.S32 Q2,Q2,Q10                  @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R1
1586    VMLA.S32 Q2,Q3,Q11                  @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R1
1587    VADD.S32 D11,D12,D13                @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R2, dual issued with prev. instr.
1588    VST1.S32 D9[0],[R2],R10
1589
1590    VST1.S32 D9[1],[R2],R10
1591    VADD.S32 D10,D4,D5                  @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R1, dual issued with prev. instr.
1592    LSL R10,R10,#2                      @go back to origin
1593    SUB R2,R2,R10
1594    SUB R2,R2,R7,LSL #1
1595
1596    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
1597
1598    VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
1599    VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
1600    VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
1601    VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
1602
1603    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
1604    VADD.S32 D18,D14,D15
1605    VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
1606    VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
1607    VADD.S32 D19,D16,D17
1608    VMUL.S32 Q4,Q2,Q0
1609    VMLA.S32 Q4,Q3,Q1
1610    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
1611    VADD.S32 D26,D24,D25                @ dual issued with prev. instr.
1612    VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
1613    VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
1614    VADD.S32 D27,D8,D9
1615    VMUL.S32 Q4,Q2,Q0
1616    VMLA.S32 Q4,Q3,Q1
1617    VADD.S32 D12,D12,D13
1618    @Q5 Q9 Q13 Q6
1619    VPADD.S32 D14,D10,D11
1620    VPADD.S32 D15,D18,D19
1621    VPADD.S32 D16,D26,D27
1622    VADD.S32  D13,D8,D9
1623    VADD.S32 Q9,Q7,Q14
1624    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[9][0-7]
1625    VPADD.S32 D17,D12,D13               @ dual issued with prev. instr. in 2nd cycle
1626
1627    VMUL.S32 Q4,Q2,Q10                  @o[0][0-3]
1628    VMLA.S32 Q4,Q3,Q11                  @o[0][4-7]
1629
1630    VADD.S32 Q12,Q8,Q14
1631
1632    VMUL.S32 Q6,Q2,Q0                   @o[0][0-3]
1633    VMLA.S32 Q6,Q3,Q1                   @o[0][4-7]
1634
1635    VSHRN.S32 D26,Q9,#SHIFT
1636    VSHRN.S32 D27,Q12,#SHIFT
1637    VADD.S32 D10,D8,D9
1638    @write to memory this should go to 1 3 5 7
1639    ADD R2,R2,R7
1640    LSL R7,R7,#1
1641    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[11][0-7]
1642    VADD.S32 D11,D12,D13                @ dual issued with prev. instr.
1643
1644    VST1.S32 D26[0],[R2],R7
1645    VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
1646    VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
1647    VST1.S32 D26[1],[R2],R7
1648    VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
1649    VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
1650    VST1.S32 D27[0],[R2],R7
1651    VADD.S32 D18,D14,D15
1652    VST1.S32 D27[1],[R2],R7
1653
1654    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[13][0-7]
1655    VADD.S32 D19,D16,D17                @ dual issued with prev. instr.
1656
1657    VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
1658    VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
1659    VMUL.S32 Q4,Q2,Q0
1660    VMLA.S32 Q4,Q3,Q1
1661
1662    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[15][0-7]
1663    VADD.S32 D26,D24,D25
1664
1665    VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
1666    VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
1667    VADD.S32  D27,D8,D9
1668
1669    VMUL.S32 Q4,Q2,Q0
1670    VMLA.S32 Q4,Q3,Q1
1671    VADD.S32 D12,D12,D13
1672    @Q5 Q9 Q13 Q6
1673    VPADD.S32 D14,D10,D11
1674    VPADD.S32 D15,D18,D19
1675    VPADD.S32 D16,D26,D27
1676    VADD.S32  D13,D8,D9
1677    VADD.S32 Q9,Q7,Q14
1678    @ 1- cycle stall?
1679    VPADD.S32 D17,D12,D13
1680    VSHRN.S32 D22,Q9,#SHIFT
1681    VADD.S32 Q10,Q8,Q14
1682    @ 2-cycle stall?
1683    VSHRN.S32 D23,Q10,#SHIFT
1684
1685    @this should go to 9 11 13 15
1686    @LSL R11,R7,#1
1687    VST1.S32 D22[0],[R2],R7
1688    VST1.S32 D22[1],[R2],R7
1689    VST1.S32 D23[0],[R2],R7
1690    VST1.S32 D23[1],[R2],R7
1691
1692    SUB R2,R2,R7,LSL #3
1693    LSR R7,R7,#1
1694    SUB R2,R2,R7
1695
1696    ADD R2,R2,#4                        @MOVE TO next column pair of pi2_dst
1697
1698    ADD R8,R8,#2                        @increment loop cntr by 2 since we process 2 cols per iteration
1699    CMP R8,#16                          @check loop cntr
1700    BNE CORE_LOOP_16X16_VERT            @loop back until all 16 cols are done
1701
1702    MOV R0,R3                           @return the block SAD computed above
1703
1704    ADD SP,SP,#32                       @release the 32 bytes used for the stacked coefficients
1705    vpop {d8 - d15}
1706    LDMFD          sp!,{r4-r12,PC}      @restore the saved registers and return to caller
1707
1708