1@/******************************************************************************
2@ *
3@ * Copyright (C) 2018 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20
21.text
22.align 4
23
24@/**
25@/*******************************************************************************
26@/*
27@/* @brief
28@/*  Residue calculation and Forward Transform for 4x4 block with 8-bit input
29@/*
30@/* @par Description:
31@/*  Performs residue calculation by subtracting the prediction from the
32@/*  source, followed by the forward transform
33@/*
34@/* @param[in] pu1_src
35@/*  Input 4x4 pixels
36@/*
37@/* @param[in] pu1_pred
38@/*  Prediction data
39@/*
40@/* @param[in] pi4_tmp
41@/*  Temporary buffer of size 4x4
42@/*
43@/* @param[out] pi2_dst
44@/*  Output 4x4 coefficients
45@/*
46@/* @param[in] src_strd
47@/*  Input stride
48@/*
49@/* @param[in] pred_strd
50@/*  Prediction Stride
51@/*
52@/* @param[in] dst_strd
53@/*  Output Stride
54@/*
55@/* @param[in] chr_plane
56@/*  Chroma plane
57@/*
58@/* @returns  Void
59@/*
60@/* @remarks
61@/*  None
62@/*
63@/*******************************************************************************
64@/*/
65
66@/**************Variables Vs Registers*****************************************
67@    r0 => *pu1_src
68@    r1 => *pu1_pred
69@    r2 => *pi4_temp
70@    r3 => *pi2_dst
71@    r4 => src_strd
72@    r5 => pred_strd
73@    r6 => dst_strd
74@    r7 => chroma_plane
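@ Reference sketch (not part of the original source): a minimal C-style outline
@ of what this routine computes for the luma path, assuming the library's
@ UWORD8/WORD16/WORD32/UWORD32 typedefs and abs() from <stdlib.h>; the function
@ and variable names below are illustrative only.
@
@ UWORD32 resi_trans_4x4_sketch(UWORD8 *pu1_src, UWORD8 *pu1_pred,
@                               WORD16 *pi2_dst, WORD32 src_strd,
@                               WORD32 pred_strd, WORD32 dst_strd)
@ {
@     WORD32 resi[4][4], tmp[4][4];
@     UWORD32 sad = 0;
@     for(WORD32 i = 0; i < 4; i++)
@         for(WORD32 j = 0; j < 4; j++)
@         {
@             resi[i][j] = pu1_src[i * src_strd + j] - pu1_pred[i * pred_strd + j];
@             sad += abs(resi[i][j]);
@         }
@     for(WORD32 j = 0; j < 4; j++)       /* first pass, kept at full precision */
@     {
@         WORD32 e0 = resi[0][j] + resi[3][j], e1 = resi[1][j] + resi[2][j];
@         WORD32 o0 = resi[0][j] - resi[3][j], o1 = resi[1][j] - resi[2][j];
@         tmp[0][j] = 64 * (e0 + e1);      tmp[2][j] = 64 * (e0 - e1);
@         tmp[1][j] = 83 * o0 + 36 * o1;   tmp[3][j] = 36 * o0 - 83 * o1;
@     }
@     for(WORD32 i = 0; i < 4; i++)       /* second pass, single rounded shift by 9 */
@     {
@         WORD32 e0 = tmp[i][0] + tmp[i][3], e1 = tmp[i][1] + tmp[i][2];
@         WORD32 o0 = tmp[i][0] - tmp[i][3], o1 = tmp[i][1] - tmp[i][2];
@         pi2_dst[i * dst_strd + 0] = (WORD16)((64 * (e0 + e1) + 256) >> 9);
@         pi2_dst[i * dst_strd + 2] = (WORD16)((64 * (e0 - e1) + 256) >> 9);
@         pi2_dst[i * dst_strd + 1] = (WORD16)((83 * o0 + 36 * o1 + 256) >> 9);
@         pi2_dst[i * dst_strd + 3] = (WORD16)((36 * o0 - 83 * o1 + 256) >> 9);
@     }
@     return sad;
@ }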
75
76    .global ihevc_resi_trans_4x4_a9q
77
78ihevc_resi_trans_4x4_a9q:
79
80    STMFD          sp!, {r4-r7, r14}   @ store all the register components from caller function to memory
81    LDR            r4, [sp,#20]        @ r4 contains src_strd
82    LDR            r5, [sp,#24]        @ r5 contains pred_strd
83    LDR            r6, [sp,#28]        @ r6 contains dst_strd
84    LDR            r7, [sp,#32]        @ r7 chroma plane
85
86    CMP            r7, #-1
87    BEQ            NON_INTERLEAVE_LOAD @if flag == NULL_PLANE, use non-interleaving loads
88
89    VLD1.64        d0, [r0], r4        @ load row 0 src
90    VLD1.64        d4, [r0], r4        @ load row 1 src
91    VLD1.64        d1, [r0], r4        @ load row 2 src
92    VLD1.64        d5, [r0], r4        @ load row 3 src
93    VUZP.8         d0, d4              @ de-interleave pu1_src: plane-0 (U) samples in d0, plane-1 (V) in d4
94    VUZP.8         d1, d5              @ de-interleave pu1_src: plane-0 (U) samples in d1, plane-1 (V) in d5
95
96    VLD1.64        d2, [r1], r5        @ load row 0 pred
97    VLD1.64        d6, [r1], r5        @ load row 1 pred
98    VLD1.64        d3, [r1], r5        @ load row 2 pred
99    VLD1.64        d7, [r1], r5        @ load row 3 pred
100    VUZP.8         d2, d6              @ de-interleave pu1_pred: plane-0 (U) samples in d2, plane-1 (V) in d6
101    VUZP.8         d3, d7              @ de-interleave pu1_pred: plane-0 (U) samples in d3, plane-1 (V) in d7
102
103    CMP            r7, #0
104    BEQ            LOAD_END
105    VSWP.8         d0, d4
106    VSWP.8         d1, d5
107    VSWP.8         d2, d6
108    VSWP.8         d3, d7
109
110    B LOAD_END
111
112NON_INTERLEAVE_LOAD:
113    VLD1.U32     d0[0], [r0], r4       @ load row 0 src
114    VLD1.U32     d0[1], [r0], r4       @ load row 1 src
115    VLD1.U32     d1[0], [r0], r4       @ load row 2 src
116    VLD1.U32     d1[1], [r0], r4       @ load row 3 src
117
118    VLD1.U32     d2[0], [r1], r5       @ load row 0 pred
119    VLD1.U32     d2[1], [r1], r5       @ load row 1 pred
120    VLD1.U32     d3[0], [r1], r5       @ load row 2 pred
121    VLD1.U32     d3[1], [r1], r5       @ load row 3 pred
122
123LOAD_END:
124    @ Finding the residue
125    VSUBL.U8    q2, d0, d2             @ q2 contains 1st 16-bit 8 residues
126    VSUBL.U8    q3, d1, d3             @ q3 contains 2nd 16-bit 8 residues
127
128    @ SAD calculation
129    VABDL.U8    q12, d0, d2            @ q12 contains absolute differences
130    VABAL.U8    q12, d1, d3            @ q12 accumulates absolute differences
131    VADD.U16    d26, d24, d25          @ add d-registers of q12
132    VPADDL.U16  d27, d26               @ d27 contains 2 32-bit values that have to be added
133    VPADDL.U32  d28, d27               @ d28 contains 64-bit SAD, only LSB important
134    VMOV.32     r0, d28[0]             @ SAD stored in r0 for return
135    @ SAD calculation ends
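    @ The reduction above is equivalent to the following sketch (illustrative):
    @     sad = 0; for(k = 0; k < 16; k++) sad += abs(src[k] - pred[k]);
    @ VABDL/VABAL build eight 16-bit partial sums, and VADD/VPADDL fold them
    @ pairwise down to the single 32-bit total returned in r0.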
136
137    @ Forward transform - step 1
138    VMOV.I16    d2, #64                @ generate immediate constant in d2 for even row multiplication
139    VTRN.16     d4, d5                 @ 3-step transpose of residue matrix starts
140    VTRN.16     d6, d7                 @ 2nd step of the 3-step matrix transpose
141    VMOV.I16    d0, #83                @ generate immediate constant in d0 for odd row multiplication
142    VTRN.32     q2, q3                 @ Final step of matrix transpose
143
144    VMOV.I16    d1, #36                @ generate immediate constant in d1 for odd row multiplication
145    VSWP        d6, d7                 @ vector swap to allow even and odd row calculation using Q registers
146    VADD.S16    q10, q2, q3            @ q10 has the even array
147    VSUB.S16    q11, q2, q3            @ q11 has the odd array
148    VMULL.S16   q12, d20, d2           @ e[0]*64
149    VMLAL.S16   q12, d21, d2[0]        @ row 1 of results: e[0]*64 + e[1]*64
150    VMULL.S16   q13, d20, d2           @ e[0]*64
151    VMLSL.S16   q13, d21, d2[0]        @ row 3 of results: e[0]*64 - e[1]*64
152    VMULL.S16   q8, d22, d0            @ o[0]*83
153    VMLAL.S16   q8, d23, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
154    VMULL.S16   q9, d22, d1            @ o[0]*36
155    VMLSL.S16   q9, d23, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83
156
157    @ Forward transform - step 2
158    VMOV.I32    d2, #64                @ generate immediate constant in d2 for even row multiplication
159    VMOV.I32    d0, #83                @ generate immediate constant in d0 for odd row multiplication
160    VTRN.32     q12, q8                @ 4-step transpose of residue matrix starts
161    VTRN.32     q13, q9                @ 2nd step of the 4-step matrix transpose
162
163    VMOV.I32    d1, #36                @ generate immediate constant in d1 for odd row multiplication
164    VSWP        d25, d26               @ 3rd step of the 4-step matrix transpose
165    VSWP        d17, d18               @ 4th step of the 4-step matrix transpose
166    VADD.S32    q2, q12, q9            @ e[0]
167    VADD.S32    q3, q8, q13            @ e[1]
168    VSUB.S32    q10, q12, q9           @ o[0]
169    VSUB.S32    q11, q8, q13           @ o[1]
170
171    VMUL.S32    q12, q2, d2[0]         @ e[0]*64
172    VMLA.S32    q12, q3, d2[0]         @ row 1 of results: e[0]*64 + e[1]*64
173    VMUL.S32    q13, q2, d2[0]         @ e[0]*64
174    VMLS.S32    q13, q3, d2[0]         @ row 3 of results: e[0]*64 - e[1]*64
175    VMUL.S32    q8, q10, d0[0]         @ o[0]*83
176    VMLA.S32    q8, q11, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
177    VMUL.S32    q9, q10, d1[0]         @ o[0]*36
178    VMLS.S32    q9, q11, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83
179
180    VRSHRN.S32  d0, q12, #9            @ (row1 + 256)/512
181    VRSHRN.S32  d1, q8, #9             @ (row2 + 256)/512
182    VRSHRN.S32  d2, q13, #9            @ (row3 + 256)/512
183    VRSHRN.S32  d3, q9, #9             @ (row4 + 256)/512
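    @ Scaling note (sketch): both transform stages above are kept at full 32-bit
    @ precision, so the usual per-stage shifts of the 4x4 forward transform
    @ (1 and 8 for 8-bit input) are folded into this single rounded shift:
    @ coeff = (sum + 256) >> 9, i.e. (sum + 256)/512.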
184
185    LSL         r7, r6, #1             @ r7 = 2*dst_strd, as pi2_dst contains 2-byte integers
186    VST1.U16    d0, [r3], r7           @ store 1st row of result
187    VST1.U16    d1, [r3], r7           @ store 2nd row of result
188    VST1.U16    d2, [r3], r7           @ store 3rd row of result
189    VST1.U16    d3, [r3], r7           @ store 4th row of result
190
191    LDMFD       sp!,{r4-r7,r15}        @ Reload the registers from SP
192
193    @ Function End
194
195@/**
196@*******************************************************************************
197@*
198@* @brief
199@*  This function performs residue calculation and forward  transform type 1
200@*  on input pixels
201@*
202@* @description
203@*  Performs residue calculation by subtracting the prediction from the
204@*  source, followed by the forward transform
205@*
206@* @param[in] pu1_src
207@*  Input 4x4 pixels
208@*
209@* @param[in] pu1_pred
210@*  Prediction data
211@*
212@* @param[in] pi2_tmp
213@*  Temporary buffer of size 4x4
214@*
215@* @param[out] pi2_dst
216@*  Output 4x4 coefficients
217@*
218@* @param[in] src_strd
219@*  Input stride
220@*
221@* @param[in] pred_strd
222@*  Prediction Stride
223@*
224@* @param[in] dst_strd
225@*  Output Stride
226@*
227@* @param[in] chr_plane (unused)
228@*  Chroma plane
229@*
230@* @returns void
231@*
232@* @remarks
233@*  None
234@*
235@*******************************************************************************
236@*/
237@ UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
238@                                     UWORD8 *pu1_pred,
239@                                     WORD32 *pi4_temp,
240@                                     WORD16 *pi2_dst,
241@                                     WORD32 src_strd,
242@                                     WORD32 pred_strd,
243@                                     WORD32 dst_strd,
244@                                     WORD32 chroma_plane);
245@
246@**************Variables Vs Registers*******************************************
247@
248@ r0 - pu1_src
249@ r1 - pu1_pred
250@ r2 - pi4_temp
251@ r3 - pi2_dst
252@
253@ [sp]   - src_strd
254@ [sp+4] - pred_strd
255@ [sp+8] - dst_strd
256@ [sp+12] - chroma_plane
257@
258@*******************************************************************************
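@ Reference sketch (not from the original source): the 4x4 DST kernel that both
@ stages below evaluate, written as a minimal C-style helper with illustrative
@ names; the first pass uses shift = 1 and the second pass shift = 8, matching
@ the VRSHR #1 / VRSHRN #8 pairs in the code.
@
@ static void dst4_1d_sketch(const WORD32 x[4], WORD32 y[4], WORD32 shift)
@ {
@     WORD32 rnd = 1 << (shift - 1);
@     y[0] = (29 * x[0] + 55 * x[1] + 74 * x[2] + 84 * x[3] + rnd) >> shift;
@     y[1] = (74 * (x[0] + x[1] - x[3])                     + rnd) >> shift;
@     y[2] = (84 * x[0] - 29 * x[1] - 74 * x[2] + 55 * x[3] + rnd) >> shift;
@     y[3] = (55 * x[0] - 84 * x[1] + 74 * x[2] - 29 * x[3] + rnd) >> shift;
@ }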
259
260    .global ihevc_resi_trans_4x4_ttype1_a9q
261
262ihevc_resi_trans_4x4_ttype1_a9q:
263
264    PUSH {r4}
265    vpush {d8 - d15}
266
267    LDR r2,[sp,#68]                 @ r2 = src_strd
268    LDR r4,[sp,#72]                 @ r4 = pred_strd
269
270    VLD1.32 d2[0],[r0],r2           @ Row 1 of source in d2[0]
271    VLD1.32 d3[0],[r1],r4           @ Row 1 of prediction in d3[0]
272    VLD1.32 d2[1],[r0],r2           @ Row 2 of source in d2[1]
273    VLD1.32 d3[1],[r1],r4           @ Row 2 of prediction in d3[1]
274
275    VLD1.32 d8[0],[r0],r2           @ Row 3 of source in d8[0]
276    VABDL.U8 q0,d2,d3               @ Absolute differences of rows 1 and 2 in q0
277                                    @ R2:[d11[3] d11[2] d11[1] d11[0]] => Row 2 of residue
278    VLD1.32 d9[0],[r1],r4           @ Row 3 of prediction in d9[0]
279    VSUBL.U8 q5,d2,d3               @ R1:[d10[3] d10[2] d10[1] d10[0]] => Row 1 of residue
280    VLD1.32 d8[1],[r0]              @ Row 4 of source in d8[1]
281    VTRN.16 d10,d11                 @ Transpose step 1
282    VLD1.32 d9[1],[r1]              @ Row 4 of prediction in d9[1]
283
284    VSUBL.U8 q6,d8,d9               @ R3:[d12[3] d12[2] d12[1] d12[0]] => Row 3 of residue
285                                    @ R4:[d13[3] d13[2] d13[1] d13[0]] => Row 4 of residue
286    VABAL.U8 q0,d8,d9               @ Absolute differences of rows 3 and 4 accumulated in q0
287    VTRN.16 d12,d13                 @ Transpose step 2
288    VTRN.32 q5,q6                   @ Transpose step 3, Residue block transposed
289                                    @ Columns are in C1:d10, C2:d11, C3:d12 and C4:d13
290    VADD.S16 d23,d11,d13            @ d23 = C2 + C4
291    VMOV.I32 d6,#55                 @ Constant used for multiplication
292    VADD.S16 d22,d10,d13            @ d22 = C1 + C4
293    VADD.U16 d0,d1,d0               @ Accumulating SAD step 1
294    VMOV.I32 d7,#84                 @ Constant used for multiplication
295    VMULL.S16 q7,d23,d6[0]          @ q7  = 55*C2 + 55*C4
296    VMOV.I32 d4,#74                 @ Constant used for multiplication
297    VMULL.S16 q9,d22,d7[0]          @ q9  = 84*C1 + 84*C4
298    VADD.S16 d16,d10,d11            @ d16 = C1 + C2
299    VMUL.S16 d12,d12,d4[0]          @ d12 = 74*C3
300    VMOV.I32 d5,#29                 @ Constant used for multiplication
301    VPADDL.U16 d0,d0                @ Accumulating SAD step 2
302    VSUB.S16 d16,d16,d13            @ d16 = C1 + C2 - C4
303    VMLAL.S16 q7,d22,d5[0]          @ q7  = 29*C1 + 55*C2 + 84*C4
304    VMLSL.S16 q9,d23,d5[0]          @ q9  = 84*C1 - 29*C2 + 55*C4
305    VMULL.S16 q8,d16,d4[0]          @ q8  = 74*C1 + 74*C2 - 74*C4
306    VPADDL.U32 d0,d0                @ Accumulating SAD step 3, SAD in d0
307    VSUB.S32 q10,q9,q7              @ q10 = q9 - q7 = 55*C1 - 84*C2 - 29*C4
308    VMOV.32 r0,d0[0]                @ Return SAD value
309    VRSHR.S32 q8,q8,#1              @ Truncating the 1 bit in q8
310
311    VADDW.S16 q7,q7,d12             @ q7  = 29*C1 + 55*C2 + 74*C3 + 84*C4
312    VSUBW.S16 q9,q9,d12             @ q9  = 84*C1 - 29*C2 - 74*C3 + 55*C4
313    VADDW.S16 q10,q10,d12           @ q10 = 55*C1 - 84*C2 + 74*C3 - 29*C4
314
315    VRSHR.S32 q7,q7,#1              @ Truncating the 1 bit in q7
316    VRSHR.S32 q9,q9,#1              @ Truncating the 1 bit in q9
317    VRSHR.S32 q10,q10,#1            @ Truncating the 1 bit in q10
318                                    @ Transform stage 1 is in P1:q7, P2:q8, P3:q9 and P4:q10
319    VTRN.32 q7,q8
320    VTRN.32 q9,q10
321    VSWP d15,d18
322    VSWP d17,d20                    @ Transform stage 1 block transposed
323                                    @ Corresponding columns are in S1:q7, S2:q8, S3:q9 and S4:q10
324    VADD.S32 q13,q7,q8              @ q13 = S1 + S2
325    VADD.S32 q1,q7,q10              @ q1 = S1 + S4
326    VADD.S32 q4,q8,q10              @ q4 = S2 + S4
327    VSUB.S32 q13,q13,q10            @ q13 = S1 + S2 - S4
328    VMUL.S32 q12,q1,d5[0]           @ q12 = 29*S1 + 29*S4
329    VMUL.S32 q14,q1,d7[0]           @ q14 = 84*S1 + 84*S4
330    VMUL.S32 q13,q13,d4[0]          @ q13 = 74*S1 + 74*S2 - 74*S4
331
332    VMLA.S32 q12,q4,d6[0]           @ q12 = 29*S1 + 55*S2 + 84*S4
333    VMLS.S32 q14,q4,d5[0]           @ q14 = 84*S1 - 29*S2 + 55*S4
334    VMUL.S32 q9,q9,d4[0]            @ q9 = 74*S3
335
336    LDR r4,[sp,#76]                 @ r4 = dst_strd
337    LSL r4,r4,#1                    @ r4 = 2*dst_strd
338
339    VRSHRN.S32 d26,q13,#8
340    VSUB.S32 q15,q14,q12            @ q15 = q14 - q12 = 55*S1 - 84*S2 - 29*S4
341
342    VADD.S32 q12,q12,q9             @ q12 = 29*S1 + 55*S2 + 74*S3 + 84*S4
343    VSUB.S32 q14,q14,q9             @ q14 = 84*S1 - 29*S2 - 74*S3 + 55*S4
344    VADD.S32 q15,q15,q9             @ q15 = 55*S1 - 84*S2 + 74*S3 - 29*S4
345
346    VRSHRN.S32 d24,q12,#8
347    VRSHRN.S32 d28,q14,#8
348    VRSHRN.S32 d30,q15,#8           @ Truncating the last 8 bits
349                                    @ Transform stage 2 is in U1:d24, U2:d26, U3:d28 and U4:d30
350    VST1.64 d24,[r3],r4             @ Storing row 1 of transform stage 2
351    VST1.64 d26,[r3],r4             @ Storing row 2 of transform stage 2
352    VST1.64 d28,[r3],r4             @ Storing row 3 of transform stage 2
353    VST1.64 d30,[r3]                @ Storing row 4 of transform stage 2
354
355    vpop {d8 - d15}
356    POP {r4}
357    MOV pc,lr
358
359@/**
360@*******************************************************************************
361@*
362@* @brief
363@*  This function performs residue calculation and DCT integer forward transform
364@*  on 8x8 block
365@*
366@* @description
367@*  Performs residue calculation by subtracting the prediction from the
368@*  source, followed by the DCT integer forward transform
369@*
370@* @param[in] pu1_src
371@*  Input 8x8 pixels
372@*
373@* @param[in] pu1_pred
374@*  Prediction data
375@*
376@* @param[in] pi2_tmp
377@*  Temporary buffer of size 8x8
378@*
379@* @param[out] pi2_dst
380@*  Output 8x8 coefficients
381@*
382@* @param[in] src_strd
383@*  Input stride
384@*
385@* @param[in] pred_strd
386@*  Prediction Stride
387@*
388@* @param[in] dst_strd
389@*  Output Stride
390@*
391@* @param[in] chr_plane
392@*  Chroma plane
393@*
394@* @returns void
395@*
396@* @remarks
397@*  None
398@*
399@*******************************************************************************
400@*/
401@ UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
402@                              UWORD8 *pu1_pred,
403@                              WORD32 *pi4_temp,
404@                              WORD16 *pi2_dst,
405@                              WORD32 src_strd,
406@                              WORD32 pred_strd,
407@                              WORD32 dst_strd,
408@                              WORD32 chroma_plane);
409@
410@**************Variables Vs Registers*******************************************
411@
412@ r0 - pu1_src
413@ r1 - pu1_pred
414@ r2 - pi4_temp
415@ r3 - pi2_dst
416@
417@ [sp]   - src_strd
418@ [sp+4] - pred_strd
419@ [sp+8] - dst_strd
420@ [sp+12] - chroma_plane
421@
422@*******************************************************************************
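
@ Reference sketch (not from the original source): the 8-point butterfly
@ evaluated by both passes below, as a minimal C-style helper with
@ illustrative names.  In the code, pass 1 keeps full precision and pass 2
@ applies the rounded shift per output.
@
@ static void dct8_1d_sketch(const WORD32 x[8], WORD32 y[8])
@ {
@     WORD32 e[4], o[4], ee[2], eo[2], k;
@     for(k = 0; k < 4; k++)
@     {
@         e[k] = x[k] + x[7 - k];
@         o[k] = x[k] - x[7 - k];
@     }
@     ee[0] = e[0] + e[3];  eo[0] = e[0] - e[3];
@     ee[1] = e[1] + e[2];  eo[1] = e[1] - e[2];
@     y[0] = 64 * (ee[0] + ee[1]);
@     y[4] = 64 * (ee[0] - ee[1]);
@     y[2] = 83 * eo[0] + 36 * eo[1];
@     y[6] = 36 * eo[0] - 83 * eo[1];
@     y[1] = 89 * o[0] + 75 * o[1] + 50 * o[2] + 18 * o[3];
@     y[3] = 75 * o[0] - 18 * o[1] - 89 * o[2] - 50 * o[3];
@     y[5] = 50 * o[0] - 89 * o[1] + 18 * o[2] + 75 * o[3];
@     y[7] = 18 * o[0] - 50 * o[1] + 75 * o[2] - 89 * o[3];
@ }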
423
424    .global ihevc_resi_trans_8x8_a9q
425
426ihevc_resi_trans_8x8_a9q:
427
428    PUSH {r4,r5}
429    vpush {d8 - d15}
430
431    @ Loading Prediction and Source blocks of size 8x8
432
433    LDR r4,[sp,#84]                 @ r4 = chroma flag
434
435    CMP r4,#-1                      @ NULL PLANE
436    BEQ LUMA_LOAD
437
438    CMP r4,#1                       @ V PLANE
439    BEQ CHROMA_V_LOAD
440                                    @ handling U PLANE
441    LDR r5,[sp,#72]                 @ r5 = src_strd
442    LDR r4,[sp,#76]                 @ r4 = pred_strd
443
444    VLD2.8 {d0,d2},[r1],r4          @ Row 1 of prediction in d0
445    VLD2.8 {d1,d3},[r0],r5          @ Row 1 of source in d1
446
447    VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
448    VLD2.8 {d2,d4},[r1],r4          @ Row 2 of prediction in d2
449    VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
450    VLD2.8 {d3,d5},[r0],r5          @ Row 2 of source in d3
451
452    VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
453    VLD2.8 {d4,d6},[r1],r4          @ Row 3 of prediction in d4
454    VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
455    VLD2.8 {d5,d7},[r0],r5          @ Row 3 of source in d5
456
457    VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
458    VLD2.8 {d6,d8},[r1],r4          @ Row 4 of prediction in d6
459    VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
460    VLD2.8 {d7,d9},[r0],r5          @ Row 4 of source in d7
461
462    VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
463    VLD2.8 {d8,d10},[r1],r4         @ Row 5 of prediction in d8
464    VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
465    VLD2.8 {d9,d11},[r0],r5         @ Row 5 of source in d9
466
467    VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
468    VLD2.8 {d10,d12},[r1],r4        @ Row 6 of prediction in d10
469    VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
470    VLD2.8 {d11,d13},[r0],r5        @ Row 6 of source in d11
471
472    VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
473    VLD2.8 {d12,d14},[r1],r4        @ Row 7 of prediction in d12
474    VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
475    VLD2.8 {d13,d15},[r0],r5        @ Row 7 of source in d13
476
477    VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
478    VLD2.8 {d14,d16},[r1]           @ Row 8 of prediction in d14
479    VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
480    VLD2.8 {d15,d17},[r0]           @ Row 8 of source in d15
481
482    B LUMA_LOAD_END
483
484CHROMA_V_LOAD:
485    LDR r5,[sp,#72]                 @ r5 = src_strd
486    LDR r4,[sp,#76]                 @ r4 = pred_strd
487
488    VLD2.8 {d0,d2},[r1],r4          @ Row 1 of prediction in d2
489    VLD2.8 {d1,d3},[r0],r5          @ Row 1 of source in d3
490
491    VABDL.U8 q15,d3,d2              @ Row 1 of absolute difference in q15
492    VLD2.8 {d4,d6},[r1],r4          @ Row 2 of prediction in d6
493    VSUBL.U8 q0,d3,d2               @ Row 1 of residue in q0
494    VLD2.8 {d5,d7},[r0],r5          @ Row 2 of source in d7
495
496    VABDL.U8 q9,d7,d6               @ Row 2 of absolute difference in q9
497    VLD2.8 {d8,d10},[r1],r4         @ Row 3 of prediction in d10
498    VSUBL.U8 q1,d7,d6               @ Row 2 of residue in q1
499    VLD2.8 {d9,d11},[r0],r5         @ Row 3 of source in d11
500
501    VABAL.U8 q15,d11,d10            @ Row 3 of absolute difference accumulated in q15
502    VLD2.8 {d6,d8},[r1],r4          @ Row 4 of prediction in d8
503    VSUBL.U8 q2,d11,d10             @ Row 3 of residue in q2
504    VLD2.8 {d7,d9},[r0],r5          @ Row 4 of source in d9
505
506    VABAL.U8 q9,d9,d8               @ Row 4 of absolute difference accumulated in q9
507    VLD2.8 {d10,d12},[r1],r4        @ Row 5 of prediction in d12
508    VSUBL.U8 q3,d9,d8               @ Row 4 of residue in q3
509    VLD2.8 {d11,d13},[r0],r5        @ Row 5 of source in d13
510
511    VABDL.U8 q10,d13,d12            @ Row 5 of absolute difference in q10
512    VLD2.8 {d14,d16},[r1],r4        @ Row 6 of prediction in d16
513    VSUBL.U8 q4,d13,d12             @ Row 5 of residue in q4
514    VLD2.8 {d15,d17},[r0],r5        @ Row 6 of source in d17
515
516    VABAL.U8 q15,d17,d16            @ Row 6 of absolute difference accumulated in q15
517    VLD2.8 {d12,d14},[r1],r4        @ Row 7 of prediction in d14
518    VSUBL.U8 q5,d17,d16             @ Row 6 of residue in q5
519    VLD2.8 {d13,d15},[r0],r5        @ Row 7 of source in d15
520
521    VABAL.U8 q9,d15,d14             @ Row 7 of absolute difference accumulated in q9
522    VSUBL.U8 q6,d15,d14             @ Row 7 of residue in q6
523
524    VLD2.8 {d14,d16},[r1]           @ Row 8 of prediction in d14
525    VLD2.8 {d15,d17},[r0]           @ Row 8 of source in d15
526    VSWP.8 d14,d16
527    VSWP.8 d15,d17
528
529    B LUMA_LOAD_END
530
531LUMA_LOAD:
532
533    LDR r5,[sp,#72]                 @ r5 = src_strd
534    LDR r4,[sp,#76]                 @ r4 = pred_strd
535
536    VLD1.64 d0,[r1],r4              @ Row 1 of prediction in d0
537    VLD1.64 d1,[r0],r5              @ Row 1 of source in d1
538
539    VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
540    VLD1.64 d2,[r1],r4              @ Row 2 of prediction in d2
541    VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
542    VLD1.64 d3,[r0],r5              @ Row 2 of source in d3
543
544    VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
545    VLD1.64 d4,[r1],r4              @ Row 3 of prediction in d4
546    VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
547    VLD1.64 d5,[r0],r5              @ Row 3 of source in d5
548
549    VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
550    VLD1.64 d6,[r1],r4              @ Row 4 of prediction in d6
551    VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
552    VLD1.64 d7,[r0],r5              @ Row 4 of source in d7
553
554    VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
555    VLD1.64 d8,[r1],r4              @ Row 5 of prediction in d8
556    VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
557    VLD1.64 d9,[r0],r5              @ Row 5 of source in d9
558
559    VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
560    VLD1.64 d10,[r1],r4             @ Row 6 of prediction in d10
561    VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
562    VLD1.64 d11,[r0],r5             @ Row 6 of source in d11
563
564    VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
565    VLD1.64 d12,[r1],r4             @ Row 7 of prediction in d12
566    VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
567    VLD1.64 d13,[r0],r5             @ Row 7 of source in d13
568
569    VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
570    VLD1.64 d14,[r1]                @ Row 8 of prediction in d14
571    VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
572    VLD1.64 d15,[r0]                @ Row 8 of source in d15
573
574LUMA_LOAD_END:
575
576    @ Transform stage 1
577    @ Transposing residue matrix
578
579    VABAL.U8 q10,d15,d14            @ Row 8 of absolute difference accumulated in q10
580    VTRN.16 q0,q1                   @ Transpose residue matrix step (1a)
581    VSUBL.U8 q7,d15,d14             @ Row 8 of residue in q7
582    VTRN.16 q2,q3                   @ Transpose residue matrix step (1b)
583
584    VTRN.16 q4,q5                   @ Transpose residue matrix step (1c)
585    VTRN.16 q6,q7                   @ Transpose residue matrix step (1d)
586    VTRN.32 q0,q2                   @ Transpose residue matrix step (2a)
587    VTRN.32 q1,q3                   @ Transpose residue matrix step (2b)
588
589    VADD.U16 q8,q15,q9              @ SAD calculation (1)
590    VTRN.32 q4,q6                   @ Transpose residue matrix step (2c)
591    VTRN.32 q5,q7                   @ Transpose residue matrix step (2d)
592
593    VADD.U16 q8,q8,q10              @ SAD calculation (2)
594    VSWP d1,d8                      @ Transpose residue matrix step (3a)
595    VSWP d3,d10                     @ Transpose residue matrix step (3b)
596
597    VADD.U16 d16,d16,d17            @ SAD calculation (3)
598    VSWP d7,d14                     @ Transpose residue matrix step (3c)
599    VSWP d5,d12                     @ Transpose residue matrix step (3d)
600                                    @ Columns of residue C0-C7 (8x8 matrix) in q0-q7
601    VPADDL.U16 d16,d16              @ SAD calculation (4)
602
603    @ Evaluating first step in Butterfly diagram
604
605    VADD.S16 q10,q0,q7              @ q10 = C0 + C7
606    VADD.S16 q11,q1,q6              @ q11 = C1 + C6
607    VPADDL.U32 d16,d16              @ SAD calculation (5)
608    VADD.S16 q12,q2,q5              @ q12 = C2 + C5
609    VADD.S16 q13,q3,q4              @ q13 = C3 + C4
610
611    VSUB.S16 q4,q3,q4               @ q4  = C3 - C4
612    VSUB.S16 q5,q2,q5               @ q5  = C2 - C5
613    VSUB.S16 q6,q1,q6               @ q6  = C1 - C6
614    VSUB.S16 q7,q0,q7               @ q7  = C0 - C7
615
616    @ Calculating F0, F2, F4 and F6
617
618    VADD.S16 q1,q11,q12             @ q1  = C1 + C2 + C5 + C6
619    VADD.S16 q2,q10,q13             @ q2  = C0 + C3 + C4 + C7
620
621    MOV r4,#50
622    LSL r4,r4,#16
623    ADD r4,r4,#18
624    MOV r5,#89
625    LSL r5,r5,#16
626    ADD r5,r5,#75
627    VMOV d0,r4,r5                   @ 16-bit aligned, d0[3] = 89, d0[2] = 75, d0[1] = 50, d0[0]=18
628
629    MOV r4,#83
630    LSL r4,r4,#16
631    ADD r4,r4,#36
632    VMOV d1,r4,r4                   @ 16-bit aligned, d1[3] = 83, d1[2] = 36, d1[1] = 83, d1[0]=36
633
634    VSUB.S16 q10,q10,q13            @ q10 = C0 - C3 - C4 + C7
635    VSUB.S16 q11,q11,q12            @ q11 = C1 - C2 - C5 + C6
636    VMOV.32 r0,d16[0]               @ SAD calculation (6) : Return value = SAD
637
638    VSUB.S16 q3,q2,q1               @ q3 = C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7
639    VADD.S16 q2,q2,q1               @ q2 = C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7
640
641    VMULL.S16 q14,d20,d1[1]         @ q14 = [0] of 83*(C0 - C3 - C4 + C7)
642    VMULL.S16 q15,d21,d1[1]         @ q15 = [1] of 83*(C0 - C3 - C4 + C7)
643    VMULL.S16 q9,d20,d1[0]          @ q9  = [0] of 36*(C0 - C3 - C4 + C7)
644    VMULL.S16 q10,d21,d1[0]         @ q10 = [1] of 36*(C0 - C3 - C4 + C7)
645
646    VMLAL.S16 q14,d22,d1[0]         @ q14 = F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
647    VSHLL.S16 q13,d6,#6             @ q13 = F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
648    VMLAL.S16 q15,d23,d1[0]         @ q15 = F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
649    VSHLL.S16 q3,d7,#6              @ q3  = F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
650    VMLSL.S16 q9,d22,d1[1]          @ q9  = F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
651    VSHLL.S16 q12,d4,#6             @ q12 = F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
652    VMLSL.S16 q10,d23,d1[1]         @ q10 = F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
653    VSHLL.S16 q2,d5,#6              @ q2  = F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
654
655    @ Calculating F1, F3, F5 and F7
656
657    MOV r4,#48
658    VST1.64 {d24,d25},[r2]!         @ Row 1 of transform stage 1 F0[0] stored
659    VST1.64 {d4,d5},[r2],r4         @ Row 1 of transform stage 1 F0[1] stored
660    VST1.64 {d28,d29},[r2]!         @ Row 3 of transform stage 1 F2[0] stored
661    VST1.64 {d30,d31},[r2],r4       @ Row 3 of transform stage 1 F2[1] stored
662
663    VST1.64 {d26,d27},[r2]!         @ Row 5 of transform stage 1 F4[0] stored
664    VMULL.S16 q1,d14,d0[3]          @ q1  = [0] of 89*(C0 - C7)
665    VMULL.S16 q8,d15,d0[3]          @ q8  = [1] of 89*(C0 - C7)
666    VST1.64 {d6,d7},[r2],r4         @ Row 5 of transform stage 1 F4[1] stored
667    VMULL.S16 q11,d14,d0[2]         @ q11 = [0] of 75*(C0 - C7)
668    VMULL.S16 q13,d15,d0[2]         @ q13 = [1] of 75*(C0 - C7)
669    VST1.64 {d18,d19},[r2]!         @ Row 7 of transform stage 1 F6[0] stored
670    VMULL.S16 q3,d14,d0[1]          @ q3  = [0] of 50*(C0 - C7)
671    VMULL.S16 q9,d15,d0[1]          @ q9  = [1] of 50*(C0 - C7)
672    VST1.64 {d20,d21},[r2]          @ Row 7 of transform stage 1 F6[1] stored
673    VMULL.S16 q10,d14,d0[0]         @ q10 = [0] of 18*(C0 - C7)
674    VMULL.S16 q7,d15,d0[0]          @ q7  = [1] of 18*(C0 - C7)
675
676    VMLAL.S16 q1,d12,d0[2]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6)
677    VMLAL.S16 q8,d13,d0[2]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6)
678    VMLSL.S16 q11,d12,d0[0]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6)
679    VMLSL.S16 q13,d13,d0[0]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6)
680    VMLSL.S16 q3,d12,d0[3]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6)
681    VMLSL.S16 q9,d13,d0[3]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6)
682    VMLSL.S16 q10,d12,d0[1]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6)
683    VMLSL.S16 q7,d13,d0[1]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6)
684
685    VMLAL.S16 q1,d10,d0[1]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
686    VMLAL.S16 q8,d11,d0[1]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
687    VMLSL.S16 q11,d10,d0[3]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
688    VMLSL.S16 q13,d11,d0[3]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
689    VMLAL.S16 q3,d10,d0[0]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
690    VMLAL.S16 q9,d11,d0[0]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
691    VMLAL.S16 q10,d10,d0[2]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
692    VMLAL.S16 q7,d11,d0[2]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
693
694    VMLAL.S16 q1,d8,d0[0]           @ q1  = F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
695    VMLAL.S16 q8,d9,d0[0]           @ q8  = F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
696    VMLSL.S16 q11,d8,d0[1]          @ q11 = F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
697    VMLSL.S16 q13,d9,d0[1]          @ q13 = F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
698    SUB r2,r2,#176                  @ r2 now points to the second row
699    VMLAL.S16 q3,d8,d0[2]           @ q3  = F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
700    VMLAL.S16 q9,d9,d0[2]           @ q9  = F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
701    VST1.64 {d2,d3},[r2]!           @ Row 2 of transform stage 1 F1[0] stored
702    VMLSL.S16 q10,d8,d0[3]          @ q10 = F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
703    VMLSL.S16 q7,d9,d0[3]           @ q7  = F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
704
705    VST1.64 {d16,d17},[r2],r4       @ Row 2 of transform stage 1 F1[1] stored
706    VST1.64 {d22,d23},[r2]!         @ Row 4 of transform stage 1 F3[0] stored
707    VST1.64 {d26,d27},[r2],r4       @ Row 4 of transform stage 1 F3[1] stored
708    VST1.64 {d6,d7},[r2]!           @ Row 6 of transform stage 1 F5[0] stored
709    VST1.64 {d18,d19},[r2],r4       @ Row 6 of transform stage 1 F5[1] stored
710    VST1.64 {d20,d21},[r2]!         @ Row 8 of transform stage 1 F7[0] stored
711    VST1.64 {d14,d15},[r2]          @ Row 8 of transform stage 1 F7[1] stored
712
713    @ Transform stage 2 (for rows 1-4 of transform stage 1)
714    @ Transposing the 4 rows (F0, F1, F2, F3)
715    @ F0 = {q2,q12},  F1 = {q8,q1}, F2 = {q15,q14} and F3 = {q13,q11}
716
717    VTRN.32 q12,q1                  @ Transposing first half of transform stage 1 (1a)
718    VTRN.32 q14,q11                 @ Transposing first half of transform stage 1 (1b)
719    VSWP d25,d28                    @ Transposing first half of transform stage 1 (2a)
720    VSWP d22,d3                     @ Transposing first half of transform stage 1 (2b)
721
722    VTRN.32 q2,q8                   @ Transposing first half of transform stage 1 (3a)
723    VTRN.32 q15,q13                 @ Transposing first half of transform stage 1 (3b)
724    VSWP d5,d30                     @ Transposing first half of transform stage 1 (4a)
725    VSWP d26,d17                    @ Transposing first half of transform stage 1 (4b)
726                                    @ B0:q12, B1:q1, B2:q14, B3:q11, B4:q2, B5:q8, B6:q15 and B7:q13
727
728    @ Evaluating first step in Butterfly diagram
729
730    VADD.S32 q0,q12,q13             @ q0  = B0 + B7
731    VADD.S32 q5,q11,q2              @ q5  = B3 + B4
732    VADD.S32 q3,q1,q15              @ q3  = B1 + B6
733    VADD.S32 q4,q14,q8              @ q4  = B2 + B5
734
735    VSUB.S32 q7,q14,q8              @ q7  = B2 - B5
736    VSUB.S32 q8,q1,q15              @ q8  = B1 - B6
737    VSUB.S32 q6,q11,q2              @ q6  = B3 - B4
738    VSUB.S32 q9,q12,q13             @ q9  = B0 - B7
739
740    @ Calculating G0, G2, G4 and G6
741
742    MOV r4,#18
743    MOV r5,#50
744    VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
745    VSUB.S32 q2,q0,q5               @ q2  = B0 - B3 - B4 + B7
746
747    MOV r4,#75
748    MOV r5,#89
749    VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
750    VADD.S32 q10,q0,q5              @ q10 = B0 + B3 + B4 + B7
751
752    MOV r4,#36
753    MOV r5,#83
754    VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
755    VSUB.S32 q11,q3,q4              @ q11 = B1 - B2 - B5 + B6
756    VADD.S32 q3,q3,q4               @ q3  = B1 + B2 + B5 + B6
757
758    VMUL.S32 q12,q2,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
759    VMUL.S32 q2,q2,d0[0]            @ q2  = 36*(B0 - B3 - B4 + B7)
760    VMUL.S32 q5,q9,d3[1]            @ q5 = 89*(B0 - B7)
761    VADD.S32 q14,q10,q3             @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
762    VMUL.S32 q4,q9,d3[0]            @ q4 = 75*(B0 - B7)
763    VSUB.S32 q15,q10,q3             @ q15 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
764@    VSHL.S32 q14,q14,#6             ; q14 = G0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
765@    VSHL.S32 q15,q15,#6             ; q15 = G4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
766
767    VMLA.S32 q12,q11,d0[0]          @ q12 = G2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
768    VRSHRN.I32 d28,q14,#5           @ Rounding G0: the <<6 above is skipped, so >>5 stands in for the usual >>11
769    VMLS.S32 q2,q11,d0[1]           @ q2  = G6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
770    VRSHRN.I32 d30,q15,#5           @ Rounding G4: the <<6 above is skipped, so >>5 stands in for the usual >>11
771
772    LDR r4,[sp,#80]                 @ r4 = dst_strd
773    LSL r4,r4,#2                    @ r4 = 2*dst_strd*2
774
775    VMUL.S32 q3,q9,d2[1]            @ q3 = 50*(B0 - B7)
776    VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in G2
777    VMUL.S32 q9,q9,d2[0]            @ q9 = 18*(B0 - B7)
778    VRSHRN.I32 d4,q2,#11            @ Truncating last 11 bits in G6
779
780    VMLA.S32 q5,q8,d3[0]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6)
781    VST1.64 d28,[r3],r4             @ First half-row of row 1 of transform stage 2 (G0) stored
782    VMLS.S32 q4,q8,d2[0]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6)
783
784    VMLS.S32 q3,q8,d3[1]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6)
785    VST1.64 d24,[r3],r4             @ First half-row of row 3 of transform stage 2 (G2) stored
786    VMLS.S32 q9,q8,d2[1]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6)
787
788    VMLA.S32 q5,q7,d2[1]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
789    VST1.64 d30,[r3],r4             @ First half-row of row 5 of transform stage 2 (G4) stored
790    VMLS.S32 q4,q7,d3[1]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
791
792    VMLA.S32 q3,q7,d2[0]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
793    VST1.64 d4,[r3]                 @ First half-row of row 7 of transform stage 2 (G6) stored
794    VMLA.S32 q9,q7,d3[0]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
795
796    VMLA.S32 q5,q6,d2[0]            @ q5 = G1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
797    VMLS.S32 q4,q6,d2[1]            @ q4 = G3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
798    VMLA.S32 q3,q6,d3[0]            @ q3 = G5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
799    VMLS.S32 q9,q6,d3[1]            @ q9 = G7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
800
801    SUB r3,r3,r4,LSL #1
802    SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd*2
803                                    @ r3 is moved from row 7 to row 2
804    VRSHRN.I32 d10,q5,#11           @ Truncating last 11 bits in G1
805    VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in G3
806    VRSHRN.I32 d6,q3,#11            @ Truncating last 11 bits in G5
807    VST1.64 d10,[r3],r4             @ First half-row of row 2 of transform stage 2 (G1) stored
808    VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in G7
809
810    VST1.64 d8,[r3],r4              @ First half-row of row 4 of transform stage 2 (G3) stored
811    VST1.64 d6,[r3],r4              @ First half-row of row 6 of transform stage 2 (G5) stored
812    VST1.64 d18,[r3]!               @ First half-row of row 8 of transform stage 2 (G7) stored
813
814    @ Transform stage 2 (for rows 5-8 of transform stage 1)
815    @ Loading the 4 rows (F4, F5, F6, F7)
816
817    SUB r2,r2,#112                  @ r2 jumps from row 8 to row 5 in temporary memory
818    VLD1.64 {d20,d21},[r2]!         @ q10 = F4[0]
819    VLD1.64 {d22,d23},[r2]!         @ q11 = F4[1]
820    VLD1.64 {d8,d9},[r2]!           @ q4  = F5[0]
821    @ Transposing the 4 rows
822    @ F4 = {q11,q10}, F5 = {q5,q4}, F6 = {q3,q2} and F7 = {q13,q12}
823
824    VTRN.32 q10,q4                  @ Transposing second half of transform stage 1 (1a)
825    VLD1.64 {d10,d11},[r2]!         @ q5  = F5[1]
826    VLD1.64 {d4,d5},[r2]!           @ q2  = F6[0]
827    VLD1.64 {d6,d7},[r2]!           @ q3  = F6[1]
828    VLD1.64 {d24,d25},[r2]!         @ q12 = F7[0]
829    VTRN.32 q2,q12                  @ Transposing second half of transform stage 1 (1b)
830    VLD1.64 {d26,d27},[r2]          @ q13 = F7[1]
831
832    VSWP d21,d4                     @ Transposing second half of transform stage 1 (2a)
833    VSWP d24,d9                     @ Transposing second half of transform stage 1 (2b)
834
835    VTRN.32 q11,q5                  @ Transposing second half of transform stage 1 (3a)
836    VTRN.32 q3,q13                  @ Transposing second half of transform stage 1 (3b)
837    VSWP d26,d11                    @ Transposing second half of transform stage 1 (4b)
838    VSWP d23,d6                     @ Transposing second half of transform stage 1 (4a)
839                                    @ B0:q10, B1:q4, B2:q2, B3:q12, B4:q11, B5:q5, B6:q3 and B7:q13
840
841    @ Evaluating first step in Butterfly diagram
842
843    VADD.S32 q0,q10,q13             @ q0  = B0 + B7
844    VADD.S32 q15,q12,q11            @ q15 = B3 + B4
845    VADD.S32 q1,q4,q3               @ q1  = B1 + B6
846    VADD.S32 q14,q2,q5              @ q14 = B2 + B5
847
848    VSUB.S32 q9,q10,q13             @ q9  = B0 - B7
849    VSUB.S32 q6,q12,q11             @ q6  = B3 - B4
850    VSUB.S32 q7,q2,q5               @ q7  = B2 - B5
851    VSUB.S32 q8,q4,q3               @ q8  = B1 - B6
852
853    @ Calculating H0, H2, H4 and H6
854
855    VADD.S32 q3,q1,q14              @ q3 = B1 + B2 + B5 + B6
856    VSUB.S32 q5,q1,q14              @ q5 = B1 - B2 - B5 + B6
857
858    MOV r4,#18
859    MOV r5,#50
860    VSUB.S32 q4,q0,q15              @ q4 = B0 - B3 - B4 + B7
861    VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
862
863    MOV r4,#75
864    MOV r5,#89
865    VADD.S32 q2,q0,q15              @ q2 = B0 + B3 + B4 + B7
866    VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
867
868    MOV r4,#36
869    MOV r5,#83
870
871    @ Calculating H1, H3, H5 and H7
872
873    VMUL.S32 q10,q9,d3[1]           @ q10 = 89*(B0 - B7)
874    VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
875
876    VMUL.S32 q13,q9,d3[0]           @ q13 = 75*(B0 - B7)
877
878    VMUL.S32 q12,q4,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
879    VADD.S32 q14,q2,q3              @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
880    VMUL.S32 q4,q4,d0[0]            @ q4  = 36*(B0 - B3 - B4 + B7)
881    VSUB.S32 q2,q2,q3               @ q2  = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
882
883
884    VMLA.S32 q12,q5,d0[0]           @ q12 = H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
885@    VSHL.S32 q14,q14,#6             ; q14 = H0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
886    VMLS.S32 q4,q5,d0[1]            @ q4 = H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
887@    VSHL.S32 q2,q15,#6              ; q2 = H4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
888
889    VMUL.S32 q11,q9,d2[1]           @ q11 = 50*(B0 - B7)
890    VRSHRN.I32 d28,q14,#5           @ Rounding H0: the <<6 is skipped, so >>5 stands in for the usual >>11
891    VMUL.S32 q9,q9,d2[0]            @ q9  = 18*(B0 - B7)
892    VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in H2
893
894    VMLA.S32 q10,q8,d3[0]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6)
895    VRSHRN.I32 d4,q2,#5             @ Rounding H4: the <<6 is skipped, so >>5 stands in for the usual >>11
896    VMLS.S32 q13,q8,d2[0]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6)
897    VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in H6
898
899    LDR r4,[sp,#80]                 @ r4 = dst_strd
900    LSL r4,r4,#2                    @ r4 = 2*dst_strd*2
901
902    SUB r3,r3,r4,LSL #2
903    ADD r3,r3,r4,ASR #1             @ r3 = r3 - 7*dst_strd*2
904                                    @ r3 is moved from row 8 to row 1
905    VMLS.S32 q11,q8,d3[1]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6)
906    VST1.64 d28,[r3],r4             @ Second half-row of row 1 of transform stage 2 (H0) stored
907    VMLS.S32 q9,q8,d2[1]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6)
908
909    VMLA.S32 q10,q7,d2[1]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
910    VST1.64 d24,[r3],r4             @ Second half-row of row 3 of transform stage 2 (H2) stored
911    VMLS.S32 q13,q7,d3[1]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
912
913    VMLA.S32 q11,q7,d2[0]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
914    VST1.64 d4,[r3],r4              @ Second half-row of row 5 of transform stage 2 (H4) stored
915    VMLA.S32 q9,q7,d3[0]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
916
917    VMLA.S32 q10,q6,d2[0]           @ q10 = H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
918    VST1.64 d8,[r3]                 @ Second half-row of row 7 of transform stage 2 (H6) stored
919    VMLS.S32 q13,q6,d2[1]           @ q13 = H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
920
921    VMLA.S32 q11,q6,d3[0]           @ q11 = H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
922    VMLS.S32 q9,q6,d3[1]            @ q9  = H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
923
924    SUB r3,r3,r4,LSL #1
925    SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd*2
926                                    @ r3 is moved from row 7 to row 2
927    VRSHRN.I32 d20,q10,#11          @ Truncating last 11 bits in H1
928    VRSHRN.I32 d26,q13,#11          @ Truncating last 11 bits in H3
929    VRSHRN.I32 d22,q11,#11          @ Truncating last 11 bits in H5
930    VST1.64 d20,[r3],r4             @ Second half-row of row 2 of transform stage 2 (H1) stored
931    VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in H7
932
933    VST1.64 d26,[r3],r4             @ Second half-row of row 4 of transform stage 2 (H3) stored
934    VST1.64 d22,[r3],r4             @ Second half-row of row 6 of transform stage 2 (H5) stored
935    VST1.64 d18,[r3]                @ Second half-row of row 8 of transform stage 2 (H7) stored
936
937    vpop {d8 - d15}
938    POP {r4,r5}
939    MOV pc,lr
940
941@/**
942@*******************************************************************************
943@*
944@* @brief
945@*  This function performs residue calculation and forward transform on
946@*  input pixels
947@*
948@* @par Description:
949@*  Performs residue calculation by subtracting the prediction from the
950@*  source, followed by the forward transform
951@*
952@* @param[in] pu1_src
953@*  Input 16x16 pixels
954@*
955@* @param[in] pu1_pred
956@*  Prediction data
957@*
958@* @param[in] pi2_tmp
959@*  Temporary buffer of size 16x16
960@*
961@* @param[out] pi2_dst
962@*  Output 16x16 coefficients
963@*
964@* @param[in] src_strd
965@*  Input stride
966@*
967@* @param[in] pred_strd
968@*  Prediction Stride
969@*
970@* @param[in] dst_strd
971@*  Output Stride
972@*
973@* @param[in] chr_plane
974@*  Chroma plane
975@*
976@* @returns  Void
977@*
978@* @remarks
979@*  None
980@*
981@*******************************************************************************
982@*/
983
984.extern g_ai2_ihevc_trans_16
985.extern g_ai4_ihevc_trans_16
986
987g_ai2_ihevc_trans_16_addr_1:
988.long g_ai2_ihevc_trans_16 - ulbl1 - 8
989
990g_ai2_ihevc_trans_16_addr_2:
991.long g_ai2_ihevc_trans_16 - ulbl2 - 8
992
993g_ai4_ihevc_trans_16_addr:
994.long g_ai4_ihevc_trans_16 - ulbl3 - 8
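
@ Note: the '- ulblN - 8' terms make these literals PC-relative; in ARM state
@ PC reads as the current instruction address plus 8, so the 'ADD R9, R9, PC'
@ at each ulblN label recovers the absolute address of the coefficient table
@ in a position-independent way.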
995
996    .global ihevc_resi_trans_16x16_a9q
997
998ihevc_resi_trans_16x16_a9q:
999
1000.equ TMP_STRIDE        ,  64            @16*4, Stride of tmp register
1001.equ SHIFT             ,  13            @shift = 13; // log2(iWidth) - 1 + g_uiBitIncrement
1002.equ RADD              ,  4096          @1 << (shift - 1);
1003
1004.equ COFF_STD_2B       ,  32            @Stride for g_ai2_ihevc_trans_16 in bytes
1005.equ COFF_STD_W        ,  32            @Stride for g_ai4_ihevc_trans_16 in bytes
1006
1007@ LOAD the function
1008    STMFD          SP!,{r4-r12,LR}      @stack store values of the arguments
1009    vpush          {d8 - d15}
1010    SUB            SP,SP,#32
1011
1012    LDR             R4,[SP,#136]         @get src_strd
1013    LDR             R5,[SP,#140]         @get pred_strd
1014    LDR             R6,[SP,#144]         @get dst_strd
1015    LDR             R14,[SP,#148]        @get chroma_plane
1016
1017    MOV R8,#0                           @Set loop counter
1018    LDR R9,g_ai2_ihevc_trans_16_addr_1    @get 16 bit transform matrix
1019ulbl1:
1020    ADD R9, R9, PC
1021    @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] values of g_ai2_ihevc_trans_16
1022    @and write to stack
1023    MOV R12,#COFF_STD_2B
1024    LSL R12,#2
1025
1026    VLD1.S32 D30[0],[R9],R12
1027    VLD1.S32 D30[1],[R9],R12
1028    VLD1.S32 D31[0],[R9],R12
1029    VLD1.S32 D31[1],[R9],R12
1030
1031    VTRN.S32 D30,D31
1032    VTRN.S16 D30,D31
1033    VST1.S16 {d30,d31},[SP]
1034
1035    LDR R9,g_ai2_ihevc_trans_16_addr_2      @get back 16 bit transform matrix
1036ulbl2:
1037    ADD R9, R9, PC
1038
1039    MOV R7,#TMP_STRIDE
1040
1041    VMOV.S32 Q14,#0
1042
1043@R0         pu1_src
1044@R1         pu1_pred
1045@R2         pi4_tmp
1046@R3         pi2_dst
1047@R4         src_strd
1048@R5         pred_strd
1049@R6         dst_strd
1050@R7         tmp_dst Nx4 block stride
1051@R8         loop cntr
1052@R9         g_ai2_ihevc_trans_16
1053@R10        tmp_dst Nx4 block offset
1054@R11        tmp register
1055@R12        ------
1056@R14        chroma_plane
1057@q14        shift 32 bit
1058@q15        add 32 bit
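@
@ Decomposition sketch (illustrative, one 16-sample column x[], output y[]):
@     for(k = 0; k < 8; k++) { e[k] = x[k] + x[15-k]; o[k] = x[k] - x[15-k]; }
@     for(k = 0; k < 4; k++) { ee[k] = e[k] + e[7-k]; eo[k] = e[k] - e[7-k]; }
@     eee[0] = ee[0] + ee[3];  eeo[0] = ee[0] - ee[3];
@     eee[1] = ee[1] + ee[2];  eeo[1] = ee[1] - ee[2];
@ Outputs 0 and 8 use rows [0]/[8] of g_ai2_ihevc_trans_16 with eee[0..1],
@ outputs 4 and 12 use rows [4]/[12] with eeo[0..1], outputs 2/6/10/14 use the
@ corresponding rows with eo[0..3], and the odd outputs use the odd rows with
@ o[0..7].  The loop below evaluates this for two rows of the block at a time.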
1059
1060CORE_LOOP_16X16_HORIZ:
1061
1062    CMP R14,#-1
1063    BGT INTERLEAVED_LOAD_S1
1064
1065    VLD1.U8 {d0,d1},[R0],R4             @LOAD 1-16 src row 1
1066    VLD1.U8 {d2,d3},[R1],R5             @LOAD 1-16 pred row 1
1067    VLD1.U8 {d4,d5},[R0],R4             @LOAD 1-16 src row 2
1068    VLD1.U8 {d6,d7},[R1],R5             @LOAD 1-16 pred row 2
1069    B    LOAD_DONE
1070
1071INTERLEAVED_LOAD_S1:
1072    CMP R14,#1
1073    BEQ INTERLEAVED_LOAD_S2
1074    VLD2.U8 {Q0,Q1},[R0],R4             @LOAD 1-16 src row 1
1075    VLD2.U8 {Q1,Q2},[R1],R5             @LOAD 1-16 pred row 1
1076    VLD2.U8 {Q2,Q3},[R0],R4             @LOAD 1-16 src row 2
1077    VLD2.U8 {Q3,Q4},[R1],R5             @LOAD 1-16 pred row 2
1078    B LOAD_DONE
1079
1080INTERLEAVED_LOAD_S2:
1081    VLD2.U8 {Q0,Q1},[R0],R4             @LOAD 1-16 src row 1
1082    VSWP.U8 Q0,Q1
1083    VLD2.U8 {Q1,Q2},[R1],R5             @LOAD 1-16 pred row 1
1084    VSWP.U8 Q1,Q2
1085    VLD2.U8 {Q2,Q3},[R0],R4             @LOAD 1-16 src row 2
1086    VSWP.U8 Q2,Q3
1087    VLD2.U8 {Q3,Q4},[R1],R5             @LOAD 1-16 pred row 2
1088    VSWP.U8 Q3,Q4
1089
1090LOAD_DONE:
1091
1092    VSUBL.U8 Q4,D0,D2                   @Get residue 1-8 row 1
1093    VSUBL.U8 Q5,D1,D3                   @Get residue 9-16 row 1
1094    VSUBL.U8 Q6,D4,D6                   @Get residue 1-8 row 2
1095    VSUBL.U8 Q7,D5,D7                   @Get residue 9-16 row 2
1096
1097    @Get blk sads
1098    VABDL.U8 Q15,D0,D2
1099    VABAL.U8 Q15,D1,D3
1100    VABAL.U8 Q15,D4,D6
1101    VABAL.U8 Q15,D5,D7
1102    VADDW.S16 Q14,Q14,D30
1103    VADDW.S16 Q14,Q14,D31
1104
1105    VREV64.S16 Q5,Q5                    @Rev row 1
1106    VREV64.S16 Q7,Q7                    @Rev row 2
1107    VSWP D10,D11
1108    VSWP D14,D15
1109
1110    VADD.S16 Q8 ,Q4,Q5                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 1
1111    VSUB.S16 Q9 ,Q4,Q5                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 1
1112    VADD.S16 Q10,Q6,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 2
1113    VSUB.S16 Q11,Q6,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 2
1114
1115    VREV64.S16    D24,D17               @rev e[k] k-> 4-7 row 1
1116    VREV64.S16    D25,D21               @rev e[k] k-> 4-7 row 2
1117    VMOV.S16    D17,D20
1118
1119    @arrangement OF DATA
1120    @Q8     A1 A2 A3 A4 B1 B2 B3 B4
1121    @Q12    A8 A7 A6 A5 B8 B7 B6 B5
1122
1123    VADD.S16 Q13,Q8,Q12                 @ee[k] = e[k] + e[7 - k] row 1 & 2
1124    VSUB.S16 Q0,Q8,Q12                  @eo[k] = e[k] - e[7 - k] row 1 & 2
1125
1126    @D26 R1ee[0] R1ee[1] R1ee[2] R1ee[3]
1127    @D27 R2ee[0] R2ee[1] R2ee[2] R2ee[3]
1128    VTRN.S32 D26,D27                    @1-cycle stall before it?
1129    @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
1130    @D27 R1ee[2] R1ee[3] R2ee[2] R2ee[3]
1131    VREV32.16 D2,D27                    @1-cycle stall before it?
1132    @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
1133    @D2 R1ee[3] R1ee[2] R2ee[3] R2ee[2]
1134    VMOV.S16 D27,D26
1135    VNEG.S16 D3,D2
1136    @Q13 R1ee[0] R1ee[1] R2ee[0] R2ee[1]  R1ee[0]  R1ee[1]  R2ee[0]  R2ee[1]
1137    @Q1  R1ee[3] R1ee[2] R2ee[3] R2ee[2] -R1ee[3] -R1ee[2] -R2ee[3] -R2ee[2]
1138
1139    @D8 : [0 0] [4 0] [8 0] [12 0]
1140    @D9 : [0 1] [4 1] [8 1] [12 1]
1141    VLD1.S16 {d8,d9},[SP]               @[0 0] [4 0] [8 0] [12 0] [0 1] [4 1] [8 1] [12 1]
1142    VADD.S16 Q1,Q13,Q1                  @ 1-cycle stall before it?
1143    @Q15 R1eee[0] R1eee[1] R2eee[0] R2eee[1] R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
1144
1145    @Q1  R1eee[0] R1eee[1] R2eee[0] R2eee[1]
1146    @    R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
1147    VTRN.S16 D2,D3                      @2-cycle stall before it?
1148    @Q1  R1eee[0] R1eeo[0] R2eee[0] R2eeo[0]
1149    @     R1eee[1] R1eeo[1] R2eee[1] R2eeo[1]
1150
1151    VDUP.S32 D4,D2[0]    @R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]    ;1-cycle stall?
1152    VDUP.S32 D5,D2[1]    @R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
1153    VDUP.S32 D6,D3[0]    @R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
1154    VDUP.S32 D7,D3[1]    @R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
1155
1156    @---------------Process EO--------------------
1157    @ Early start to avoid stalls
1158    MOV R12,#COFF_STD_2B                @Get stride of coeffs
1159
1160    VMULL.S16 Q5,D4,D8                  @   g_ai2_ihevc_trans_16 * R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]
1161    VMLAL.S16 Q5,D6,D9                  @ + g_ai2_ihevc_trans_16 * R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
1162    VMULL.S16 Q6,D5,D8                  @   g_ai2_ihevc_trans_16 * R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
1163    VMLAL.S16 Q6,D7,D9                  @ + g_ai2_ihevc_trans_16 * R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
1164
1165    ADD R11,R9,R12,LSL #1               @Load address of g_ai2_ihevc_trans_16[2]
1166    LSL R12,R12,#2
1167
1168    VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[2][0-4]]
1169
1170    VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[6][0-4]
1171    VMULL.S16 Q1,D26,D0                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R1
1172
1173    VMULL.S16 Q2,D26,D1                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R2
1174
1175    VZIP.S32 Q5,Q6                      @3-cycle instruction
1176    VMULL.S16 Q3,D27,D0                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R1
1177
1178
1179    VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[10][0-4]
1180    VMULL.S16 Q4,D27,D1                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R2
1181
1182    @These values must go to 0 4 8 12 columns hence we need stride *4
1183    LSL R10,R7,#2
1184
1185    VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[14][0-4]
1186
1187    VST1.32 D10,[R2],R10
1188    VMULL.S16 Q8,D27,D1                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R2
1189
1190    VST1.32 D11,[R2],R10
1191    VMULL.S16 Q7,D27,D0                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R1
1192
1193    VST1.32 D12,[R2],R10
1194    VMULL.S16 Q5,D26,D0                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R1
1195
1196    VST1.32 D13,[R2],R10
1197    VMULL.S16 Q6,D26,D1                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R2
1198
1199    SUB R2,R2,R10,LSL #2
1200
1201    @transpose the 4x4 matrix row1
1202    VTRN.32 Q1, Q3                      @R1 transpose1 -- 2 cycles
1203
1204    @transpose the 4x4 matrix row2
1205    VTRN.32 Q2,Q4                       @R2 transpose1 -- 2 cycles
1206
1207    VTRN.32 Q5, Q7                      @R1 transpose1 -- 2 cycles
1208
1209    VTRN.32 Q6,Q8                       @R2 transpose1 -- 2 cycles
1210
1211    VSWP    D10,D3                      @R1 transpose2
1212    VSWP    D14,D7                      @R1 transpose2
1213
1214    VSWP    D12,D5                      @R2 transpose2
1215    VSWP    D16,D9                      @R2 transpose2
1216
1217    VADD.S32 Q5,Q5,Q1                   @R1 add
1218    VADD.S32 Q3,Q3,Q7                   @R1 add
1219
1220    VADD.S32 Q2,Q2,Q4                   @R2 add
1221    VADD.S32 Q6,Q6,Q8                   @R2 add
1222
1223    VADD.S32 Q5,Q5,Q3                   @R1 add
1224
1225    VADD.S32 Q4,Q6,Q2                   @R2 add
1226
1227    @-----------------------Processing O ----------------------------
1228    @ Early start to avoid stalls
1229    MOV R12,#COFF_STD_2B                @Get coeffs stride
1230    LSL R12,R12,#1
1231    ADD R11,R9,#COFF_STD_2B             @Get address of g_ai2_ihevc_trans_16[1]
1232
1233    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7] -- 2 cycles
1234
1235    VZIP.S32 Q5,Q4                      @ 3 cycle instruction
1236    VMULL.S16 Q6,D18,D4                 @o[0][0-3]*  R1
1237
1238
1239    VMLAL.S16 Q6,D19,D5                 @o[0][4-7]*  R1     ; follows MULL instruction: Multiplier accumulator forwarding
1240    @write to memory
1241    @this should go to 2 6 10 14
1242    LSL R10,R7,#2
1243    ADD R2,R2,R7,LSL #1                 @move to third row
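@ R10 equals four pi4_tmp row strides, so after stepping down two rows the
@ four stores below land on rows 2, 6, 10 and 14 of the current column pair.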
1244    VST1.32 D10,[R2],R10
1245    VMULL.S16 Q7,D22,D4                 @o[0][0-3]*  R2
1246
1247    VST1.32 D11,[R2],R10
1248    VMLAL.S16 Q7,D23,D5                 @o[0][4-7]*  R2
1249
1250    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
1251
1252    VST1.32 D8,[R2],R10
1253    VMULL.S16 Q8,D18,D4                 @o[1][0-3]*  R1
1254
1255    VST1.32 D9,[R2],R10
1256    VMLAL.S16 Q8,D19,D5                 @o[1][4-7]*  R1
1257    SUB R2,R2,R10,LSL #2
1258    SUB R2,R2,R7,LSL #1
1259
1260    @--------------------Done processing EO -------------------------
1261
1262    @ -----------------Processing O continues------------------------
1263
1264    VMULL.S16 Q10,D22,D4                @o[1][0-3]*  R2
1265    VMLAL.S16 Q10,D23,D5                @o[1][4-7]*  R2
1266
1267    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
1268
1269    VLD1.S16 {d6,d7},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
1270    VMULL.S16 Q12,D18,D4                @o[2][0-3]*  R1
1271
1272    VMLAL.S16 Q12,D19,D5                @o[2][4-7]*  R1
1273    VMULL.S16 Q0,D18,D6                 @o[3][0-3]*  R1
1274    VMLAL.S16 Q0,D19,D7                 @o[3][4-7]*  R1
1275
1276    VMULL.S16 Q13,D22,D4                @o[2][0-3]*  R2
1277    VMLAL.S16 Q13,D23,D5                @o[2][4-7]*  R2
1278    VMULL.S16 Q1,D22,D6                 @o[3][0-3]*  R2
1279    VMLAL.S16 Q1,D23,D7                 @o[3][4-7]*  R2
1280
1281    @transpose the 4x4 matrix R1
1282    VTRN.32 Q6, Q8                      @ 2-cycle instruction
1283
1284    VTRN.32 Q12,Q0                      @ 2-cycle instruction
1285
1286    @transpose the 4x4 matrix R2
1287    VTRN.32 Q7,Q10                      @ 2-cycle instruction
1288
1289    VTRN.32 Q13,Q1                      @ 2-cycle instruction
1290
1291    VSWP    D24,D13
1292    VSWP    D0, D17
1293
1294    VSWP     D26,D15
1295    VSWP    D2,D21
1296
1297    VADD.S32 Q8 ,Q8 ,Q6
1298    VADD.S32 Q12,Q12,Q0
1299
1300    VADD.S32 Q10,Q10,Q7
1301    VADD.S32 Q13,Q13,Q1
1302
1303    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[9][0-7]
1304    VADD.S32 Q12 ,Q12 ,Q8
1305
1306    VADD.S32 Q13,Q13,Q10
1307    VMULL.S16 Q3,D18,D4                 @o[4][0-3]*  R1
1308    VMLAL.S16 Q3,D19,D5                 @o[4][4-7]*  R1
1309
1310    VZIP.S32 Q12,Q13
1311    VMULL.S16 Q4,D22,D4                 @o[4][0-3]*  R2
1312
1313
1314    VMLAL.S16 Q4,D23,D5                 @o[4][4-7]*  R2
1315    @write to memory
1316    @this should go to 1 3 5 7
1317    ADD R2,R2,R7
1318    LSL R7,R7,#1
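@ One row down with a two-row store step: the next four stores hit rows
@ 1, 3, 5 and 7; rows 9, 11, 13 and 15 are written further below before the
@ stride and the base pointer are restored.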
1319    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[11][0-7]
1320
1321    VST1.32 D24,[R2],R7
1322    VMULL.S16 Q5,D18,D4                 @o[5][0-3]*  R1
1323
1324    VST1.32 D25,[R2],R7
1325    VMLAL.S16 Q5,D19,D5                 @o[5][4-7]*  R1
1326
1327    VST1.32 D26,[R2],R7
1328    VMULL.S16 Q6,D22,D4                 @o[5][0-3]*  R2
1329
1330    VST1.32 D27,[R2],R7
1331    VMLAL.S16 Q6,D23,D5                 @o[5][4-7]*  R2
1332
1333    VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[13][0-7]
1334
1335    VLD1.S16 {d2,d3},[R11],R12          @g_ai2_ihevc_trans_16[15][0-7]
1336    VMULL.S16 Q7,D18,D4                 @o[6][0-3]*  R1
1337
1338    VMLAL.S16 Q7,D19,D5                 @o[6][4-7]*  R1
1339    VMULL.S16 Q10,D18,D2                @o[7][0-3]*  R1
1340    VMLAL.S16 Q10,D19,D3                @o[7][4-7]*  R1
1341
1342    VMULL.S16 Q8,D22,D4                 @o[6][0-3]*  R2
1343    VMLAL.S16 Q8,D23,D5                 @o[6][4-7]*  R2
1344    VMULL.S16 Q12,D22,D2                @o[7][0-3]*  R2
1345    VMLAL.S16 Q12,D23,D3                @o[7][4-7]*  R2
1346
1347
1348    @transpose the 4x4 matrix R1
1349    VTRN.32 Q3 ,Q5                      @ 2-cycle instruction
1350
1351    VTRN.32 Q7 ,Q10                     @ transpose step 2 R1 , 2-cycle instruction
1352
1353    @transpose the 4x4 matrix R2
1354    VTRN.32 Q4 ,Q6                      @ 2-cycle instruction
1355
1356    VTRN.32 Q8 ,Q12                     @ transpose step 2 R2 , 2-cycle instruction
1357
1358    VSWP    D14,D7                      @ transpose step 3, R1
1359    VSWP    D20,D11                     @ transpose step 4, R1
1360    VSWP    D16,D9                      @ transpose step 3, R2
1361    VSWP    D24,D13                     @ transpose step 4, R2
1362
1363    VADD.S32 Q5 ,Q5 ,Q3
1364    VADD.S32 Q10,Q10,Q7
1365    VADD.S32 Q6 ,Q6 ,Q4
1366    VADD.S32 Q12,Q12,Q8
1367    VADD.S32 Q10,Q10,Q5
1368    VADD.S32 Q12,Q12,Q6
1369
1370    @ 2-cycle stall
1371    VZIP.S32 Q10,Q12                    @ 3-cycle instruction
1372
1373    @ 2-cycle stall
1374    @this should go to 9 11 13 15
1375    VST1.32 D20,[R2],R7
1376
1377    VST1.32 D21,[R2],R7
1378
1379    VST1.32 D24,[R2],R7
1380
1381    VST1.32 D25,[R2],R7
1382
1383    SUB R2,R2,R7,LSL #3
1384    LSR R7,R7,#1
1385    SUB R2,R2,R7
1386
1387    ADD R2,R2,#8                        @MOVE TO next column pair of pi4_tmp
1388
1389    ADD R8,R8,#2                        @increment loop cntr by 2 (two rows per iteration)
1390    CMP R8,#16                          @check loop cntr
1391    BNE CORE_LOOP_16X16_HORIZ           @loop back until all 16 rows are done
1392
1393
1394@*****************Vertical transform************************************
1395
1396@Initialization for vert transform
1397@pi4_tmp will be the new src
1398@tmp stride will be new src stride
1399@dst will be new pi4_tmp
1400@dst stride will be new tmp stride
1401@trans table will be of 32 bit
1402
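@ In effect the 16x16 forward transform is two applications of the same
@ 16-point partial butterfly: the horizontal pass above transformed each
@ residue row and stored the 32-bit results transposed in pi4_tmp, so this
@ pass can again walk pi4_tmp row-wise (i.e. column-wise with respect to the
@ first-pass output), apply the butterfly with the 32-bit coefficient table,
@ and write rounded 16-bit coefficients to pi2_dst.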
1403    LDR R9,g_ai4_ihevc_trans_16_addr        @get 32 bit transform matrix
1404ulbl3:
1405    ADD R9, R9, PC
1406
1407    SUB R0,R2,#64                       @set pi4_tmp as src [-64 to move back to origin]
1408    MOV R2,R3                           @set dst as tmp
1409    MOV R4,#TMP_STRIDE                  @set tmp stride as src stride
1410    LSL R7,R6,#1                        @Set dst stride as tmp stride
1411    SUB R4,#48                          @Adjust stride for the 3 previous post-increment loads (3*16 bytes)
1412
1413    @Block SAD
1414    VADD.S32 D28,D28,D29
1415    VPADD.S32 D28,D28,D29
1416    VMOV.S32 R3,D28[0]
1417    @ SAD calculation ends -- final value in R3.
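@ Rough equivalent of the reduction above (Q14 = D28:D29 holds the four
@ partial SAD lanes accumulated during the horizontal pass; names are
@ illustrative only):
@
@   sad = lane[0] + lane[1] + lane[2] + lane[3];   /* moved to R3 here and
@                                                     into R0 before return */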
1418
1419    @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1]
1420    @values of g_ai4_ihevc_trans_16 and write to stack
1421    MOV R12,#COFF_STD_W
1422    LSL R12,R12,#2
1423    VLD1.S32 D28,[R9],R12
1424    VLD1.S32 D29,[R9],R12
1425    VLD1.S32 D30,[R9],R12
1426    VLD1.S32 D31,[R9],R12
1427    SUB R9,R9,R12,LSL #2
1428
1429    VREV64.32 Q15,Q15
1430    VTRN.S32 Q14,Q15
1431    VST1.S32 {Q14-Q15},[SP]
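@ The eight coefficient words stored at [SP] here are reloaded in every
@ iteration of CORE_LOOP_16X16_VERT (see the VLD1 from [SP] below) to build
@ the eee/eeo multipliers without re-reading the transform table.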
1432
1433    VMOV.U32 Q14,#RADD                  @get the round factor to q14
1434    VMOV.U32 Q15,#SHIFT                 @Get the shift to neon
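@ Every output of this pass is rounded and narrowed to 16 bits as
@   out = (acc + RADD) >> SHIFT
@ i.e. a VADD with Q14 followed by a VSHRN with #SHIFT.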
1435
1436    MOV R8,#0                           @INIT LOOP
1437
1438CORE_LOOP_16X16_VERT:
1439
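@ Each iteration transforms two rows of pi4_tmp (two columns of the final
@ output). The decomposition below mirrors the comments on the instructions;
@ rough sketch per row (illustrative pseudo code, rounding applied later):
@
@   for (k = 0; k < 8; k++) {
@       e[k] = in[k] + in[15 - k];
@       o[k] = in[k] - in[15 - k];
@   }
@   for (k = 0; k < 4; k++) {
@       ee[k] = e[k] + e[7 - k];
@       eo[k] = e[k] - e[7 - k];
@   }
@   eee[0] = ee[0] + ee[3];    eeo[0] = ee[0] - ee[3];
@   eee[1] = ee[1] + ee[2];    eeo[1] = ee[1] - ee[2];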
1440    VLD1.S32 {D0,D1},[R0]!              @LOAD 1-4 pi4_tmp R1
1441    VLD1.S32 {D2,D3},[R0]!              @LOAD 5-8 pi4_tmp R1
1442    VLD1.S32 {D4,D5},[R0]!              @LOAD 9-12 pi4_tmp R1
1443    VLD1.S32 {D6,D7},[R0],R4            @LOAD 13-16 pi4_tmp R1
1444
1445    VLD1.S32 {D8,D9},[R0]!              @LOAD 1-4 pi4_tmp R2
1446    VLD1.S32 {D10,D11},[R0]!            @LOAD 5-8 pi4_tmp R2
1447    VLD1.S32 {D12,D13},[R0]!            @LOAD 9-12 pi4_tmp R2
1448    VLD1.S32 {D14,D15},[R0],R4          @LOAD 13-16 pi4_tmp R2
1449
1450    VREV64.S32 Q2,Q2                    @Rev 9-12 R1
1451    VREV64.S32 Q3,Q3                    @Rev 13-16 R1
1452    VREV64.S32 Q6,Q6                    @Rev 9-12 R2
1453    VREV64.S32 Q7,Q7                    @Rev 13-16 R2
1454
1455    VSWP D6,D7
1456    VSWP D4,D5
1457    VADD.S32 Q8 ,Q0,Q3                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R1
1458    VSWP D12,D13                        @ dual issued with prev. instruction
1459    VADD.S32 Q9 ,Q1,Q2                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R1
1460    VSWP D14,D15                        @ dual issued with prev. instruction
1461    VSUB.S32 Q10,Q0,Q3                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R1
1462    VSUB.S32 Q11,Q1,Q2                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R1
1463
1464    VADD.S32 Q12,Q4,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R2
1465    VREV64.S32    Q9 ,Q9                @rev e[k] k-> 4-7 R1, dual issued with prev. instruction
1466    VADD.S32 Q13,Q5,Q6                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R2
1467    VSUB.S32 Q0 ,Q4,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R2
1468    VSWP D18,D19                        @ dual issued with prev. instruction
1469    VSUB.S32 Q1 ,Q5,Q6                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R2
1470    VREV64.S32    Q13,Q13               @rev e[k] k-> 4-7 R2, dual issued with prev. instruction
1471
1472    VADD.S32 Q2,Q8,Q9                   @ee[k] = e[k] + e[7 - k] row R1
1473    VSUB.S32 Q3,Q8,Q9                   @eo[k] = e[k] - e[7 - k] row R1
1474    VSWP D26,D27
1475
1476
1477    VADD.S32 Q4,Q12,Q13                 @ee[k] = e[k] + e[7 - k] row R2
1478    VSUB.S32 Q5,Q12,Q13                 @eo[k] = e[k] - e[7 - k] row R2
1479    VREV64.S32 D5,D5                    @rev ee[k] k-> 2-3 R1, dual issued with prev. instruction
1480
1481    VADD.S32 D12,D4,D5                  @eee[0] eee[1]    R1
1482    VSUB.S32 D13,D4,D5                  @eeo[0] eeo[1]    R1
1483    VREV64.S32 D9,D9                    @rev ee[k] k-> 2-3 R2, dual issued with prev. instruction
1484
1485
1486    VADD.S32 D14,D8,D9                  @eee[0] eee[1]    R2
1487    VSUB.S32 D15,D8,D9                  @eeo[0] eeo[1]    R2
1488
1489    VLD1.S32 {Q12,Q13},[SP]             @Load g_ai2_ihevc_trans_16[xx]->  Q12 : [0 0] [8 0] [4 0] [12 0]  Q13 : [0 1] [8 1] [4 1] [12 1]
1490    VREV64.S32 Q8,Q6                    @Q6 : eee[0] eee[1] eeo[0] eeo[1] R1   ->     ;Q8 : eee[1] eee[0] eeo[1] eeo[0] R1
1491
1492    VREV64.S32 Q9,Q7                    @Q7 : eee[0] eee[1] eeo[0] eeo[1] R2     ->    ;Q9 : eee[1] eee[0] eeo[1] eeo[0] R2
1493
1494
1495    VMUL.S32 Q4,Q6,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R1
1496    VMLA.S32 Q4,Q8,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0]    R1
1497
1498    VMUL.S32 Q6,Q7,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R2
1499    VMLA.S32 Q6,Q9,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R2
1500
1501                                        @Q3    :R1E00 R1E01 R1E02 R1E03
1502                                        @Q5    :R2E00 R2E01 R2E02 R2E03
1503    VSWP D7,D10                         @ dual issued with prev. instruction
1504                                        @Q3    :R1E00 R1E01 R2E00 R2E01
1505                                        @Q5    :R1E02 R1E03 R2E02 R2E03
1506    VSWP D7,D11
1507                                        @Q3    :R1E00 R1E01 R2E02 R2E03
1508                                        @Q5    :R1E02 R1E03 R2E00 R2E01
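@ After the two swaps each Q register mixes the eo halves of both rows, so a
@ single VMUL/VMLA against one (partly swapped) coefficient row followed by a
@ VPADD yields the corresponding output for row 1 in one lane and for row 2
@ in the other lane of a D register.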
1509
1510    MOV R12,#COFF_STD_W
1511    ADD R11,R9,R12,LSL #1               @Get address of row 2 of the transform table
1512    LSL R12,R12,#2
1513
1514    VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[2][0-4] -> 2G0 2G1 2G2 2G3, 2-cycle instr.
1515
1516    VADD.S32  Q4,Q4,Q14                 @ROUND  R1
1517    VMUL.S32  Q12,Q3,Q7                 @2G0 2G1 2G2 2G3 * R1E00 R1E01 R2E02 R2E03, 4-cycle instruction
1518    VSWP      D14,D15                   @2G0 2G1 2G2 2G3 -> 2G2 2G3 2G0 2G1, dual issued with prev. instruction
1519
1520    VADD.S32 Q6,Q6,Q14                  @ROUND  R2
1521
1522    VSHRN.S32 D8,Q4,#SHIFT              @NARROW R1
1523
1524    VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[6][0-4]
1525    VSHRN.S32 D9,Q6,#SHIFT              @NARROW R2, dual issued in 2nd cycle
1526
1527    VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4], 4-cycle instruction
1528    VSWP      D16,D17                   @dual issued with prev. instr.
1529
1530    VZIP.S16 D8,D9                      @INTERLEAVE R1 R2 R1 R2 R1 R2 to write
1531    VMLA.S32  Q12,Q5,Q7                 @2G2 2G3 2G0 2G1 * R1E02 R1E03 R2E00 R2E01, 4-cycle instruction
1532
1533
1534    @WRITE INTO MEM the values or wait to be shuffled
1535    @These values must go to columns 0 4 8 12
1536    LSL R10,R7,#2
1537    VST1.S32 D8[0],[R2],R10
1538
1539    VST1.S32 D9[0],[R2],R10
1540
1541    VST1.S32 D8[1],[R2],R10
1542    VPADD.S32 D18,D24,D25               @D18[0] -> 2G0*R1E00+2G1*R1E01 2G2*R2E02+2G3*R2E03
1543                                        @D18[1] -> 2G2*R1E02+2G3*R1E03 2G0*R2E00+2G1*R2E01
1544
1545    VST1.S32 D9[1],[R2],R10
1546    VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]
1547    LSL R10,R10,#2
1548    SUB R2,R2,R10
1549
1550    VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[10][0-4]
1551
1552    VMUL.S32  Q6,Q3,Q7                  @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4]
1553    VSWP      D14,D15                   @ dual issued with prev. instruction
1554    VPADD.S32 D19,D4,D5
1555
1556    VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[14][0-4]
1557    VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4]
1558    VSWP      D16,D17
1559
1560    VMLA.S32  Q6,Q5,Q7                  @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4]
1561    VADD.S32 Q9,Q9,Q14                  @Round by RADD R1
1562    VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4]
1563    VSHRN.S32 D8,Q9,#SHIFT              @Shift by SHIFT
1564    VPADD.S32 D24,D12,D13
1565    @---------------Processing O, Row 1 and Row 2--------------------------------------
1566    @ Early start to avoid stalls
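@ Odd outputs (rows 1, 3, ..., 15) of both columns: 32-bit dot products of
@ o[0..7] with the odd coefficient rows, reduced with VPADD/VADD, rounded
@ with RADD and narrowed by SHIFT before the strided stores below.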
1567    MOV R12,#COFF_STD_W
1568    ADD R11,R9,R12                      @Get 1ST row
1569    LSL R12,R12,#1
1570
1571    LSL R10,R7,#2
1572    ADD R2,R2,R7,LSL #1                 @move to third row
1573    @this should go to 2  6 10 14
1574    VST1.S32 D8[0],[R2],R10
1575
1576    VST1.S32 D8[1],[R2],R10
1577    VPADD.S32 D25,D4,D5                 @ dual issued with prev. instruction in 2nd cycle
1578
1579    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7]
1580    VADD.S32 Q12,Q12,Q14                @Round by RADD R2, dual issued with prev. instruction in 2nd cycle
1581    VMUL.S32 Q6,Q2,Q0                   @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R2
1582    VMLA.S32 Q6,Q3,Q1                   @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R2
1583    VSHRN.S32 D9,Q12,#SHIFT             @Shift by SHIFT
1584
1585    VMUL.S32 Q2,Q2,Q10                  @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R1
1586    VMLA.S32 Q2,Q3,Q11                  @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R1
1587    VADD.S32 D11,D12,D13                @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R2, dual issued with prev. instr.
1588    VST1.S32 D9[0],[R2],R10
1589
1590    VST1.S32 D9[1],[R2],R10
1591    VADD.S32 D10,D4,D5                  @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R1, dual issued with prev. instr.
1592    LSL R10,R10,#2                      @go back to origin
1593    SUB R2,R2,R10
1594    SUB R2,R2,R7,LSL #1
1595
1596    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
1597
1598    VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
1599    VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
1600    VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
1601    VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
1602
1603    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
1604    VADD.S32 D18,D14,D15
1605    VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
1606    VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
1607    VADD.S32 D19,D16,D17
1608    VMUL.S32 Q4,Q2,Q0
1609    VMLA.S32 Q4,Q3,Q1
1610    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
1611    VADD.S32 D26,D24,D25                @ dual issued with prev. instr.
1612    VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
1613    VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
1614    VADD.S32 D27,D8,D9
1615    VMUL.S32 Q4,Q2,Q0
1616    VMLA.S32 Q4,Q3,Q1
1617    VADD.S32 D12,D12,D13
1618    @Q5 Q9 Q13 Q6
1619    VPADD.S32 D14,D10,D11
1620    VPADD.S32 D15,D18,D19
1621    VPADD.S32 D16,D26,D27
1622    VADD.S32  D13,D8,D9
1623    VADD.S32 Q9,Q7,Q14
1624    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[9][0-7]
1625    VPADD.S32 D17,D12,D13               @ dual issued with prev. instr. in 2nd cycle
1626
1627    VMUL.S32 Q4,Q2,Q10                  @o[0][0-3]
1628    VMLA.S32 Q4,Q3,Q11                  @o[0][4-7]
1629
1630    VADD.S32 Q12,Q8,Q14
1631
1632    VMUL.S32 Q6,Q2,Q0                   @o[0][0-3]
1633    VMLA.S32 Q6,Q3,Q1                   @o[0][4-7]
1634
1635    VSHRN.S32 D26,Q9,#SHIFT
1636    VSHRN.S32 D27,Q12,#SHIFT
1637    VADD.S32 D10,D8,D9
1638    @write to memory this should go to 1 3 5 7
1639    ADD R2,R2,R7
1640    LSL R7,R7,#1
1641    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[11][0-7]
1642    VADD.S32 D11,D12,D13                @ dual issued with prev. instr.
1643
1644    VST1.S32 D26[0],[R2],R7
1645    VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
1646    VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
1647    VST1.S32 D26[1],[R2],R7
1648    VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
1649    VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
1650    VST1.S32 D27[0],[R2],R7
1651    VADD.S32 D18,D14,D15
1652    VST1.S32 D27[1],[R2],R7
1653
1654    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[13][0-7]
1655    VADD.S32 D19,D16,D17                @ dual issued with prev. instr.
1656
1657    VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
1658    VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
1659    VMUL.S32 Q4,Q2,Q0
1660    VMLA.S32 Q4,Q3,Q1
1661
1662    VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[15][0-7]
1663    VADD.S32 D26,D24,D25
1664
1665    VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
1666    VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
1667    VADD.S32  D27,D8,D9
1668
1669    VMUL.S32 Q4,Q2,Q0
1670    VMLA.S32 Q4,Q3,Q1
1671    VADD.S32 D12,D12,D13
1672    @Q5 Q9 Q13 Q6
1673    VPADD.S32 D14,D10,D11
1674    VPADD.S32 D15,D18,D19
1675    VPADD.S32 D16,D26,D27
1676    VADD.S32  D13,D8,D9
1677    VADD.S32 Q9,Q7,Q14
1678    @ 1- cycle stall?
1679    VPADD.S32 D17,D12,D13
1680    VSHRN.S32 D22,Q9,#SHIFT
1681    VADD.S32 Q10,Q8,Q14
1682    @ 2-cycle stall?
1683    VSHRN.S32 D23,Q10,#SHIFT
1684
1685    @this should go to 9 11 13 15
1686    @LSL R11,R7,#1
1687    VST1.S32 D22[0],[R2],R7
1688    VST1.S32 D22[1],[R2],R7
1689    VST1.S32 D23[0],[R2],R7
1690    VST1.S32 D23[1],[R2],R7
1691
1692    SUB R2,R2,R7,LSL #3
1693    LSR R7,R7,#1
1694    SUB R2,R2,R7
1695
1696    ADD R2,R2,#4                        @MOVE TO next column pair of pi2_dst
1697
1698    ADD R8,R8,#2                        @increment loop cntr by 2 since we process 2 cols per iteration
1699    CMP R8,#16                          @check loop cntr
1700    BNE CORE_LOOP_16X16_VERT            @loop back until all 16 cols are done
1701
1702    MOV R0,R3                           @return the block SAD computed above
1703
1704    ADD SP,SP,#32                       @release the 32 bytes used for the stacked coefficients
1705    vpop {d8 - d15}
1706    LDMFD          sp!,{r4-r12,PC}      @restore the saved registers and return to caller
1707
1708