@/******************************************************************************
@ *
@ * Copyright (C) 2018 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

.text
.align 4

@/**
@/*******************************************************************************
@/*
@/* @brief
@/*  Residue calculation and Forward Transform for 4x4 block with 8-bit input
@/*
@/* @par Description:
@/*  Performs residue calculation by subtracting the prediction from the source,
@/*  followed by the forward transform
@/*
@/* @param[in] pu1_src
@/*  Input 4x4 pixels
@/*
@/* @param[in] pu1_pred
@/*  Prediction data
@/*
@/* @param[in] pi4_tmp
@/*  Temporary buffer of size 4x4
@/*
@/* @param[out] pi2_dst
@/*  Output 4x4 coefficients
@/*
@/* @param[in] src_strd
@/*  Input stride
@/*
@/* @param[in] pred_strd
@/*  Prediction stride
@/*
@/* @param[in] dst_strd
@/*  Output stride
@/*
@/* @param[in] chr_plane
@/*  Chroma plane
@/*
@/* @returns  Void
@/*
@/* @remarks
@/*  None
@/*
@/*******************************************************************************
@/*/

@/**************Variables Vs Registers*****************************************
@    r0 => *pu1_src
@    r1 => *pu1_pred
@    r2 => *pi4_temp
@    r3 => *pi2_dst
@    r4 => src_strd
@    r5 => pred_strd
@    r6 => dst_strd
@    r7 => chroma_plane

    .global ihevc_resi_trans_4x4_a9q

ihevc_resi_trans_4x4_a9q:

    STMFD sp!, {r4-r7, r14}    @ store the callee-saved registers used here to memory
    LDR r4, [sp,#20]           @ r4 contains src_strd
    LDR r5, [sp,#24]           @ r5 contains pred_strd
    LDR r6, [sp,#28]           @ r6 contains dst_strd
    LDR r7, [sp,#32]           @ r7 contains chroma_plane

    CMP r7, #-1
    BEQ NON_INTERLEAVE_LOAD    @ if flag == NULL_PLANE, use non-interleaving loads

    VLD1.64 d0, [r0], r4       @ load row 0 src
    VLD1.64 d4, [r0], r4       @ load row 1 src
    VLD1.64 d1, [r0], r4       @ load row 2 src
    VLD1.64 d5, [r0], r4       @ load row 3 src
    VUZP.8 d0, d4              @ de-interleaving unzip to separate the chroma planes of pu1_src, rows 0-1
    VUZP.8 d1, d5              @ de-interleaving unzip to separate the chroma planes of pu1_src, rows 2-3

    VLD1.64 d2, [r1], r5       @ load row 0 pred
    VLD1.64 d6, [r1], r5       @ load row 1 pred
    VLD1.64 d3, [r1], r5       @ load row 2 pred
    VLD1.64 d7, [r1], r5       @ load row 3 pred
    VUZP.8 d2, d6              @ de-interleaving unzip to separate the chroma planes of pu1_pred, rows 0-1
    VUZP.8 d3, d7              @ de-interleaving unzip to separate the chroma planes of pu1_pred, rows 2-3

    CMP r7, #0
    BEQ LOAD_END               @ U plane: de-interleaved data already in d0-d3
    VSWP.8 d0, d4              @ V plane: swap the second plane into d0-d3
    VSWP.8 d1, d5
    VSWP.8 d2, d6
    VSWP.8 d3, d7

    B LOAD_END

NON_INTERLEAVE_LOAD:
    VLD1.U32 d0[0], [r0], r4   @ load row 0 src
    VLD1.U32 d0[1], [r0], r4   @ load row 1 src
    VLD1.U32 d1[0], [r0], r4   @ load row 2 src
    VLD1.U32 d1[1], [r0], r4   @ load row 3 src

    VLD1.U32 d2[0], [r1], r5   @ load row 0 pred
    VLD1.U32 d2[1], [r1], r5   @ load row 1 pred
    VLD1.U32 d3[0], [r1], r5   @ load row 2 pred
    VLD1.U32 d3[1], [r1], r5   @ load row 3 pred

LOAD_END:
    @ Finding the residue
    VSUBL.U8 q2, d0, d2        @ q2 contains the first eight 16-bit residues (rows 0-1)
    VSUBL.U8 q3, d1, d3        @ q3 contains the second eight 16-bit residues (rows 2-3)

    @ SAD calculation
    VABDL.U8 q12, d0, d2       @ q12 contains absolute differences
    VABAL.U8 q12, d1, d3       @ q12 accumulates absolute differences
    VADD.U16 d26, d24, d25     @ add d-registers of q12
    VPADDL.U16 d27, d26        @ d27 contains 2 32-bit values that have to be added
    VPADDL.U32 d28, d27        @ d28 contains 64-bit SAD, only LSB important
    VMOV.32 r0, d28[0]         @ SAD stored in r0 for return
    @ SAD calculation ends

    @ Forward transform - step 1
    VMOV.I16 d2, #64           @ generate immediate constant in d2 for even row multiplication
    VTRN.16 d4, d5             @ 3-step transpose of residue matrix starts
    VTRN.16 d6, d7             @ 2nd step of the 3-step matrix transpose
    VMOV.I16 d0, #83           @ generate immediate constant in d0 for odd row multiplication
    VTRN.32 q2, q3             @ final step of matrix transpose

    VMOV.I16 d1, #36           @ generate immediate constant in d1 for odd row multiplication
    VSWP d6, d7                @ vector swap to allow even and odd row calculation using Q registers
    VADD.S16 q10, q2, q3       @ q10 has the even array
    VSUB.S16 q11, q2, q3       @ q11 has the odd array
    VMULL.S16 q12, d20, d2     @ e[0]*64
    VMLAL.S16 q12, d21, d2[0]  @ row 1 of results: e[0]*64 + e[1]*64
    VMULL.S16 q13, d20, d2     @ e[0]*64
    VMLSL.S16 q13, d21, d2[0]  @ row 3 of results: e[0]*64 - e[1]*64
    VMULL.S16 q8, d22, d0      @ o[0]*83
    VMLAL.S16 q8, d23, d1[0]   @ row 2 of results: o[0]*83 + o[1]*36
    VMULL.S16 q9, d22, d1      @ o[0]*36
    VMLSL.S16 q9, d23, d0[0]   @ row 4 of results: o[0]*36 - o[1]*83

    @ Forward transform - step 2
    VMOV.I32 d2, #64           @ generate immediate constant in d2 for even row multiplication
    VMOV.I32 d0, #83           @ generate immediate constant in d0 for odd row multiplication
    VTRN.32 q12, q8            @ 4-step transpose of residue matrix starts
    VTRN.32 q13, q9            @ 2nd step of the 4-step matrix transpose

    VMOV.I32 d1, #36           @ generate immediate constant in d1 for odd row multiplication
    VSWP d25, d26              @ 3rd step of the 4-step matrix transpose
    VSWP d17, d18              @ 4th step of the 4-step matrix transpose
    VADD.S32 q2, q12, q9       @ e[0]
    VADD.S32 q3, q8, q13       @ e[1]
    VSUB.S32 q10, q12, q9      @ o[0]
    VSUB.S32 q11, q8, q13      @ o[1]

    VMUL.S32 q12, q2, d2[0]    @ e[0]*64
    VMLA.S32 q12, q3, d2[0]    @ row 1 of results: e[0]*64 + e[1]*64
    VMUL.S32 q13, q2, d2[0]    @ e[0]*64
    VMLS.S32 q13, q3, d2[0]    @ row 3 of results: e[0]*64 - e[1]*64
    VMUL.S32 q8, q10, d0[0]    @ o[0]*83
    VMLA.S32 q8, q11, d1[0]    @ row 2 of results: o[0]*83 + o[1]*36
    VMUL.S32 q9, q10, d1[0]    @ o[0]*36
    VMLS.S32 q9, q11, d0[0]    @ row 4 of results: o[0]*36 - o[1]*83

    VRSHRN.S32 d0, q12, #9     @ (row1 + 256)/512
    VRSHRN.S32 d1, q8, #9      @ (row2 + 256)/512
    VRSHRN.S32 d2, q13, #9     @ (row3 + 256)/512
    VRSHRN.S32 d3, q9, #9      @ (row4 + 256)/512

    LSL r7, r6, #1             @ r7 = 2*dst_strd, as pi2_dst contains 2-byte integers
    VST1.U16 d0, [r3], r7      @ store 1st row of result
    VST1.U16 d1, [r3], r7      @ store 2nd row of result
    VST1.U16 d2, [r3], r7      @ store 3rd row of result
    VST1.U16 d3, [r3], r7      @ store 4th row of result

    LDMFD sp!,{r4-r7,r15}      @ restore the registers from the stack and return (pc is popped)

    @ Function End

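@ For reference, the arithmetic implemented by ihevc_resi_trans_4x4_a9q above
@ can be summarised by the C-style sketch below. It is kept entirely in
@ comments so the file still assembles, and it is only a minimal model of the
@ non-interleaved path, not the project's reference C code; the helper name
@ fwd4() and the loop variables are illustrative.
@
@   UWORD32 sad = 0; WORD32 resi[4][4], tmp[4][4];
@   for (i = 0; i < 4; i++)
@       for (j = 0; j < 4; j++) {
@           resi[i][j] = pu1_src[i * src_strd + j] - pu1_pred[i * pred_strd + j];
@           sad += ABS(resi[i][j]);
@       }
@   /* 4-point kernel used by both passes:                           */
@   /* fwd4(r0,r1,r2,r3): e0 = r0 + r3; e1 = r1 + r2;                 */
@   /*                    o0 = r0 - r3; o1 = r1 - r2;                 */
@   /*                    out0 = 64*(e0 + e1); out1 = 83*o0 + 36*o1;  */
@   /*                    out2 = 64*(e0 - e1); out3 = 36*o0 - 83*o1;  */
@   apply fwd4() to every row of resi -> tmp   (no shift, 32-bit intermediates)
@   apply fwd4() to every column of tmp, then store (x + 256) >> 9 to pi2_dst
@   return sad;   /* returned in r0 */
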
@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and forward transform type 1
@*  on input pixels
@*
@* @description
@*  Performs residue calculation by subtracting the prediction from the source,
@*  followed by the forward transform
@*
@* @param[in] pu1_src
@*  Input 4x4 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 4x4
@*
@* @param[out] pi2_dst
@*  Output 4x4 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction stride
@*
@* @param[in] dst_strd
@*  Output stride
@*
@* @param[in] chr_plane (unused)
@*  Chroma plane
@*
@* @returns  void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
@                                     UWORD8 *pu1_pred,
@                                     WORD32 *pi4_temp,
@                                     WORD16 *pi2_dst,
@                                     WORD32 src_strd,
@                                     WORD32 pred_strd,
@                                     WORD32 dst_strd,
@                                     WORD32 chroma_plane);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp]    - src_strd
@ [sp+4]  - pred_strd
@ [sp+8]  - dst_strd
@ [sp+12] - chroma_plane
@
@*******************************************************************************

    .global ihevc_resi_trans_4x4_ttype1_a9q

ihevc_resi_trans_4x4_ttype1_a9q:

    PUSH {r4}
    vpush {d8 - d15}

    LDR r2,[sp,#68]        @ r2 = src_strd
    LDR r4,[sp,#72]        @ r4 = pred_strd

    VLD1.32 d2[0],[r0],r2  @ Row 1 of source in d2[0]
    VLD1.32 d3[0],[r1],r4  @ Row 1 of prediction in d3[0]
    VLD1.32 d2[1],[r0],r2  @ Row 2 of source in d2[1]
    VLD1.32 d3[1],[r1],r4  @ Row 2 of prediction in d3[1]

    VLD1.32 d8[0],[r0],r2  @ Row 3 of source in d8[0]
    VABDL.U8 q0,d2,d3      @ Absolute differences of rows 1 and 2 in d0
                           @ R2:[d11[3] d11[2] d11[1] d11[0]] => Row 2 of residue
    VLD1.32 d9[0],[r1],r4  @ Row 3 of prediction in d9[0]
    VSUBL.U8 q5,d2,d3      @ R1:[d10[3] d10[2] d10[1] d10[0]] => Row 1 of residue
    VLD1.32 d8[1],[r0]     @ Row 4 of source in d8[1]
    VTRN.16 d10,d11        @ Transpose step 1
    VLD1.32 d9[1],[r1]     @ Row 4 of prediction in d9[1]

    VSUBL.U8 q6,d8,d9      @ R3:[d12[3] d12[2] d12[1] d12[0]] => Row 3 of residue
                           @ R4:[d13[3] d13[2] d13[1] d13[0]] => Row 4 of residue
    VABAL.U8 q0,d8,d9      @ Absolute differences of rows 3 and 4 in d1
    VTRN.16 d12,d13        @ Transpose step 2
    VTRN.32 q5,q6          @ Transpose step 3, residue block transposed
                           @ Columns are in C1:d10, C2:d11, C3:d12 and C4:d13
    VADD.S16 d23,d11,d13   @ d23 = C2 + C4
    VMOV.I32 d6,#55        @ Constant used for multiplication
    VADD.S16 d22,d10,d13   @ d22 = C1 + C4
    VADD.U16 d0,d1,d0      @ Accumulating SAD step 1
    VMOV.I32 d7,#84        @ Constant used for multiplication
    VMULL.S16 q7,d23,d6[0] @ q7 = 55*C2 + 55*C4
    VMOV.I32 d4,#74        @ Constant used for multiplication
    VMULL.S16 q9,d22,d7[0] @ q9 = 84*C1 + 84*C4
    VADD.S16 d16,d10,d11   @ d16 = C1 + C2
    VMUL.S16 d12,d12,d4[0] @ d12 = 74*C3
    VMOV.I32 d5,#29        @ Constant used for multiplication
    VPADDL.U16 d0,d0       @ Accumulating SAD step 2
    VSUB.S16 d16,d16,d13   @ d16 = C1 + C2 - C4
    VMLAL.S16 q7,d22,d5[0] @ q7 = 29*C1 + 55*C2 + 84*C4
    VMLSL.S16 q9,d23,d5[0] @ q9 = 84*C1 - 29*C2 + 55*C4
    VMULL.S16 q8,d16,d4[0] @ q8 = 74*C1 + 74*C2 - 74*C4
    VPADDL.U32 d0,d0       @ Accumulating SAD step 3, SAD in d0
    VSUB.S32 q10,q9,q7     @ q10 = q9 - q7 = 55*C1 - 84*C2 - 29*C4
    VMOV.32 r0,d0[0]       @ Return SAD value
    VRSHR.S32 q8,q8,#1     @ Rounding shift right by 1 in q8

    VADDW.S16 q7,q7,d12    @ q7 = 29*C1 + 55*C2 + 74*C3 + 84*C4
    VSUBW.S16 q9,q9,d12    @ q9 = 84*C1 - 29*C2 - 74*C3 + 55*C4
    VADDW.S16 q10,q10,d12  @ q10 = 55*C1 - 84*C2 + 74*C3 - 29*C4

    VRSHR.S32 q7,q7,#1     @ Rounding shift right by 1 in q7
    VRSHR.S32 q9,q9,#1     @ Rounding shift right by 1 in q9
    VRSHR.S32 q10,q10,#1   @ Rounding shift right by 1 in q10
                           @ Transform stage 1 is in P1:q7, P2:q8, P3:q9 and P4:q10
    VTRN.32 q7,q8
    VTRN.32 q9,q10
    VSWP d15,d18
    VSWP d17,d20           @ Residue block transposed
                           @ Corresponding columns are in S1:q7, S2:q8, S3:q9 and S4:q10
    VADD.S32 q13,q7,q8     @ q13 = S1 + S2
    VADD.S32 q1,q7,q10     @ q1 = S1 + S4
    VADD.S32 q4,q8,q10     @ q4 = S2 + S4
    VSUB.S32 q13,q13,q10   @ q13 = S1 + S2 - S4
    VMUL.S32 q12,q1,d5[0]  @ q12 = 29*S1 + 29*S4
    VMUL.S32 q14,q1,d7[0]  @ q14 = 84*S1 + 84*S4
    VMUL.S32 q13,q13,d4[0] @ q13 = 74*S1 + 74*S2 - 74*S4

    VMLA.S32 q12,q4,d6[0]  @ q12 = 29*S1 + 55*S2 + 84*S4
    VMLS.S32 q14,q4,d5[0]  @ q14 = 84*S1 - 29*S2 + 55*S4
    VMUL.S32 q9,q9,d4[0]   @ q9 = 74*S3

    LDR r4,[sp,#76]        @ r4 = dst_strd
    LSL r4,r4,#1           @ r4 = 2*dst_strd

    VRSHRN.S32 d26,q13,#8
    VSUB.S32 q15,q14,q12   @ q15 = q14 - q12 = 55*S1 - 84*S2 - 29*S4

    VADD.S32 q12,q12,q9    @ q12 = 29*S1 + 55*S2 + 74*S3 + 84*S4
    VSUB.S32 q14,q14,q9    @ q14 = 84*S1 - 29*S2 - 74*S3 + 55*S4
    VADD.S32 q15,q15,q9    @ q15 = 55*S1 - 84*S2 + 74*S3 - 29*S4

    VRSHRN.S32 d24,q12,#8
    VRSHRN.S32 d28,q14,#8
    VRSHRN.S32 d30,q15,#8  @ Rounding shift right by 8 and narrow
                           @ Transform stage 2 is in U1:d24, U2:d26, U3:d28 and U4:d30
    VST1.64 d24,[r3],r4    @ Storing row 1 of transform stage 2
    VST1.64 d26,[r3],r4    @ Storing row 2 of transform stage 2
    VST1.64 d28,[r3],r4    @ Storing row 3 of transform stage 2
    VST1.64 d30,[r3]       @ Storing row 4 of transform stage 2

    vpop {d8 - d15}
    POP {r4}
    MOV pc,lr

@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and DCT integer forward transform
@*  on an 8x8 block
@*
@* @description
@*  Performs residue calculation by subtracting the prediction from the source,
@*  followed by the DCT integer forward transform
@*
@* @param[in] pu1_src
@*  Input 8x8 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 8x8
@*
@* @param[out] pi2_dst
@*  Output 8x8 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction stride
@*
@* @param[in] dst_strd
@*  Output stride
@*
@* @param[in] chr_plane
@*  Chroma plane
@*
@* @returns  void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
@                              UWORD8 *pu1_pred,
@                              WORD32 *pi4_temp,
@                              WORD16 *pi2_dst,
@                              WORD32 src_strd,
@                              WORD32 pred_strd,
@                              WORD32 dst_strd,
@                              WORD32 chroma_plane);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp]    - src_strd
@ [sp+4]  - pred_strd
@ [sp+8]  - dst_strd
@ [sp+12] -
chroma_plane 421@ 422@******************************************************************************* 423 424 .global ihevc_resi_trans_8x8_a9q 425 426ihevc_resi_trans_8x8_a9q: 427 428 PUSH {r4,r5} 429 vpush {d8 - d15} 430 431 @ Loading Prediction and Source blocks of size 8x8 432 433 LDR r4,[sp,#84] @ r4 = chroma flag 434 435 CMP r4,#-1 @ NULL PLANE 436 BEQ LUMA_LOAD 437 438 CMP r4,#1 @ V PLANE 439 BEQ CHROMA_V_LOAD 440 @ handling U PLANE 441 LDR r5,[sp,#72] @ r5 = src_strd 442 LDR r4,[sp,#76] @ r4 = pred_strd 443 444 VLD2.8 {d0,d2},[r1],r4 @ Row 1 of prediction in d0 445 VLD2.8 {d1,d3},[r0],r5 @ Row 1 of source in d1 446 447 VABDL.U8 q15,d1,d0 @ Row 1 of absolute difference in q15 448 VLD2.8 {d2,d4},[r1],r4 @ Row 2 of prediction in d2 449 VSUBL.U8 q0,d1,d0 @ Row 1 of residue in q0 450 VLD2.8 {d3,d5},[r0],r5 @ Row 2 of source in d3 451 452 VABDL.U8 q9,d3,d2 @ Row 2 of absolute difference in q9 453 VLD2.8 {d4,d6},[r1],r4 @ Row 3 of prediction in d4 454 VSUBL.U8 q1,d3,d2 @ Row 2 of residue in q1 455 VLD2.8 {d5,d7},[r0],r5 @ Row 3 of source in d5 456 457 VABAL.U8 q15,d5,d4 @ Row 3 of absolute difference accumulated in q15 458 VLD2.8 {d6,d8},[r1],r4 @ Row 4 of prediction in d6 459 VSUBL.U8 q2,d5,d4 @ Row 3 of residue in q2 460 VLD2.8 {d7,d9},[r0],r5 @ Row 4 of source in d7 461 462 VABAL.U8 q9,d7,d6 @ Row 4 of absolute difference accumulated in q9 463 VLD2.8 {d8,d10},[r1],r4 @ Row 5 of prediction in d8 464 VSUBL.U8 q3,d7,d6 @ Row 4 of residue in q3 465 VLD2.8 {d9,d11},[r0],r5 @ Row 5 of source in d9 466 467 VABDL.U8 q10,d9,d8 @ Row 5 of absolute difference in q10 468 VLD2.8 {d10,d12},[r1],r4 @ Row 6 of prediction in d10 469 VSUBL.U8 q4,d9,d8 @ Row 5 of residue in q4 470 VLD2.8 {d11,d13},[r0],r5 @ Row 6 of source in d11 471 472 VABAL.U8 q15,d11,d10 @ Row 6 of absolute difference accumulated in q15 473 VLD2.8 {d12,d14},[r1],r4 @ Row 7 of prediction in d12 474 VSUBL.U8 q5,d11,d10 @ Row 6 of residue in q5 475 VLD2.8 {d13,d15},[r0],r5 @ Row 7 of source in d13 476 477 VABAL.U8 q9,d13,d12 @ Row 7 of absolute difference accumulated in q9 478 VLD2.8 {d14,d16},[r1] @ Row 8 of prediction in d14 479 VSUBL.U8 q6,d13,d12 @ Row 7 of residue in q6 480 VLD2.8 {d15,d17},[r0] @ Row 8 of source in d15 481 482 B LUMA_LOAD_END 483 484CHROMA_V_LOAD: 485 LDR r5,[sp,#72] @ r5 = src_strd 486 LDR r4,[sp,#76] @ r4 = pred_strd 487 488 VLD2.8 {d0,d2},[r1],r4 @ Row 1 of prediction in d2 489 VLD2.8 {d1,d3},[r0],r5 @ Row 1 of source in d3 490 491 VABDL.U8 q15,d3,d2 @ Row 1 of absolute difference in q15 492 VLD2.8 {d4,d6},[r1],r4 @ Row 2 of prediction in d6 493 VSUBL.U8 q0,d3,d2 @ Row 1 of residue in q0 494 VLD2.8 {d5,d7},[r0],r5 @ Row 2 of source in d7 495 496 VABDL.U8 q9,d7,d6 @ Row 2 of absolute difference in q9 497 VLD2.8 {d8,d10},[r1],r4 @ Row 3 of prediction in d10 498 VSUBL.U8 q1,d7,d6 @ Row 2 of residue in q1 499 VLD2.8 {d9,d11},[r0],r5 @ Row 3 of source in d11 500 501 VABAL.U8 q15,d11,d10 @ Row 3 of absolute difference accumulated in q15 502 VLD2.8 {d6,d8},[r1],r4 @ Row 4 of prediction in d8 503 VSUBL.U8 q2,d11,d10 @ Row 3 of residue in q2 504 VLD2.8 {d7,d9},[r0],r5 @ Row 4 of source in d9 505 506 VABAL.U8 q9,d9,d8 @ Row 4 of absolute difference accumulated in q9 507 VLD2.8 {d10,d12},[r1],r4 @ Row 5 of prediction in d12 508 VSUBL.U8 q3,d9,d8 @ Row 4 of residue in q3 509 VLD2.8 {d11,d13},[r0],r5 @ Row 5 of source in d13 510 511 VABDL.U8 q10,d13,d12 @ Row 5 of absolute difference in q10 512 VLD2.8 {d14,d16},[r1],r4 @ Row 6 of prediction in d16 513 VSUBL.U8 q4,d13,d12 @ Row 5 of residue in q4 514 VLD2.8 {d15,d17},[r0],r5 
@ Row 6 of source in d17 515 516 VABAL.U8 q15,d17,d16 @ Row 6 of absolute difference accumulated in q15 517 VLD2.8 {d12,d14},[r1],r4 @ Row 7 of prediction in d12 518 VSUBL.U8 q5,d17,d16 @ Row 6 of residue in q5 519 VLD2.8 {d13,d15},[r0],r5 @ Row 7 of source in d13 520 521 VABAL.U8 q9,d15,d14 @ Row 7 of absolute difference accumulated in q9 522 VSUBL.U8 q6,d15,d14 @ Row 7 of residue in q6 523 524 VLD2.8 {d14,d16},[r1] @ Row 8 of prediction in d14 525 VLD2.8 {d15,d17},[r0] @ Row 8 of source in d15 526 VSWP.8 d14,d16 527 VSWP.8 d15,d17 528 529 B LUMA_LOAD_END 530 531LUMA_LOAD: 532 533 LDR r5,[sp,#72] @ r5 = src_strd 534 LDR r4,[sp,#76] @ r4 = pred_strd 535 536 VLD1.64 d0,[r1],r4 @ Row 1 of prediction in d0 537 VLD1.64 d1,[r0],r5 @ Row 1 of source in d1 538 539 VABDL.U8 q15,d1,d0 @ Row 1 of absolute difference in q15 540 VLD1.64 d2,[r1],r4 @ Row 2 of prediction in d2 541 VSUBL.U8 q0,d1,d0 @ Row 1 of residue in q0 542 VLD1.64 d3,[r0],r5 @ Row 2 of source in d3 543 544 VABDL.U8 q9,d3,d2 @ Row 2 of absolute difference in q9 545 VLD1.64 d4,[r1],r4 @ Row 3 of prediction in d4 546 VSUBL.U8 q1,d3,d2 @ Row 2 of residue in q1 547 VLD1.64 d5,[r0],r5 @ Row 3 of source in d5 548 549 VABAL.U8 q15,d5,d4 @ Row 3 of absolute difference accumulated in q15 550 VLD1.64 d6,[r1],r4 @ Row 4 of prediction in d6 551 VSUBL.U8 q2,d5,d4 @ Row 3 of residue in q2 552 VLD1.64 d7,[r0],r5 @ Row 4 of source in d7 553 554 VABAL.U8 q9,d7,d6 @ Row 4 of absolute difference accumulated in q9 555 VLD1.64 d8,[r1],r4 @ Row 5 of prediction in d8 556 VSUBL.U8 q3,d7,d6 @ Row 4 of residue in q3 557 VLD1.64 d9,[r0],r5 @ Row 5 of source in d9 558 559 VABDL.U8 q10,d9,d8 @ Row 5 of absolute difference in q10 560 VLD1.64 d10,[r1],r4 @ Row 6 of prediction in d10 561 VSUBL.U8 q4,d9,d8 @ Row 5 of residue in q4 562 VLD1.64 d11,[r0],r5 @ Row 6 of source in d11 563 564 VABAL.U8 q15,d11,d10 @ Row 6 of absolute difference accumulated in q15 565 VLD1.64 d12,[r1],r4 @ Row 7 of prediction in d12 566 VSUBL.U8 q5,d11,d10 @ Row 6 of residue in q5 567 VLD1.64 d13,[r0],r5 @ Row 7 of source in d13 568 569 VABAL.U8 q9,d13,d12 @ Row 7 of absolute difference accumulated in q9 570 VLD1.64 d14,[r1] @ Row 8 of prediction in d14 571 VSUBL.U8 q6,d13,d12 @ Row 7 of residue in q6 572 VLD1.64 d15,[r0] @ Row 8 of source in d15 573 574LUMA_LOAD_END: 575 576 @ Transform stage 1 577 @ Transposing residue matrix 578 579 VABAL.U8 q10,d15,d14 @ Row 8 of absolute difference accumulated in q10 580 VTRN.16 q0,q1 @ Transpose residue matrix step (1a) 581 VSUBL.U8 q7,d15,d14 @ Row 8 of residue in q7 582 VTRN.16 q2,q3 @ Transpose residue matrix step (1b) 583 584 VTRN.16 q4,q5 @ Transpose residue matrix step (1c) 585 VTRN.16 q6,q7 @ Transpose residue matrix step (1d) 586 VTRN.32 q0,q2 @ Transpose residue matrix step (2a) 587 VTRN.32 q1,q3 @ Transpose residue matrix step (2b) 588 589 VADD.U16 q8,q15,q9 @ SAD calculation (1) 590 VTRN.32 q4,q6 @ Transpose residue matrix step (2c) 591 VTRN.32 q5,q7 @ Transpose residue matrix step (2d) 592 593 VADD.U16 q8,q8,q10 @ SAD calculation (2) 594 VSWP d1,d8 @ Transpose residue matrix step (3a) 595 VSWP d3,d10 @ Transpose residue matrix step (3b) 596 597 VADD.U16 d16,d16,d17 @ SAD calculation (3) 598 VSWP d7,d14 @ Transpose residue matrix step (3c) 599 VSWP d5,d12 @ Transpose residue matrix step (3d) 600 @ Columns of residue C0-C7 (8x8 matrix) in q0-q7 601 VPADDL.U16 d16,d16 @ SAD calculation (4) 602 603 @ Evaluating first step in Butterfly diagram 604 605 VADD.S16 q10,q0,q7 @ q10 = C0 + C7 606 VADD.S16 q11,q1,q6 @ q11 = C1 + C6 607 VPADDL.U32 
d16,d16 @ SAD calculation (5) 608 VADD.S16 q12,q2,q5 @ q12 = C2 + C5 609 VADD.S16 q13,q3,q4 @ q13 = C3 + C4 610 611 VSUB.S16 q4,q3,q4 @ q4 = C3 - C4 612 VSUB.S16 q5,q2,q5 @ q5 = C2 - C5 613 VSUB.S16 q6,q1,q6 @ q6 = C1 - C6 614 VSUB.S16 q7,q0,q7 @ q7 = C0 - C7 615 616 @ Calculating F0, F2, F4 and F6 617 618 VADD.S16 q1,q11,q12 @ q1 = C1 + C2 + C5 + C6 619 VADD.S16 q2,q10,q13 @ q2 = C0 + C3 + C4 + C7 620 621 MOV r4,#50 622 LSL r4,r4,#16 623 ADD r4,r4,#18 624 MOV r5,#89 625 LSL r5,r5,#16 626 ADD r5,r5,#75 627 VMOV d0,r4,r5 @ 16-bit aligned, d0[3] = 89, d0[2] = 75, d0[1] = 50, d0[0]=18 628 629 MOV r4,#83 630 LSL r4,r4,#16 631 ADD r4,r4,#36 632 VMOV d1,r4,r4 @ 16-bit aligned, d1[3] = 83, d1[2] = 36, d1[1] = 83, d1[0]=36 633 634 VSUB.S16 q10,q10,q13 @ q10 = C0 - C3 - C4 + C7 635 VSUB.S16 q11,q11,q12 @ q11 = C1 - C2 - C5 + C6 636 VMOV.32 r0,d16[0] @ SAD calculation (6) : Return value = SAD 637 638 VSUB.S16 q3,q2,q1 @ q3 = C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7 639 VADD.S16 q2,q2,q1 @ q2 = C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7 640 641 VMULL.S16 q14,d20,d1[1] @ q14 = [0] of 83*(C0 - C3 - C4 + C7) 642 VMULL.S16 q15,d21,d1[1] @ q15 = [1] of 83*(C0 - C3 - C4 + C7) 643 VMULL.S16 q9,d20,d1[0] @ q9 = [0] of 36*(C0 - C3 - C4 + C7) 644 VMULL.S16 q10,d21,d1[0] @ q10 = [1] of 36*(C0 - C3 - C4 + C7) 645 646 VMLAL.S16 q14,d22,d1[0] @ q14 = F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6) 647 VSHLL.S16 q13,d6,#6 @ q13 = F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7) 648 VMLAL.S16 q15,d23,d1[0] @ q15 = F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6) 649 VSHLL.S16 q3,d7,#6 @ q3 = F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7) 650 VMLSL.S16 q9,d22,d1[1] @ q9 = F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6) 651 VSHLL.S16 q12,d4,#6 @ q12 = F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7) 652 VMLSL.S16 q10,d23,d1[1] @ q10 = F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6) 653 VSHLL.S16 q2,d5,#6 @ q2 = F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7) 654 655 @ Calculating F1, F3, F5 and F7 656 657 MOV r4,#48 658 VST1.64 {d24,d25},[r2]! @ Row 1 of transform stage 1 F0[0] stored 659 VST1.64 {d4,d5},[r2],r4 @ Row 1 of transform stage 1 F0[1] stored 660 VST1.64 {d28,d29},[r2]! @ Row 3 of transform stage 1 F2[0] stored 661 VST1.64 {d30,d31},[r2],r4 @ Row 3 of transform stage 1 F2[1] stored 662 663 VST1.64 {d26,d27},[r2]! @ Row 5 of transform stage 1 F4[0] stored 664 VMULL.S16 q1,d14,d0[3] @ q1 = [0] of 89*(C0 - C7) 665 VMULL.S16 q8,d15,d0[3] @ q8 = [1] of 89*(C0 - C7) 666 VST1.64 {d6,d7},[r2],r4 @ Row 5 of transform stage 1 F4[1] stored 667 VMULL.S16 q11,d14,d0[2] @ q11 = [0] of 75*(C0 - C7) 668 VMULL.S16 q13,d15,d0[2] @ q13 = [1] of 75*(C0 - C7) 669 VST1.64 {d18,d19},[r2]! 
@ Row 7 of transform stage 1 F6[0] stored 670 VMULL.S16 q3,d14,d0[1] @ q3 = [0] of 50*(C0 - C7) 671 VMULL.S16 q9,d15,d0[1] @ q9 = [1] of 50*(C0 - C7) 672 VST1.64 {d20,d21},[r2] @ Row 7 of transform stage 1 F6[1] stored 673 VMULL.S16 q10,d14,d0[0] @ q10 = [0] of 18*(C0 - C7) 674 VMULL.S16 q7,d15,d0[0] @ q7 = [1] of 18*(C0 - C7) 675 676 VMLAL.S16 q1,d12,d0[2] @ q1 = [0] of 89*(C0 - C7) + 75*(C1 - C6) 677 VMLAL.S16 q8,d13,d0[2] @ q8 = [1] of 89*(C0 - C7) + 75*(C1 - C6) 678 VMLSL.S16 q11,d12,d0[0] @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) 679 VMLSL.S16 q13,d13,d0[0] @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) 680 VMLSL.S16 q3,d12,d0[3] @ q3 = [0] of 50*(C0 - C7) - 89*(C1 - C6) 681 VMLSL.S16 q9,d13,d0[3] @ q9 = [1] of 50*(C0 - C7) - 89*(C1 - C6) 682 VMLSL.S16 q10,d12,d0[1] @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) 683 VMLSL.S16 q7,d13,d0[1] @ q7 = [1] of 18*(C0 - C7) - 50*(C1 - C6) 684 685 VMLAL.S16 q1,d10,d0[1] @ q1 = [0] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) 686 VMLAL.S16 q8,d11,d0[1] @ q8 = [1] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) 687 VMLSL.S16 q11,d10,d0[3] @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) 688 VMLSL.S16 q13,d11,d0[3] @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) 689 VMLAL.S16 q3,d10,d0[0] @ q3 = [0] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) 690 VMLAL.S16 q9,d11,d0[0] @ q9 = [1] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) 691 VMLAL.S16 q10,d10,d0[2] @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) 692 VMLAL.S16 q7,d11,d0[2] @ q7 = [1] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) 693 694 VMLAL.S16 q1,d8,d0[0] @ q1 = F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4) 695 VMLAL.S16 q8,d9,d0[0] @ q8 = F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4) 696 VMLSL.S16 q11,d8,d0[1] @ q11 = F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4) 697 VMLSL.S16 q13,d9,d0[1] @ q13 = F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4) 698 SUB r2,r2,#176 @ r2 now points to the second row 699 VMLAL.S16 q3,d8,d0[2] @ q3 = F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4) 700 VMLAL.S16 q9,d9,d0[2] @ q9 = F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4) 701 VST1.64 {d2,d3},[r2]! @ Row 2 of transform stage 1 F1[0] stored 702 VMLSL.S16 q10,d8,d0[3] @ q10 = F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4) 703 VMLSL.S16 q7,d9,d0[3] @ q7 = F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4) 704 705 VST1.64 {d16,d17},[r2],r4 @ Row 2 of transform stage 1 F1[1] stored 706 VST1.64 {d22,d23},[r2]! @ Row 4 of transform stage 1 F3[0] stored 707 VST1.64 {d26,d27},[r2],r4 @ Row 4 of transform stage 1 F3[1] stored 708 VST1.64 {d6,d7},[r2]! @ Row 6 of transform stage 1 F5[0] stored 709 VST1.64 {d18,d19},[r2],r4 @ Row 6 of transform stage 1 F5[1] stored 710 VST1.64 {d20,d21},[r2]! 
@ Row 8 of transform stage 1 F7[0] stored 711 VST1.64 {d14,d15},[r2] @ Row 8 of transform stage 1 F7[1] stored 712 713 @ Transform stage 2 (for rows 1-4 of transform stage 1) 714 @ Transposing the 4 rows (F0, F1, F2, F3) 715 @ F0 = {q2,q12}, F1 = {q8,q1}, F2 = {q15,q14} and F3 = {q13,q11} 716 717 VTRN.32 q12,q1 @ Transposing first half of transform stage 1 (1a) 718 VTRN.32 q14,q11 @ Transposing first half of transform stage 1 (1b) 719 VSWP d25,d28 @ Transposing first half of transform stage 1 (2a) 720 VSWP d22,d3 @ Transposing first half of transform stage 1 (2b) 721 722 VTRN.32 q2,q8 @ Transposing first half of transform stage 1 (3a) 723 VTRN.32 q15,q13 @ Transposing first half of transform stage 1 (3b) 724 VSWP d5,d30 @ Transposing first half of transform stage 1 (4a) 725 VSWP d26,d17 @ Transposing first half of transform stage 1 (4b) 726 @ B0:q12, B1:q1, B2:q14, B3:q11, B4:q2, B5:q8, B6:q15 and B7:q13 727 728 @ Evaluating first step in Butterfly diagram 729 730 VADD.S32 q0,q12,q13 @ q0 = B0 + B7 731 VADD.S32 q5,q11,q2 @ q5 = B3 + B4 732 VADD.S32 q3,q1,q15 @ q3 = B1 + B6 733 VADD.S32 q4,q14,q8 @ q4 = B2 + B5 734 735 VSUB.S32 q7,q14,q8 @ q7 = B2 - B5 736 VSUB.S32 q8,q1,q15 @ q8 = B1 - B6 737 VSUB.S32 q6,q11,q2 @ q6 = B3 - B4 738 VSUB.S32 q9,q12,q13 @ q9 = B0 - B7 739 740 @ Calculating G0, G2, G4 and G6 741 742 MOV r4,#18 743 MOV r5,#50 744 VMOV d2,r4,r5 @ 32-bit aligned, d2[1] = 50, d2[0] = 18 745 VSUB.S32 q2,q0,q5 @ q2 = B0 - B3 - B4 + B7 746 747 MOV r4,#75 748 MOV r5,#89 749 VMOV d3,r4,r5 @ 32-bit aligned, d3[1] = 89, d3[0] = 75 750 VADD.S32 q10,q0,q5 @ q10 = B0 + B3 + B4 + B7 751 752 MOV r4,#36 753 MOV r5,#83 754 VMOV d0,r4,r5 @ 32-bit aligned, d0[1] = 83, d0[0] = 36 755 VSUB.S32 q11,q3,q4 @ q11 = B1 - B2 - B5 + B6 756 VADD.S32 q3,q3,q4 @ q3 = B1 + B2 + B5 + B6 757 758 VMUL.S32 q12,q2,d0[1] @ q12 = 83*(B0 - B3 - B4 + B7) 759 VMUL.S32 q2,q2,d0[0] @ q2 = 36*(B0 - B3 - B4 + B7) 760 VMUL.S32 q5,q9,d3[1] @ q5 = 89*(B0 - B7) 761 VADD.S32 q14,q10,q3 @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7 762 VMUL.S32 q4,q9,d3[0] @ q4 = 75*(B0 - B7) 763 VSUB.S32 q15,q10,q3 @ q15 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7 764@ VSHL.S32 q14,q14,#6 ; q14 = G0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7) 765@ VSHL.S32 q15,q15,#6 ; q15 = G4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7) 766 767 VMLA.S32 q12,q11,d0[0] @ q12 = G2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6) 768 VRSHRN.I32 d28,q14,#5 @ Truncating last 11 bits in G0 769 VMLS.S32 q2,q11,d0[1] @ q2 = G6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6) 770 VRSHRN.I32 d30,q15,#5 @ Truncating last 11 bits in G4 771 772 LDR r4,[sp,#80] @ r4 = dst_strd 773 LSL r4,r4,#2 @ r4 = 2*dst_strd*2 774 775 VMUL.S32 q3,q9,d2[1] @ q3 = 50*(B0 - B7) 776 VRSHRN.I32 d24,q12,#11 @ Truncating last 11 bits in G2 777 VMUL.S32 q9,q9,d2[0] @ q9 = 18*(B0 - B7) 778 VRSHRN.I32 d4,q2,#11 @ Truncating last 11 bits in G6 779 780 VMLA.S32 q5,q8,d3[0] @ q5 = 89*(B0 - B7) + 75*(B1 - B6) 781 VST1.64 d28,[r3],r4 @ First half-row of row 1 of transform stage 2 (G0) stored 782 VMLS.S32 q4,q8,d2[0] @ q4 = 75*(B0 - B7) - 18*(B1 - B6) 783 784 VMLS.S32 q3,q8,d3[1] @ q3 = 50*(B0 - B7) - 89*(B1 - B6) 785 VST1.64 d24,[r3],r4 @ First half-row of row 3 of transform stage 2 (G2) stored 786 VMLS.S32 q9,q8,d2[1] @ q9 = 18*(B0 - B7) - 50*(B1 - B6) 787 788 VMLA.S32 q5,q7,d2[1] @ q5 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) 789 VST1.64 d30,[r3],r4 @ First half-row of row 5 of transform stage 2 (G4) stored 790 VMLS.S32 q4,q7,d3[1] @ q4 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) 791 792 VMLA.S32 
q3,q7,d2[0] @ q3 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) 793 VST1.64 d4,[r3] @ First half-row of row 7 of transform stage 2 (G6) stored 794 VMLA.S32 q9,q7,d3[0] @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) 795 796 VMLA.S32 q5,q6,d2[0] @ q5 = G1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4) 797 VMLS.S32 q4,q6,d2[1] @ q4 = G3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4) 798 VMLA.S32 q3,q6,d3[0] @ q3 = G5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4) 799 VMLS.S32 q9,q6,d3[1] @ q9 = G7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4) 800 801 SUB r3,r3,r4,LSL #1 802 SUB r3,r3,r4,ASR #1 @ r3 = r3 - 5*dst_strd*2 803 @ r3 is moved from row 7 to row 2 804 VRSHRN.I32 d10,q5,#11 @ Truncating last 11 bits in G1 805 VRSHRN.I32 d8,q4,#11 @ Truncating last 11 bits in G3 806 VRSHRN.I32 d6,q3,#11 @ Truncating last 11 bits in G5 807 VST1.64 d10,[r3],r4 @ First half-row of row 2 of transform stage 2 (G1) stored 808 VRSHRN.I32 d18,q9,#11 @ Truncating last 11 bits in G7 809 810 VST1.64 d8,[r3],r4 @ First half-row of row 4 of transform stage 2 (G3) stored 811 VST1.64 d6,[r3],r4 @ First half-row of row 6 of transform stage 2 (G5) stored 812 VST1.64 d18,[r3]! @ First half-row of row 8 of transform stage 2 (G7) stored 813 814 @ Transform stage 2 (for rows 5-8 of transform stage 1) 815 @ Loading the 4 rows (F4, F5, F6, F7) 816 817 SUB r2,r2,#112 @ r2 jumps from row 8 to row 5 in temporary memory 818 VLD1.64 {d20,d21},[r2]! @ q10 = F4[0] 819 VLD1.64 {d22,d23},[r2]! @ q11 = F4[1] 820 VLD1.64 {d8,d9},[r2]! @ q4 = F5[0] 821 @ Transposing the 4 rows 822 @ F0 = {q11,q10}, F1 = {q5,q4}, F2 = {q3,q2} and F3 = {q13,q12} 823 824 VTRN.32 q10,q4 @ Transposing second half of transform stage 1 (1a) 825 VLD1.64 {d10,d11},[r2]! @ q5 = F5[1] 826 VLD1.64 {d4,d5},[r2]! @ q2 = F6[0] 827 VLD1.64 {d6,d7},[r2]! @ q3 = F6[1] 828 VLD1.64 {d24,d25},[r2]! 
@ q12 = F7[0] 829 VTRN.32 q2,q12 @ Transposing second half of transform stage 1 (1b) 830 VLD1.64 {d26,d27},[r2] @ q13 = F7[1] 831 832 VSWP d21,d4 @ Transposing second half of transform stage 1 (2a) 833 VSWP d24,d9 @ Transposing second half of transform stage 1 (2b) 834 835 VTRN.32 q11,q5 @ Transposing second half of transform stage 1 (3a) 836 VTRN.32 q3,q13 @ Transposing second half of transform stage 1 (3b) 837 VSWP d26,d11 @ Transposing second half of transform stage 1 (4b) 838 VSWP d23,d6 @ Transposing second half of transform stage 1 (4a) 839 @ B0:q10, B1:q4, B2:q2, B3:q12, B4:q11, B5:q5, B6:q3 and B7:q13 840 841 @ Evaluating first step in Butterfly diagram 842 843 VADD.S32 q0,q10,q13 @ q0 = B0 + B7 844 VADD.S32 q15,q12,q11 @ q15 = B3 + B4 845 VADD.S32 q1,q4,q3 @ q1 = B1 + B6 846 VADD.S32 q14,q2,q5 @ q14 = B2 + B5 847 848 VSUB.S32 q9,q10,q13 @ q9 = B0 - B7 849 VSUB.S32 q6,q12,q11 @ q6 = B3 - B4 850 VSUB.S32 q7,q2,q5 @ q7 = B2 - B5 851 VSUB.S32 q8,q4,q3 @ q8 = B1 - B6 852 853 @ Calculating H0, H2, H4 and H6 854 855 VADD.S32 q3,q1,q14 @ q3 = B1 + B2 + B5 + B6 856 VSUB.S32 q5,q1,q14 @ q5 = B1 - B2 - B5 + B6 857 858 MOV r4,#18 859 MOV r5,#50 860 VSUB.S32 q4,q0,q15 @ q4 = B0 - B3 - B4 + B7 861 VMOV d2,r4,r5 @ 32-bit aligned, d2[1] = 50, d2[0] = 18 862 863 MOV r4,#75 864 MOV r5,#89 865 VADD.S32 q2,q0,q15 @ q2 = B0 + B3 + B4 + B7 866 VMOV d3,r4,r5 @ 32-bit aligned, d3[1] = 89, d3[0] = 75 867 868 MOV r4,#36 869 MOV r5,#83 870 871 @ Calculating H1, H3, H5 and H7 872 873 VMUL.S32 q10,q9,d3[1] @ q10 = 89*(B0 - B7) 874 VMOV d0,r4,r5 @ 32-bit aligned, d0[1] = 83, d0[0] = 36 875 876 VMUL.S32 q13,q9,d3[0] @ q13 = 75*(B0 - B7) 877 878 VMUL.S32 q12,q4,d0[1] @ q12 = 83*(B0 - B3 - B4 + B7) 879 VADD.S32 q14,q2,q3 @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7 880 VMUL.S32 q4,q4,d0[0] @ q4 = 36*(B0 - B3 - B4 + B7) 881 VSUB.S32 q2,q2,q3 @ q2 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7 882 883 884 VMLA.S32 q12,q5,d0[0] @ q12 = H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6) 885@ VSHL.S32 q14,q14,#6 ; q14 = H0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7) 886 VMLS.S32 q4,q5,d0[1] @ q4 = H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6) 887@ VSHL.S32 q2,q15,#6 ; q2 = H4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7) 888 889 VMUL.S32 q11,q9,d2[1] @ q11 = 50*(B0 - B7) 890 VRSHRN.I32 d28,q14,#5 @ Truncating last 11 bits in H0 891 VMUL.S32 q9,q9,d2[0] @ q9 = 18*(B0 - B7) 892 VRSHRN.I32 d24,q12,#11 @ Truncating last 11 bits in H2 893 894 VMLA.S32 q10,q8,d3[0] @ q10 = 89*(B0 - B7) + 75*(B1 - B6) 895 VRSHRN.I32 d4,q2,#5 @ Truncating last 11 bits in H4 896 VMLS.S32 q13,q8,d2[0] @ q13 = 75*(B0 - B7) - 18*(B1 - B6) 897 VRSHRN.I32 d8,q4,#11 @ Truncating last 11 bits in H6 898 899 LDR r4,[sp,#80] @ r4 = dst_strd 900 LSL r4,r4,#2 @ r4 = 2*dst_strd*2 901 902 SUB r3,r3,r4,LSL #2 903 ADD r3,r3,r4,ASR #1 @ r3 = r3 - 7*dst_strd*2 904 @ r3 is moved from row 8 to row 1 905 VMLS.S32 q11,q8,d3[1] @ q11 = 50*(B0 - B7) - 89*(B1 - B6) 906 VST1.64 d28,[r3],r4 @ Second half-row of row 1 of transform stage 2 (H0) stored 907 VMLS.S32 q9,q8,d2[1] @ q9 = 18*(B0 - B7) - 50*(B1 - B6) 908 909 VMLA.S32 q10,q7,d2[1] @ q10 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) 910 VST1.64 d24,[r3],r4 @ Second half-row of row 3 of transform stage 2 (H2) stored 911 VMLS.S32 q13,q7,d3[1] @ q13 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) 912 913 VMLA.S32 q11,q7,d2[0] @ q11 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) 914 VST1.64 d4,[r3],r4 @ Second half-row of row 5 of transform stage 2 (H4) stored 915 VMLA.S32 q9,q7,d3[0] @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 
75*(B2 - B5) 916 917 VMLA.S32 q10,q6,d2[0] @ q10 = H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4) 918 VST1.64 d8,[r3] @ Second half-row of row 7 of transform stage 2 (H6) stored 919 VMLS.S32 q13,q6,d2[1] @ q13 = H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4) 920 921 VMLA.S32 q11,q6,d3[0] @ q11 = H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4) 922 VMLS.S32 q9,q6,d3[1] @ q9 = H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4) 923 924 SUB r3,r3,r4,LSL #1 925 SUB r3,r3,r4,ASR #1 @ r3 = r3 - 5*dst_strd 926 @ r3 is moved from row 7 to row 2 927 VRSHRN.I32 d20,q10,#11 @ Truncating last 11 bits in H1 928 VRSHRN.I32 d26,q13,#11 @ Truncating last 11 bits in H3 929 VRSHRN.I32 d22,q11,#11 @ Truncating last 11 bits in H5 930 VST1.64 d20,[r3],r4 @ Second half-row of row 2 of transform stage 2 (H1) stored 931 VRSHRN.I32 d18,q9,#11 @ Truncating last 11 bits in H7 932 933 VST1.64 d26,[r3],r4 @ Second half-row of row 4 of transform stage 2 (H3) stored 934 VST1.64 d22,[r3],r4 @ Second half-row of row 6 of transform stage 2 (H5) stored 935 VST1.64 d18,[r3] @ Second half-row of row 8 of transform stage 2 (H7) stored 936 937 vpop {d8 - d15} 938 POP {r4,r5} 939 MOV pc,lr 940 941@/** 942@*/ ******************************************************************************* 943@*/ 944@*/@brief 945@*/ This function performs residue calculation and forward transform on 946@*/ input pixels 947@*/ 948@*/@par Description: 949@*/ Performs residue calculation by subtracting source and prediction and 950@*/ followed by forward transform 951@*/ 952@*/ @param[in] pu1_src 953@*/ Input 16x16 pixels 954@*/ 955@*/ @param[in] pu1_pred 956@*/ Prediction data 957@*/ 958@*/ @param[in] pi2_tmp 959@*/ Temporary buffer of size 16x16 960@*/ 961@*/ @param[out] pi2_dst 962@*/ Output 16x16 coefficients 963@*/ 964@*/ @param[in] src_strd 965@*/ Input stride 966@*/ 967@*/ @param[in] pred_strd 968@*/ Prediction Stride 969@*/ 970@*/ @param[in] dst_strd 971@*/ Output Stride 972@*/ 973@*/ @param[in] chr_plane 974@*/ Chroma plane 975@*/ 976@*/ @returns Void 977@*/ 978@*/ @remarks 979@*/ None 980@*/ 981@*/******************************************************************************* 982@*/ 983 984.extern g_ai2_ihevc_trans_16 985.extern g_ai4_ihevc_trans_16 986 987g_ai2_ihevc_trans_16_addr_1: 988.long g_ai2_ihevc_trans_16 - ulbl1 - 8 989 990g_ai2_ihevc_trans_16_addr_2: 991.long g_ai2_ihevc_trans_16 - ulbl2 - 8 992 993g_ai4_ihevc_trans_16_addr: 994.long g_ai4_ihevc_trans_16 - ulbl3 - 8 995 996 .global ihevc_resi_trans_16x16_a9q 997 998ihevc_resi_trans_16x16_a9q: 999 1000.equ TMP_STRIDE , 64 @16*4, Stride of tmp register 1001.equ SHIFT , 13 @shift = 13; // log2(iWidth) - 1 + g_uiBitIncrement 1002.equ RADD , 4096 @1 << (shift - 1); 1003 1004.equ COFF_STD_2B , 32 @Stride for g_ai2_ihevc_trans_16 in bytes 1005.equ COFF_STD_W , 32 @Stride for g_ai4_ihevc_trans_16 in bytes 1006 1007@;LOAD the fucntion 1008 STMFD SP!,{r4-r12,LR} @stack store values of the arguments 1009 vpush {d8 - d15} 1010 SUB SP,SP,#32 1011 1012 LDR R4,[SP,#136] @get src_strd 1013 LDR R5,[SP,#140] @get pred_strd 1014 LDR R6,[SP,#144] @get dst_strd 1015 LDR R14,[SP,#148] @get chroma_plane 1016 1017 MOV R8,#0 @Set loop counter 1018 LDR R9,g_ai2_ihevc_trans_16_addr_1 @get 16 bit transform matrix 1019ulbl1: 1020 ADD R9, R9, PC 1021 @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] values of g_ai2_ihevc_trans_16 1022 @and write to stack 1023 MOV R12,#COFF_STD_2B 1024 LSL R12,#2 1025 1026 VLD1.S32 D30[0],[R9],R12 1027 VLD1.S32 
D30[1],[R9],R12 1028 VLD1.S32 D31[0],[R9],R12 1029 VLD1.S32 D31[1],[R9],R12 1030 1031 VTRN.S32 D30,D31 1032 VTRN.S16 D30,D31 1033 VST1.S16 {d30,d31},[SP] 1034 1035 LDR R9,g_ai2_ihevc_trans_16_addr_2 @get back 16 bit transform matrix 1036ulbl2: 1037 ADD R9, R9, PC 1038 1039 MOV R7,#TMP_STRIDE 1040 1041 VMOV.S32 Q14,#0 1042 1043@R0 pu1_src 1044@R1 pu1_pred 1045@R2 pi4_tmp 1046@R3 pi2_dst 1047@R4 src_strd 1048@R5 pred_strd 1049@R6 dst_strd 1050@R7 tmp_dst Nx4 block stride 1051@R8 loop cntr 1052@R9 g_ai2_ihevc_trans_16 1053@R10 tmp_dst Nx4 block offset 1054@R11 tmp register 1055@R12 ------ 1056@R14 chroma_plane 1057@q14 shift 32 bit 1058@q15 add 32 bit 1059 1060CORE_LOOP_16X16_HORIZ: 1061 1062 CMP R14,#-1 1063 BGT INTERLEAVED_LOAD_S1 1064 1065 VLD1.U8 {d0,d1},[R0],R4 @LOAD 1-16 src row 1 1066 VLD1.U8 {d2,d3},[R1],R5 @LOAD 1-16 pred row 1 1067 VLD1.U8 {d4,d5},[R0],R4 @LOAD 1-16 src row 2 1068 VLD1.U8 {d6,d7},[R1],R5 @LOAD 1-16 pred row 2 1069 B LOAD_DONE 1070 1071INTERLEAVED_LOAD_S1: 1072 CMP R14,#1 1073 BEQ INTERLEAVED_LOAD_S2 1074 VLD2.U8 {Q0,Q1},[R0],R4 @LOAD 1-16 src row 1 1075 VLD2.U8 {Q1,Q2},[R1],R5 @LOAD 1-16 pred row 1 1076 VLD2.U8 {Q2,Q3},[R0],R4 @LOAD 1-16 src row 2 1077 VLD2.U8 {Q3,Q4},[R1],R5 @LOAD 1-16 pred row 2 1078 B LOAD_DONE 1079 1080INTERLEAVED_LOAD_S2: 1081 VLD2.U8 {Q0,Q1},[R0],R4 @LOAD 1-16 src row 1 1082 VSWP.U8 Q0,Q1 1083 VLD2.U8 {Q1,Q2},[R1],R5 @LOAD 1-16 pred row 1 1084 VSWP.U8 Q1,Q2 1085 VLD2.U8 {Q2,Q3},[R0],R4 @LOAD 1-16 src row 2 1086 VSWP.U8 Q2,Q3 1087 VLD2.U8 {Q3,Q4},[R1],R5 @LOAD 1-16 pred row 2 1088 VSWP.U8 Q3,Q4 1089 1090LOAD_DONE: 1091 1092 VSUBL.U8 Q4,D0,D2 @Get residue 1-8 row 1 1093 VSUBL.U8 Q5,D1,D3 @Get residue 9-16 row 1 1094 VSUBL.U8 Q6,D4,D6 @Get residue 1-8 row 2 1095 VSUBL.U8 Q7,D5,D7 @Get residue 9-16 row 2 1096 1097 @Get blk sads 1098 VABDL.U8 Q15,D0,D2 1099 VABAL.U8 Q15,D1,D3 1100 VABAL.U8 Q15,D4,D6 1101 VABAL.U8 Q15,D5,D7 1102 VADDW.S16 Q14,Q14,D30 1103 VADDW.S16 Q14,Q14,D31 1104 1105 VREV64.S16 Q5,Q5 @Rev row 1 1106 VREV64.S16 Q7,Q7 @Rev row 2 1107 VSWP D10,D11 1108 VSWP D14,D15 1109 1110 VADD.S16 Q8 ,Q4,Q5 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-8 row 1 1111 VSUB.S16 Q9 ,Q4,Q5 @o[k] = resi_tmp_1 - resi_tmp_2 k ->9-16 row 1 1112 VADD.S16 Q10,Q6,Q7 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-8 row 2 1113 VSUB.S16 Q11,Q6,Q7 @o[k] = resi_tmp_1 - resi_tmp_2 k ->9-16 row 2 1114 1115 VREV64.S16 D24,D17 @rev e[k] k-> 4-7 row 1 1116 VREV64.S16 D25,D21 @rev e[k] k-> 4-7 row 2 1117 VMOV.S16 D17,D20 1118 1119 @arrangement OF DATA 1120 @Q8 A1 A2 A3 A4 B1 B2 B3 B4 1121 @Q12 A8 A7 A6 A5 B8 B7 B6 B5 1122 1123 VADD.S16 Q13,Q8,Q12 @ee[k] = e[k] + e[7 - k] row 1 & 2 1124 VSUB.S16 Q0,Q8,Q12 @eo[k] = e[k] - e[7 - k] row 1 & 2 1125 1126 @D26 R1ee[0] R1ee[1] R1ee[2] R1ee[3] 1127 @D27 R2ee[0] R2ee[1] R2ee[2] R2ee[3] 1128 VTRN.S32 D26,D27 @1-cycle stall before it? 1129 @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1] 1130 @D27 R1ee[2] R1ee[3] R2ee[2] R2ee[3] 1131 VREV32.16 D2,D27 @1-cycle stall before it? 1132 @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1] 1133 @D2 R1ee[3] R1ee[2] R2ee[3] R2ee[2] 1134 VMOV.S16 D27,D26 1135 VNEG.S16 D3,D2 1136 @Q13 R1ee[0] R1ee[1] R2ee[0] R2ee[1] R1ee[0] R1ee[1] R2ee[0] R2ee[1] 1137 @Q1 R1ee[3] R1ee[2] R2ee[3] R2ee[2] -R1ee[3] -R1ee[2] -R2ee[3] -R2ee[2] 1138 1139 @D8 : [0 0] [4 0] [8 0] [12 0] 1140 @D9 : [0 1] [4 1] [8 1] [12 1] 1141 VLD1.S16 {d8,d9},[SP] @[0 0] [4 0] [8 0] [12 0] [0 1] [4 1] [8 1] [12 1] 1142 VADD.S16 Q1,Q13,Q1 @ 1-cycle stall before it? 
1143 @Q15 R1eee[0] R1eee[1] R2eee[0] R2eee[1] R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1] 1144 1145 @Q1 R1eee[0] R1eee[1] R2eee[0] R2eee[1] 1146 @ R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1] 1147 VTRN.S16 D2,D3 @2-cycle stall before it? 1148 @Q1 R1eee[0] R1eeo[0] R2eee[0] R2eeo[0] 1149 @ R1eee[1] R1eeo[1] R2eee[1] R2eeo[1] 1150 1151 VDUP.S32 D4,D2[0] @R1eee[0] R1eeo[0] R1eee[0] R1eeo[0] ;1-cycle stall? 1152 VDUP.S32 D5,D2[1] @R2eee[0] R2eeo[0] R2eee[0] R2eeo[0] 1153 VDUP.S32 D6,D3[0] @R1eee[1] R1eeo[1] R1eee[1] R1eeo[1] 1154 VDUP.S32 D7,D3[1] @R2eee[1] R2eeo[1] R2eee[1] R2eeo[1] 1155 1156 @---------------Process EO-------------------- 1157 @ Early start to avoid stalls 1158 MOV R12,#COFF_STD_2B @Get stride of coeffs 1159 1160 VMULL.S16 Q5,D4,D8 @ g_ai2_ihevc_trans_16 * R1eee[0] R1eeo[0] R1eee[0] R1eeo[0] 1161 VMLAL.S16 Q5,D6,D9 @ + g_ai2_ihevc_trans_16 * R1eee[1] R1eeo[1] R1eee[1] R1eeo[1] 1162 VMULL.S16 Q6,D5,D8 @ g_ai2_ihevc_trans_16 * R2eee[0] R2eeo[0] R2eee[0] R2eeo[0] 1163 VMLAL.S16 Q6,D7,D9 @ + g_ai2_ihevc_trans_16 * R2eee[1] R2eeo[1] R2eee[1] R2eeo[1] 1164 1165 ADD R11,R9,R12,LSL #1 @Load address of g_ai2_ihevc_trans_16[2] 1166 LSL R12,R12,#2 1167 1168 VLD1.S16 D26,[R11],R12 @LOAD g_ai2_ihevc_trans_16[2][0-4]] 1169 1170 VLD1.S16 D27,[R11],R12 @LOAD g_ai2_ihevc_trans_16[6][0-4] 1171 VMULL.S16 Q1,D26,D0 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] R1 1172 1173 VMULL.S16 Q2,D26,D1 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] R2 1174 1175 VZIP.S32 Q5,Q6 @3-cycle instruction 1176 VMULL.S16 Q3,D27,D0 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4] R1 1177 1178 1179 VLD1.S16 D26,[R11],R12 @LOAD g_ai2_ihevc_trans_16[10][0-4] 1180 VMULL.S16 Q4,D27,D1 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4] R2 1181 1182 @These values must go to 0 4 8 12 colums hence we need stride *4 1183 LSL R10,R7,#2 1184 1185 VLD1.S16 D27,[R11],R12 @LOAD g_ai2_ihevc_trans_16[14][0-4] 1186 1187 VST1.32 D10,[R2],R10 1188 VMULL.S16 Q8,D27,D1 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R2 1189 1190 VST1.32 D11,[R2],R10 1191 VMULL.S16 Q7,D27,D0 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R1 1192 1193 VST1.32 D12,[R2],R10 1194 VMULL.S16 Q5,D26,D0 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R1 1195 1196 VST1.32 D13,[R2],R10 1197 VMULL.S16 Q6,D26,D1 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R2 1198 1199 SUB R2,R2,R10,LSL #2 1200 1201 @transpose the 4x4 matrix row1 1202 VTRN.32 Q1, Q3 @R1 transpose1 -- 2 cycles 1203 1204 @transpose the 4x4 matrix row2 1205 VTRN.32 Q2,Q4 @R2 transpose1 -- 2 cycles 1206 1207 VTRN.32 Q5, Q7 @R1 transpose1 -- 2 cycles 1208 1209 VTRN.32 Q6,Q8 @R2 transpose1 -- 2 cycles 1210 1211 VSWP D10,D3 @R1 transpose2 1212 VSWP D14,D7 @R1 transpose2 1213 1214 VSWP D12,D5 @R2 transpose2 1215 VSWP D16,D9 @R2 transpose2 1216 1217 VADD.S32 Q5,Q5,Q1 @R1 add 1218 VADD.S32 Q3,Q3,Q7 @R1 add 1219 1220 VADD.S32 Q2,Q2,Q4 @R2 add 1221 VADD.S32 Q6,Q6,Q8 @R2 add 1222 1223 VADD.S32 Q5,Q5,Q3 @R1 add 1224 1225 VADD.S32 Q4,Q6,Q2 @R2 add 1226 1227 @-----------------------Processing O ---------------------------- 1228 @ Early start to avoid stalls 1229 MOV R12,#COFF_STD_2B @Get coeffs stride 1230 LSL R12,R12,#1 1231 ADD R11,R9,#COFF_STD_2B @Get address of g_ai2_ihevc_trans_16[1] 1232 1233 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] -- 2 cycles 1234 1235 VZIP.S32 Q5,Q4 @ 3 cycle instruction 1236 VMULL.S16 Q6,D18,D4 @o[0][0-3]* R1 1237 1238 1239 VMLAL.S16 Q6,D19,D5 @o[0][4-7]* R1 ; follows MULL instruction: Multiplier accumulator forwarding 1240 @write to memory 1241 @this should go to 2 6 10 14 1242 LSL R10,R7,#2 1243 ADD R2,R2,R7,LSL #1 @move to third row 1244 
VST1.32 D10,[R2],R10 1245 VMULL.S16 Q7,D22,D4 @o[0][0-3]* R2 1246 1247 VST1.32 D11,[R2],R10 1248 VMLAL.S16 Q7,D23,D5 @o[0][4-7]* R2 1249 1250 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[3][0-7] 1251 1252 VST1.32 D8,[R2],R10 1253 VMULL.S16 Q8,D18,D4 @o[1][0-3]* R1 1254 1255 VST1.32 D9,[R2],R10 1256 VMLAL.S16 Q8,D19,D5 @o[1][4-7]* R1 1257 SUB R2,R2,R10,LSL #2 1258 SUB R2,R2,R7,LSL #1 1259 1260 @--------------------Done procrssing EO ------------------------- 1261 1262 @ -----------------Processing O continues------------------------ 1263 1264 VMULL.S16 Q10,D22,D4 @o[1][0-3]* R2 1265 VMLAL.S16 Q10,D23,D5 @o[1][4-7]* R2 1266 1267 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[5][0-7] 1268 1269 VLD1.S16 {d6,d7},[R11],R12 @g_ai2_ihevc_trans_16[7][0-7] 1270 VMULL.S16 Q12,D18,D4 @o[2][0-3]* R1 1271 1272 VMLAL.S16 Q12,D19,D5 @o[2][4-7]* R1 1273 VMULL.S16 Q0,D18,D6 @o[3][0-3]* R1 1274 VMLAL.S16 Q0,D19,D7 @o[3][4-7]* R1 1275 1276 VMULL.S16 Q13,D22,D4 @o[2][0-3]* R2 1277 VMLAL.S16 Q13,D23,D5 @o[2][4-7]* R2 1278 VMULL.S16 Q1,D22,D6 @o[3][0-3]* R2 1279 VMLAL.S16 Q1,D23,D7 @o[3][4-7]* R2 1280 1281 @transpose the 4x4 matrix R1 1282 VTRN.32 Q6, Q8 @ 2-cycle instruction 1283 1284 VTRN.32 Q12,Q0 @ 2-cycle instruction 1285 1286 @transpose the 4x4 matrix R2 1287 VTRN.32 Q7,Q10 @ 2-cycle instruction 1288 1289 VTRN.32 Q13,Q1 @ 2-cycle instruction 1290 1291 VSWP D24,D13 1292 VSWP D0, D17 1293 1294 VSWP D26,D15 1295 VSWP D2,D21 1296 1297 VADD.S32 Q8 ,Q8 ,Q6 1298 VADD.S32 Q12,Q12,Q0 1299 1300 VADD.S32 Q10,Q10,Q7 1301 VADD.S32 Q13,Q13,Q1 1302 1303 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[9][0-7] 1304 VADD.S32 Q12 ,Q12 ,Q8 1305 1306 VADD.S32 Q13,Q13,Q10 1307 VMULL.S16 Q3,D18,D4 @o[4][0-3]* R1 1308 VMLAL.S16 Q3,D19,D5 @o[4][4-7]* R1 1309 1310 VZIP.S32 Q12,Q13 1311 VMULL.S16 Q4,D22,D4 @o[0][0-3]* R2 1312 1313 1314 VMLAL.S16 Q4,D23,D5 @o[0][4-7]* R2 1315 @write to memory 1316 @this should go to 1 3 5 7 1317 ADD R2,R2,R7 1318 LSL R7,R7,#1 1319 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[11][0-7] 1320 1321 VST1.32 D24,[R2],R7 1322 VMULL.S16 Q5,D18,D4 @o[5][0-3]* R1 1323 1324 VST1.32 D25,[R2],R7 1325 VMLAL.S16 Q5,D19,D5 @o[5][4-7]* R1 1326 1327 VST1.32 D26,[R2],R7 1328 VMULL.S16 Q6,D22,D4 @o[0][0-3]* R2 1329 1330 VST1.32 D27,[R2],R7 1331 VMLAL.S16 Q6,D23,D5 @o[0][4-7]* R2 1332 1333 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[13][0-7] 1334 1335 VLD1.S16 {d2,d3},[R11],R12 @g_ai2_ihevc_trans_16[15][0-7] 1336 VMULL.S16 Q7,D18,D4 @o[6][0-3]* R1 1337 1338 VMLAL.S16 Q7,D19,D5 @o[6][4-7]* R1 1339 VMULL.S16 Q10,D18,D2 @o[7][0-3]* R1 1340 VMLAL.S16 Q10,D19,D3 @o[7][4-7]* R1 1341 1342 VMULL.S16 Q8,D22,D4 @o[0][0-3]* R2 1343 VMLAL.S16 Q8,D23,D5 @o[0][4-7]* R2 1344 VMULL.S16 Q12,D22,D2 @o[0][0-3]* R2 1345 VMLAL.S16 Q12,D23,D3 @o[0][4-7]* R2 1346 1347 1348 @transpose the 4x4 matrix R1 1349 VTRN.32 Q3 ,Q5 @ 2-cycle instruction 1350 1351 VTRN.32 Q7 ,Q10 @ transpose step 2 R1 , 2-cycle instruction 1352 1353 @transpose the 4x4 matrix R2 1354 VTRN.32 Q4 ,Q6 @ 2-cycle instruction 1355 1356 VTRN.32 Q8 ,Q12 @ transpose step 2 R2 , 2-cycle instruction 1357 1358 VSWP D14,D7 @ transpose step 3, R1 1359 VSWP D20,D11 @ transpose step 4, R1 1360 VSWP D16,D9 @ transpose step 3, R2 1361 VSWP D24,D13 @ transpose step 4, R2 1362 1363 VADD.S32 Q5 ,Q5 ,Q3 1364 VADD.S32 Q10,Q10,Q7 1365 VADD.S32 Q6 ,Q6 ,Q4 1366 VADD.S32 Q12,Q12,Q8 1367 VADD.S32 Q10,Q10,Q5 1368 VADD.S32 Q12,Q12,Q6 1369 1370 @ 2-cycle stall 1371 VZIP.S32 Q10,Q12 @ 3-cycle instruction 1372 1373 @ 2-cycle stall 1374 @this should go to 9 11 13 15 1375 VST1.32 D20,[R2],R7 
1376 1377 VST1.32 D21,[R2],R7 1378 1379 VST1.32 D24,[R2],R7 1380 1381 VST1.32 D25,[R2],R7 1382 1383 SUB R2,R2,R7,LSL #3 1384 LSR R7,R7,#1 1385 SUB R2,R2,R7 1386 1387 ADD R2,R2,#8 @MOVE TO NEXT to next COLUMN - pi4_tmp 1388 1389 ADD R8,R8,#2 @increment loop cntr 1390 CMP R8,#16 @check lllop cntr 1391 BNE CORE_LOOP_16X16_HORIZ @jump acc 1392 1393 1394@*****************Vertical transform************************************ 1395 1396@Initialization for vert transform 1397@pi4_tmp will be the new src 1398@tmp stride will be new src stride 1399@dst will be new pi4_tmp 1400@dst stride will be new tmp stride 1401@trans table will be of 32 bit 1402 1403 LDR R9,g_ai4_ihevc_trans_16_addr @get 32 bit transform matrix 1404ulbl3: 1405 ADD R9, R9, PC 1406 1407 SUB R0,R2,#64 @set tmp as src [-32 to move back to orgin] 1408 MOV R2,R3 @set dst as tmp 1409 MOV R4,#TMP_STRIDE @set tmp stride as src stride 1410 LSL R7,R6,#1 @Set dst stride as tmp stride 1411 SUB R4,#48 @Adjust stride 3 previous loads 1412 1413 @Block SAD 1414 VADD.S32 D28,D28,D29 1415 VPADD.S32 D28,D28,D29 1416 VMOV.S32 R3,D28[0] 1417 @ SAD calculation ends -- final value in R3. 1418 1419 @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] 1420 @values of g_ai4_ihevc_trans_16 and write to stack 1421 MOV R12,#COFF_STD_W 1422 LSL R12,R12,#2 1423 VLD1.S32 D28,[R9],R12 1424 VLD1.S32 D29,[R9],R12 1425 VLD1.S32 D30,[R9],R12 1426 VLD1.S32 D31,[R9],R12 1427 SUB R9,R9,R12,LSL #2 1428 1429 VREV64.32 Q15,Q15 1430 VTRN.S32 Q14,Q15 1431 VST1.S32 {Q14-Q15},[SP] 1432 1433 VMOV.U32 Q14,#RADD @get the round factor to q14 1434 VMOV.U32 Q15,#SHIFT @Get the shift to neon 1435 1436 MOV R8,#0 @INIT LOOP 1437 1438CORE_LOOP_16X16_VERT: 1439 1440 VLD1.S32 {D0,D1},[R0]! @LOAD 1-4 src R1 1441 VLD1.S32 {D2,D3},[R0]! @LOAD 5-8 pred R1 1442 VLD1.S32 {D4,D5},[R0]! @LOAD 9-12 src R1 1443 VLD1.S32 {D6,D7},[R0],R4 @LOAD 12-16 pred R1 1444 1445 VLD1.S32 {D8,D9},[R0]! @LOAD 1-4 src R2 1446 VLD1.S32 {D10,D11},[R0]! @LOAD 5-8 pred R2 1447 VLD1.S32 {D12,D13},[R0]! @LOAD 9-12 src R2 1448 VLD1.S32 {D14,D15},[R0],R4 @LOAD 12-16 pred R2 1449 1450 VREV64.S32 Q2,Q2 @Rev 9-12 R1 1451 VREV64.S32 Q3,Q3 @Rev 12-16 R1 1452 VREV64.S32 Q6,Q6 @Rev 9-12 R2 1453 VREV64.S32 Q7,Q7 @Rev 12-16 R2 1454 1455 VSWP D6,D7 1456 VSWP D4,D5 1457 VADD.S32 Q8 ,Q0,Q3 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-4 R1 1458 VSWP D12,D13 @ dual issued with prev. instruction 1459 VADD.S32 Q9 ,Q1,Q2 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 5-8 R1 1460 VSWP D14,D15 @ dual issued with prev. instruction 1461 VSUB.S32 Q10,Q0,Q3 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 1-4 R1 1462 VSUB.S32 Q11,Q1,Q2 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 5-8 R1 1463 1464 VADD.S32 Q12,Q4,Q7 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-4 R2 1465 VREV64.S32 Q9 ,Q9 @rev e[k] k-> 4-7 R1, dual issued with prev. instruction 1466 VADD.S32 Q13,Q5,Q6 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 5-8 R2 1467 VSUB.S32 Q0 ,Q4,Q7 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 1-4 R2 1468 VSWP D18,D19 @ dual issued with prev. instruction 1469 VSUB.S32 Q1 ,Q5,Q6 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 5-8 R2 1470 VREV64.S32 Q13,Q13 @rev e[k] k-> 4-7 R2, dual issued with prev. instruction 1471 1472 VADD.S32 Q2,Q8,Q9 @ee[k] = e[k] + e[7 - k] row R1 1473 VSUB.S32 Q3,Q8,Q9 @eo[k] = e[k] - e[7 - k] row R1 1474 VSWP D26,D27 1475 1476 1477 VADD.S32 Q4,Q12,Q13 @ee[k] = e[k] + e[7 - k] row R2 1478 VSUB.S32 Q5,Q12,Q13 @eo[k] = e[k] - e[7 - k] row R2 1479 VREV64.S32 D5,D5 @rev ee[k] 4-7 R1, dual issued with prev. 
instruction 1480 1481 VADD.S32 D12,D4,D5 @eee[0] eee[1] R1 1482 VSUB.S32 D13,D4,D5 @eeo[0] eeo[1] R1 1483 VREV64.S32 D9,D9 @rev ee[k] 4-7 R2, dual issued with prev. instruction 1484 1485 1486 VADD.S32 D14,D8,D9 @eee[0] eee[1] R2 1487 VSUB.S32 D15,D8,D9 @eeo[0] eeo[1] R2 1488 1489 VLD1.S32 {Q12,Q13},[SP] @Load g_ai2_ihevc_trans_16[xx]-> Q12 : [0 0] [8 0] [4 0] [12 0] Q13 : [0 1] [8 1] [4 1] [12 1] 1490 VREV64.S32 Q8,Q6 @Q6 : eee[0] eee[1] eeo[0] eeo[1] R1 -> ;Q8 : eee[1] eee[0] eeo[1] eeo[0] R1 1491 1492 VREV64.S32 Q9,Q7 @Q7 : eee[0] eee[1] eeo[0] eeo[1] R2 -> ;Q9 : eee[1] eee[0] eeo[1] eeo[0] R2 1493 1494 1495 VMUL.S32 Q4,Q6,Q12 @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1] R1 1496 VMLA.S32 Q4,Q8,Q13 @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R1 1497 1498 VMUL.S32 Q6,Q7,Q12 @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1] R2 1499 VMLA.S32 Q6,Q9,Q13 @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R2 1500 1501 @Q3 :R1E00 R1E01 R1E02 R1E03 1502 @Q5 :R2E00 R2E01 R2E02 R2E03 1503 VSWP D7,D10 @ dual issued with prev. instruction 1504 @Q3 :R1E00 R1E01 R2E00 R2E01 1505 @Q5 :R1E02 R1E03 R2E02 R2E03 1506 VSWP D7,D11 1507 @Q3 :R1E00 R1E01 R2E02 R2E03 1508 @Q5 :R1E02 R1E03 R2E00 R2E01 1509 1510 MOV R12,#COFF_STD_W 1511 ADD R11,R9,R12,LSL #1 @Get to the 2nd row of src 1512 LSL R12,R12,#2 1513 1514 VLD1.S32 {D14,D15},[R11],R12 @LOAD g_ai2_ihevc_trans_16[2][0-4] -> 2G0 2G1 2G2 2G3, 2-cycle instr. 1515 1516 VADD.S32 Q4,Q4,Q14 @ROUND R1 1517 VMUL.S32 Q12,Q3,Q7 @2G0 2G1 2G2 2G3 * R1E00 R1E01 R2E02 R2E03, 4-cycle instruction 1518 VSWP D14,D15 @2G0 2G1 2G2 2G3 -> 2G2 2G3 2G0 2G1, dual issued with prev. instruction 1519 1520 VADD.S32 Q6,Q6,Q14 @ROUND R2 1521 1522 VSHRN.S32 D8,Q4,#SHIFT @NARROW R1 1523 1524 VLD1.S32 {D16,D17},[R11],R12 @LOAD g_ai2_ihevc_trans_16[6][0-4] 1525 VSHRN.S32 D9,Q6,#SHIFT @NARROW R2, dual issued in 2nd cycle 1526 1527 VMUL.S32 Q2,Q3,Q8 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4], 4-cycle instruction 1528 VSWP D16,D17 @dual issued with prev. instr. 1529 1530 VZIP.S16 D8,D9 @INTERLEAVE R1 R2 R1 R2 R1 R2 to write 1531 VMLA.S32 Q12,Q5,Q7 @2G2 2G3 2G0 2G1 * R1E02 R1E03 R2E00 R2E01, 4-cycle instruction 1532 1533 1534 @WRITE INTO MEM the values or wait to be shuffled 1535 @These values must go to 0 4 8 12 colums 1536 LSL R10,R7,#2 1537 VST1.S32 D8[0],[R2],R10 1538 1539 VST1.S32 D9[0],[R2],R10 1540 1541 VST1.S32 D8[1],[R2],R10 1542 VPADD.S32 D18,D24,D25 @D18[0] -> 2G0*R1E00+2G1*R1E01 2G2*R2E02+2G3*R2E03 1543 @D18[1] -> 2G2*R1E02+2G3*R1E03 2G0*R2E00+*2G1R2E01 1544 1545 VST1.S32 D9[1],[R2],R10 1546 VMLA.S32 Q2,Q5,Q8 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] 1547 LSL R10,R10,#2 1548 SUB R2,R2,R10 1549 1550 VLD1.S32 {D14,D15},[R11],R12 @LOAD g_ai2_ihevc_trans_16[10][0-4] 1551 1552 VMUL.S32 Q6,Q3,Q7 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] 1553 VSWP D14,D15 @ dual issued with prev. 
instruction 1554 VPADD.S32 D19,D4,D5 1555 1556 VLD1.S32 {D16,D17},[R11],R12 @LOAD g_ai2_ihevc_trans_16[14][0-4] 1557 VMUL.S32 Q2,Q3,Q8 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] 1558 VSWP D16,D17 1559 1560 VMLA.S32 Q6,Q5,Q7 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] 1561 VADD.S32 Q9,Q9,Q14 @Round by RADD R1 1562 VMLA.S32 Q2,Q5,Q8 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] 1563 VSHRN.S32 D8,Q9,#SHIFT @Shift by SHIFT 1564 VPADD.S32 D24,D12,D13 1565 @---------------Processing O, Row 1 and Row 2-------------------------------------- 1566 @ Early start to avoid stalls 1567 MOV R12,#COFF_STD_W 1568 ADD R11,R9,R12 @Get 1ST row 1569 LSL R12,R12,#1 1570 1571 LSL R10,R7,#2 1572 ADD R2,R2,R7,LSL #1 @move to third row 1573 @this should go to 2 6 10 14 1574 VST1.S32 D8[0],[R2],R10 1575 1576 VST1.S32 D8[1],[R2],R10 1577 VPADD.S32 D25,D4,D5 @ dual issued with prev. instruction in 2nd cycle 1578 1579 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] 1580 VADD.S32 Q12,Q12,Q14 @Round by RADD R2, dual issued with prev. instruction in 2nd cycle 1581 VMUL.S32 Q6,Q2,Q0 @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R2 1582 VMLA.S32 Q6,Q3,Q1 @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R2 1583 VSHRN.S32 D9,Q12,#SHIFT @Shift by SHIFT 1584 1585 VMUL.S32 Q2,Q2,Q10 @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R1 1586 VMLA.S32 Q2,Q3,Q11 @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R1 1587 VADD.S32 D11,D12,D13 @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R2, dual issued with prev. instr. 1588 VST1.S32 D9[0],[R2],R10 1589 1590 VST1.S32 D9[1],[R2],R10 1591 VADD.S32 D10,D4,D5 @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R1, dual issued with prev. instr. 1592 LSL R10,R10,#2 @go back to orgin 1593 SUB R2,R2,R10 1594 SUB R2,R2,R7,LSL #1 1595 1596 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[3][0-7] 1597 1598 VMUL.S32 Q7,Q2,Q10 @o[0][0-3] 1599 VMLA.S32 Q7,Q3,Q11 @o[0][4-7] 1600 VMUL.S32 Q8,Q2,Q0 @o[0][0-3] 1601 VMLA.S32 Q8,Q3,Q1 @o[0][4-7] 1602 1603 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[5][0-7] 1604 VADD.S32 D18,D14,D15 1605 VMUL.S32 Q12,Q2,Q10 @o[0][0-3] 1606 VMLA.S32 Q12,Q3,Q11 @o[0][4-7] 1607 VADD.S32 D19,D16,D17 1608 VMUL.S32 Q4,Q2,Q0 1609 VMLA.S32 Q4,Q3,Q1 1610 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[7][0-7] 1611 VADD.S32 D26,D24,D25 @ dual issued with prev. instr. 1612 VMUL.S32 Q6,Q2,Q10 @o[0][0-3] 1613 VMLA.S32 Q6,Q3,Q11 @o[0][4-7] 1614 VADD.S32 D27,D8,D9 1615 VMUL.S32 Q4,Q2,Q0 1616 VMLA.S32 Q4,Q3,Q1 1617 VADD.S32 D12,D12,D13 1618 @Q5 Q9 Q13 Q6 1619 VPADD.S32 D14,D10,D11 1620 VPADD.S32 D15,D18,D19 1621 VPADD.S32 D16,D26,D27 1622 VADD.S32 D13,D8,D9 1623 VADD.S32 Q9,Q7,Q14 1624 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[0][0-7] 1625 VPADD.S32 D17,D12,D13 @ dual issued with prev. instr. in 2nd cycle 1626 1627 VMUL.S32 Q4,Q2,Q10 @o[0][0-3] 1628 VMLA.S32 Q4,Q3,Q11 @o[0][4-7] 1629 1630 VADD.S32 Q12,Q8,Q14 1631 1632 VMUL.S32 Q6,Q2,Q0 @o[0][0-3] 1633 VMLA.S32 Q6,Q3,Q1 @o[0][4-7] 1634 1635 VSHRN.S32 D26,Q9,#SHIFT 1636 VSHRN.S32 D27,Q12,#SHIFT 1637 VADD.S32 D10,D8,D9 1638 @write to memory this should go to 1 3 5 7 1639 ADD R2,R2,R7 1640 LSL R7,R7,#1 1641 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] 1642 VADD.S32 D11,D12,D13 @ dual issued with prev. instr. 
1643 1644 VST1.S32 D26[0],[R2],R7 1645 VMUL.S32 Q7,Q2,Q10 @o[0][0-3] 1646 VMLA.S32 Q7,Q3,Q11 @o[0][4-7] 1647 VST1.S32 D26[1],[R2],R7 1648 VMUL.S32 Q8,Q2,Q0 @o[0][0-3] 1649 VMLA.S32 Q8,Q3,Q1 @o[0][4-7] 1650 VST1.S32 D27[0],[R2],R7 1651 VADD.S32 D18,D14,D15 1652 VST1.S32 D27[1],[R2],R7 1653 1654 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[2][0-7] 1655 VADD.S32 D19,D16,D17 @ dual issued with prev. instr. 1656 1657 VMUL.S32 Q12,Q2,Q10 @o[0][0-3] 1658 VMLA.S32 Q12,Q3,Q11 @o[0][4-7] 1659 VMUL.S32 Q4,Q2,Q0 1660 VMLA.S32 Q4,Q3,Q1 1661 1662 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[3][0-7] 1663 VADD.S32 D26,D24,D25 1664 1665 VMUL.S32 Q6,Q2,Q10 @o[0][0-3] 1666 VMLA.S32 Q6,Q3,Q11 @o[0][4-7] 1667 VADD.S32 D27,D8,D9 1668 1669 VMUL.S32 Q4,Q2,Q0 1670 VMLA.S32 Q4,Q3,Q1 1671 VADD.S32 D12,D12,D13 1672 @Q5 Q9 Q13 Q6 1673 VPADD.S32 D14,D10,D11 1674 VPADD.S32 D15,D18,D19 1675 VPADD.S32 D16,D26,D27 1676 VADD.S32 D13,D8,D9 1677 VADD.S32 Q9,Q7,Q14 1678 @ 1- cycle stall? 1679 VPADD.S32 D17,D12,D13 1680 VSHRN.S32 D22,Q9,#SHIFT 1681 VADD.S32 Q10,Q8,Q14 1682 @ 2-cycle stall? 1683 VSHRN.S32 D23,Q10,#SHIFT 1684 1685 @this should go to 9 11 13 15 1686 @LSL R11,R7,#1 1687 VST1.S32 D22[0],[R2],R7 1688 VST1.S32 D22[1],[R2],R7 1689 VST1.S32 D23[0],[R2],R7 1690 VST1.S32 D23[1],[R2],R7 1691 1692 SUB R2,R2,R7,LSL #3 1693 LSR R7,R7,#1 1694 SUB R2,R2,R7 1695 1696 ADD R2,R2,#4 @MOVE TO NEXT to next COLUMN 1697 1698 ADD R8,R8,#2 @increment loop cntr by 2 since we process loop as 2 cols 1699 CMP R8,#16 @check loop cntr 1700 BNE CORE_LOOP_16X16_VERT @jump acc 1701 1702 MOV R0,R3 1703 1704 ADD SP,SP,#32 1705 vpop {d8 - d15} 1706 LDMFD sp!,{r4-r12,PC} @stack store values of the arguments 1707 1708
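
@ Note on the 16x16 path above: CORE_LOOP_16X16_HORIZ and CORE_LOOP_16X16_VERT
@ both apply the even/odd (partial butterfly) decomposition to their 16 input
@ values r[0..15] (residues in the horizontal pass, stage-1 results in the
@ vertical pass). As a worked summary, kept in comments and explanatory only,
@ using the e/o/ee/eo/eee/eeo names already present in the loop comments:
@
@   e[k]   = r[k] + r[15-k]       o[k]   = r[k] - r[15-k]       k = 0..7
@   ee[k]  = e[k] + e[7-k]        eo[k]  = e[k] - e[7-k]        k = 0..3
@   eee[k] = ee[k] + ee[3-k]      eeo[k] = ee[k] - ee[3-k]      k = 0..1
@
@ The horizontal pass writes its results to pi4_tmp as 32-bit values with no
@ intermediate shift; the vertical pass applies the combined rounding
@ (x + RADD) >> SHIFT, i.e. (x + 4096) >> 13 with the equates defined above,
@ and narrows to the 16-bit output.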