@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@/*
@//----------------------------------------------------------------------------
@// File Name            : impeg2_inter_pred.s
@//
@// Description          : This file has motion compensation related
@//                        interpolation functions on Neon + CortexA-8 platform
@//
@// Syntax               : GNU as, ARM (A32) NEON; '@' starts a comment
@//
@// Reference Document   :
@//
@// Revision History     :
@//      Date            Author                  Detail Description
@//   ------------    ----------------    ----------------------------------
@//   18 jun 2010     S Hamsalekha              Created
@//
@//-------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@// Include Files
@// ----------------------------------------------------------------------------
@*/
.text
.p2align 2


@/*
@// ----------------------------------------------------------------------------
@// Struct/Union Types and Define
@// ----------------------------------------------------------------------------
@*/


@/*
@// ----------------------------------------------------------------------------
@// Static Global Data section variables
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------
@/*
@// ----------------------------------------------------------------------------
@// Static Prototype Functions
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@// Exported functions
@// ----------------------------------------------------------------------------
@*/

@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_copy_mb_a9q()
@//
@// Detail Description : Copies one MB worth of data from src to the dst:
@//                      16 rows x 16 bytes of luma, then 8 rows x 8 bytes
@//                      for each of the two chroma planes.
@//
@// Inputs             : r0 - pointer to src; three plane pointers are read
@//                            from it: y at [r0], u at [r0, #4], v at [r0, #8]
@//                      r1 - pointer to dst; same layout as src
@//                      r2 - source width (used as the luma row stride in
@//                            bytes; halved for the chroma planes)
@//                      r3 - destination width (used as the luma row stride
@//                            in bytes; halved for the chroma planes)
@// Registers Used     : r4, r5 (saved and restored), r2, r3 (modified),
@//                      d0, d1
@//
@// Stack Usage        : 12 bytes
@//
@// Outputs            : dst planes hold a copy of the src macroblock
@//
@// Return Data        : None
@//
@// Programming Note   : The row copies are generated with .rept; the emitted
@//                      instruction stream is fully unrolled (no loop
@//                      counter, no branches).
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_copy_mb_a9q


impeg2_copy_mb_a9q:

        stmfd       sp!, {r4, r5, r14}

        @ ---- luma: 16 rows of 16 bytes ----
        ldr         r4, [r0]                @ r4 = src->y
        ldr         r5, [r1]                @ r5 = dst->y
        .rept       16
        vld1.8      {d0, d1}, [r4], r2      @ load one row, src += src stride
        vst1.8      {d0, d1}, [r5], r3      @ store one row, dst += dst stride
        .endr

        mov         r2, r2, lsr #1          @ chroma src stride = luma stride / 2
        mov         r3, r3, lsr #1          @ chroma dst stride = luma stride / 2

        @ ---- u: 8 rows of 8 bytes ----
        ldr         r4, [r0, #4]            @ r4 = src->u
        ldr         r5, [r1, #4]            @ r5 = dst->u
        .rept       8
        vld1.8      {d0}, [r4], r2          @ load one row, src += src stride
        vst1.8      {d0}, [r5], r3          @ store one row, dst += dst stride
        .endr

        @ ---- v: 8 rows of 8 bytes ----
        ldr         r4, [r0, #8]            @ r4 = src->v
        ldr         r5, [r1, #8]            @ r5 = dst->v
        .rept       8
        vld1.8      {d0}, [r4], r2          @ load one row, src += src stride
        vst1.8      {d0}, [r5], r3          @ store one row, dst += dst stride
        .endr

        ldmfd       sp!, {r4, r5, pc}       @ restore and return




@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_fullx_halfy_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution.
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row stride)
@//                      r3 - out_wid : Current Block Width (row stride)
@//
@// Registers Used     : r14, d0-d7, d16-d17
@//
@// Stack Usage        : 4 bytes
@//
@// Outputs            : The Motion Compensated Block
@//                      out[row] = (ref[row] + ref[row + 1] + 1) >> 1
@//                      for 8 rows of 8 bytes (9 reference rows are read)
@//
@// Return Data        : None
@//
@// Programming Note   : Only NEON registers that the AAPCS leaves to the
@//                      caller to preserve (d0-d7, d16-d31) are written, so
@//                      no VFP registers need to be spilled.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_fullx_halfy_8x8_a9q

impeg2_mc_fullx_halfy_8x8_a9q:

        stmfd       sp!, {r14}
        add         r14, r1, r2             @ r14 walks the even reference rows
        mov         r2, r2, lsl #1          @ each pointer steps two rows at a time

@/* Load 8 + 1 rows from the reference block.                              */
@/* vrhadd.u8 computes (a + b + 1) >> 1, so the half-pel rounding is part  */
@/* of the instruction and no separate rounding add is needed.             */
        vld1.8      {d0}, [r1], r2          @ ref row 1 -> d0
        vld1.8      {d2}, [r14], r2         @ ref row 2 -> d2
        vld1.8      {d4}, [r1], r2          @ ref row 3 -> d4
        vld1.8      {d6}, [r14], r2         @ ref row 4 -> d6
        vld1.8      {d1}, [r1], r2          @ ref row 5 -> d1
        vld1.8      {d3}, [r14], r2         @ ref row 6 -> d3
        vrhadd.u8   d16, d1, d6             @ out row 4 = avg(ref 5, ref 4) -> d16
        vld1.8      {d5}, [r1], r2          @ ref row 7 -> d5
        vrhadd.u8   q0, q0, q1              @ out row 1 -> d0, out row 5 -> d1
        vld1.8      {d7}, [r14], r2         @ ref row 8 -> d7
        vrhadd.u8   q1, q1, q2              @ out row 2 -> d2, out row 6 -> d3
        vld1.8      {d17}, [r1], r2         @ ref row 9 -> d17
        vrhadd.u8   q2, q2, q3              @ out row 3 -> d4, out row 7 -> d5

        add         r14, r0, r3             @ r14 walks the even output rows
        mov         r3, r3, lsl #1          @ each pointer steps two rows at a time

@/* Store the eight rows calculated above */
        vst1.8      {d2}, [r14], r3         @ out row 2
        vrhadd.u8   d7, d7, d17             @ out row 8 = avg(ref 8, ref 9) -> d7
        vst1.8      {d0}, [r0], r3          @ out row 1
        vst1.8      {d16}, [r14], r3        @ out row 4
        vst1.8      {d4}, [r0], r3          @ out row 3
        vst1.8      {d3}, [r14], r3         @ out row 6
        vst1.8      {d1}, [r0], r3          @ out row 5
        vst1.8      {d7}, [r14], r3         @ out row 8
        vst1.8      {d5}, [r0], r3          @ out row 7

        ldmfd       sp!, {pc}






@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_halfx_fully_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution and VopRoundingType is 0 ..
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row stride)
@//                      r3 - out_wid : Current Block Width (row stride)
@//
@// Registers Used     : r12, r14, d0-d7, d16-d18, d20-d22, d24-d26, d28-d30
@//
@// Stack Usage        : 8 bytes
@//
@// Outputs            : The Motion Compensated Block
@//                      out[row][x] = (ref[row][x] + ref[row][x + 1] + 1) >> 1
@//                      for 8 rows of 8 bytes
@//
@// Return Data        : None
@//
@// Programming Note   : Every reference row is loaded as 16 bytes although
@//                      only bytes 0..8 contribute to the result, so 16
@//                      bytes per row must be readable at ref.
@//-----------------------------------------------------------------------------
@*/



        .global impeg2_mc_halfx_fully_8x8_a9q



impeg2_mc_halfx_fully_8x8_a9q:

        stmfd       sp!, {r12, lr}

        add         r14, r1, r2, lsl #2     @ r14 -> ref row 5 (second half of block)
        add         r12, r0, r3, lsl #2     @ r12 -> out row 5 (second half of block)

        vld1.8      {d0, d1}, [r1], r2      @ load 16 pixels of row1
        vld1.8      {d2, d3}, [r14], r2     @ row5
        vld1.8      {d4, d5}, [r1], r2      @ load 16 pixels of row2
        vld1.8      {d6, d7}, [r14], r2     @ row6

        @ vext #1 yields the row shifted left by one pixel (pixels 1-8)
        vext.8      d24, d0, d1, #1         @ pixels (1-8) of row1
        vext.8      d28, d2, d3, #1         @ pixels (1-8) of row5
        vext.8      d16, d4, d5, #1         @ pixels (1-8) of row2
        vext.8      d20, d6, d7, #1         @ pixels (1-8) of row6

        @ Rows 3/7/4/8 land in the upper half of q12/q14/q8/q10 so each
        @ q-register vrhadd below averages two rows at once.
        vld1.8      {d25, d26}, [r1], r2    @ load row3
        vld1.8      {d29, d30}, [r14], r2   @ load row7
        vld1.8      {d17, d18}, [r1], r2    @ load row4
        vld1.8      {d21, d22}, [r14], r2   @ load row8

        vext.8      d1, d25, d26, #1        @ pixels (1-8) of row3
        vext.8      d3, d29, d30, #1        @ pixels (1-8) of row7
        vext.8      d5, d17, d18, #1        @ pixels (1-8) of row4
        vext.8      d7, d21, d22, #1        @ pixels (1-8) of row8

        vrhadd.u8   q0, q0, q12             @ row1 -> d0, row3 -> d1
        vrhadd.u8   q1, q1, q14             @ row5 -> d2, row7 -> d3
        vrhadd.u8   q2, q2, q8              @ row2 -> d4, row4 -> d5
        vrhadd.u8   q3, q3, q10             @ row6 -> d6, row8 -> d7

        vst1.8      {d0}, [r0], r3          @ store row1
        vst1.8      {d2}, [r12], r3         @ store row5
        vst1.8      {d4}, [r0], r3          @ store row2
        vst1.8      {d6}, [r12], r3         @ store row6
        vst1.8      {d1}, [r0], r3          @ store row3
        vst1.8      {d3}, [r12], r3         @ store row7
        vst1.8      {d5}, [r0], r3          @ store row4
        vst1.8      {d7}, [r12], r3         @ store row8

        ldmfd       sp!, {r12, pc}







@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_halfx_halfy_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution and VopRoundingType is 0 ..
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row stride)
@//                      r3 - out_wid : Current Block Width (row stride)
@//
@// Registers Used     : r14, q0-q15 (d8-d15 are saved and restored)
@//
@// Stack Usage        : 68 bytes (4 for r14 + 64 for d8-d15)
@//
@// Outputs            : The Motion Compensated Block
@//                      out[row][x] = (ref[row][x]     + ref[row][x + 1] +
@//                                     ref[row + 1][x] + ref[row + 1][x + 1]
@//                                     + 2) >> 2
@//                      for 8 rows of 8 bytes (9 reference rows are read)
@//
@// Return Data        : None
@//
@// Programming Note   : Every reference row is loaded as 16 bytes although
@//                      only bytes 0..8 contribute to the result, so 16
@//                      bytes per row must be readable at ref.
@//-----------------------------------------------------------------------------
@*/


        .global impeg2_mc_halfx_halfy_8x8_a9q

impeg2_mc_halfx_halfy_8x8_a9q:

        stmfd       sp!, {r14}
        vpush       {d8-d15}                @ q4-q7 are used below and are callee-saved

        add         r14, r1, r2, lsl #2     @ r14 -> ref row 5

        vld1.8      {d0, d1}, [r1], r2      @ load 16 pixels of row1
        vld1.8      {d2, d3}, [r14], r2     @ row5
        vld1.8      {d4, d5}, [r1], r2      @ load 16 pixels of row2
        vld1.8      {d6, d7}, [r14], r2     @ row6

        @ vext #1 yields the row shifted left by one pixel (pixels 1-8)
        vext.8      d1, d0, d1, #1          @ pixels (1-8) of row1
        vext.8      d3, d2, d3, #1          @ pixels (1-8) of row5
        vext.8      d5, d4, d5, #1          @ pixels (1-8) of row2
        vext.8      d7, d6, d7, #1          @ pixels (1-8) of row6

        vld1.8      {d8, d9}, [r1], r2      @ load row3
        vld1.8      {d10, d11}, [r14], r2   @ load row7
        vld1.8      {d12, d13}, [r1], r2    @ load row4
        vld1.8      {d14, d15}, [r14], r2   @ load row8

        vext.8      d9, d8, d9, #1          @ pixels (1-8) of row3

        vld1.8      {d16, d17}, [r14], r2   @ load row9

        vext.8      d11, d10, d11, #1       @ pixels (1-8) of row7
        vext.8      d13, d12, d13, #1       @ pixels (1-8) of row4
        vext.8      d15, d14, d15, #1       @ pixels (1-8) of row8
        vext.8      d17, d16, d17, #1       @ pixels (1-8) of row9

        @ interpolation in x direction: 16-bit sums of horizontal neighbours
        vaddl.u8    q0, d0, d1              @ row1
        vaddl.u8    q1, d2, d3              @ row5
        vaddl.u8    q2, d4, d5              @ row2
        vaddl.u8    q3, d6, d7              @ row6
        vaddl.u8    q4, d8, d9              @ row3
        vaddl.u8    q5, d10, d11            @ row7
        vaddl.u8    q6, d12, d13            @ row4
        vaddl.u8    q7, d14, d15            @ row8
        vaddl.u8    q8, d16, d17            @ row9

        @ interpolation in y direction: add vertically adjacent row sums,
        @ then narrow with a rounding shift by 2 (divide by 4)
        add         r14, r0, r3, lsl #2     @ r14 -> out row 5

        vadd.u16    q9, q0, q2              @ row1 + row2
        vadd.u16    q13, q1, q3             @ row5 + row6
        vadd.u16    q10, q2, q4             @ row2 + row3
        vadd.u16    q14, q3, q5             @ row6 + row7

        vrshrn.u16  d18, q9, #2             @ out row1
        vrshrn.u16  d26, q13, #2            @ out row5
        vrshrn.u16  d20, q10, #2            @ out row2
        vrshrn.u16  d28, q14, #2            @ out row6

        @ stores are interleaved with the remaining adds
        vadd.u16    q11, q4, q6             @ row3 + row4
        vst1.8      {d18}, [r0], r3         @ store row1
        vadd.u16    q15, q5, q7             @ row7 + row8
        vst1.8      {d26}, [r14], r3        @ store row5
        vadd.u16    q12, q6, q1             @ row4 + row5
        vst1.8      {d20}, [r0], r3         @ store row2
        vadd.u16    q7, q7, q8              @ row8 + row9
        vst1.8      {d28}, [r14], r3        @ store row6

        vrshrn.u16  d22, q11, #2            @ out row3
        vrshrn.u16  d30, q15, #2            @ out row7
        vrshrn.u16  d24, q12, #2            @ out row4
        vrshrn.u16  d14, q7, #2             @ out row8

        vst1.8      {d22}, [r0], r3         @ store row3
        vst1.8      {d30}, [r14], r3        @ store row7
        vst1.8      {d24}, [r0], r3         @ store row4
        vst1.8      {d14}, [r14], r3        @ store row8

        vpop        {d8-d15}
        ldmfd       sp!, {pc}





@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_fullx_fully_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution and ..
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row stride)
@//                      r3 - out_wid : Current Block Width (row stride)
@//
@// Registers Used     : r12, r14, d0-d3
@//
@// Stack Usage        : 8 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : No interpolation is performed: the code is a plain
@//                      copy of 8 rows of 8 bytes, done as two interleaved
@//                      4-row halves (rows 1-4 via r1/r0, rows 5-8 via
@//                      r14/r12).
@//-----------------------------------------------------------------------------
@*/


        .global impeg2_mc_fullx_fully_8x8_a9q
impeg2_mc_fullx_fully_8x8_a9q:


        stmfd       sp!, {r12, lr}

        add         r14, r1, r2, lsl #2     @ r14 -> ref row 5
        add         r12, r0, r3, lsl #2     @ r12 -> out row 5

        vld1.8      {d0}, [r1], r2          @ load row1
        vld1.8      {d1}, [r14], r2         @ load row5
        vld1.8      {d2}, [r1], r2          @ load row2
        vld1.8      {d3}, [r14], r2         @ load row6

        vst1.8      {d0}, [r0], r3          @ store row1
        vst1.8      {d1}, [r12], r3         @ store row5
        vst1.8      {d2}, [r0], r3          @ store row2
        vst1.8      {d3}, [r12], r3         @ store row6

        vld1.8      {d0}, [r1], r2          @ load row3
        vld1.8      {d1}, [r14], r2         @ load row7
        vld1.8      {d2}, [r1], r2          @ load row4
        vld1.8      {d3}, [r14], r2         @ load row8

        vst1.8      {d0}, [r0], r3          @ store row3
        vst1.8      {d1}, [r12], r3         @ store row7
        vst1.8      {d2}, [r0], r3          @ store row4
        vst1.8      {d3}, [r12], r3         @ store row8

        ldmfd       sp!, {r12, pc}





@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_interpolate_a9q()
@//
@// Detail Description : Writes the rounded average, (a + b + 1) >> 1, of two
@//                      source macroblocks to the destination: 16x16 luma
@//                      followed by 8x8 u and 8x8 v.
@//
@// Inputs             : r0 - pointer to src1; three plane pointers are read
@//                            from it: y at [r0], u at [r0, #4], v at [r0, #8]
@//                      r1 - pointer to src2; same layout as src1
@//                      r2 - dest buf; same layout as src1
@//                      r3 - dst stride (luma; halved for the chroma planes)
@// Registers Used     : r4, r5, r7, r12, r14 (saved and restored),
@//                      r3 (modified), d0-d7, d16-d23
@//
@// Stack Usage        : 20 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Both sources are read with no row stride: luma rows
@//                      are taken 16 bytes apart and chroma rows 8 bytes
@//                      apart. Only dst uses the stride in r3.
@//                      Only NEON registers that the AAPCS leaves to the
@//                      caller to preserve (d0-d7, d16-d31) are written, so
@//                      no VFP registers need to be spilled.
@//-----------------------------------------------------------------------------
@*/


        .global impeg2_interpolate_a9q


impeg2_interpolate_a9q:

        stmfd       sp!, {r4, r5, r7, r12, r14}

        ldr         r4, [r0, #0]            @ ptr_y src1
        ldr         r5, [r1, #0]            @ ptr_y src2
        ldr         r7, [r2, #0]            @ ptr_y dst buf

        mov         r12, #4                 @ 4 passes x 4 rows = 16 luma rows


interp_lumablocks_stride:

        vld1.8      {d0, d1}, [r4]!         @ row1 src1
        vld1.8      {d2, d3}, [r4]!         @ row2 src1
        vld1.8      {d4, d5}, [r4]!         @ row3 src1
        vld1.8      {d6, d7}, [r4]!         @ row4 src1

        vld1.8      {d16, d17}, [r5]!       @ row1 src2
        vld1.8      {d18, d19}, [r5]!       @ row2 src2
        vld1.8      {d20, d21}, [r5]!       @ row3 src2
        vld1.8      {d22, d23}, [r5]!       @ row4 src2

        vrhadd.u8   q0, q0, q8              @ average row1
        vrhadd.u8   q1, q1, q9              @ average row2
        vrhadd.u8   q2, q2, q10             @ average row3
        vrhadd.u8   q3, q3, q11             @ average row4

        vst1.8      {d0, d1}, [r7], r3      @ row1
        vst1.8      {d2, d3}, [r7], r3      @ row2
        vst1.8      {d4, d5}, [r7], r3      @ row3
        vst1.8      {d6, d7}, [r7], r3      @ row4

        subs        r12, r12, #1
        bne         interp_lumablocks_stride


        mov         r3, r3, lsr #1          @ chroma dst stride = stride >> 1

        ldr         r4, [r0, #4]            @ ptr_u src1
        ldr         r5, [r1, #4]            @ ptr_u src2
        ldr         r7, [r2, #4]            @ ptr_u dst buf

        mov         r12, #2                 @ 2 passes: u, then v


@chroma blocks

interp_chromablocks_stride:

        vld1.8      {d0, d1}, [r4]!         @ row1 & 2 src1
        vld1.8      {d2, d3}, [r4]!         @ row3 & 4 src1
        vld1.8      {d4, d5}, [r4]!         @ row5 & 6 src1
        vld1.8      {d6, d7}, [r4]!         @ row7 & 8 src1

        vld1.8      {d16, d17}, [r5]!       @ row1 & 2 src2
        vld1.8      {d18, d19}, [r5]!       @ row3 & 4 src2
        vld1.8      {d20, d21}, [r5]!       @ row5 & 6 src2
        vld1.8      {d22, d23}, [r5]!       @ row7 & 8 src2

        vrhadd.u8   q0, q0, q8              @ average row1 & 2
        vrhadd.u8   q1, q1, q9              @ average row3 & 4
        vrhadd.u8   q2, q2, q10             @ average row5 & 6
        vrhadd.u8   q3, q3, q11             @ average row7 & 8

        vst1.8      {d0}, [r7], r3          @ row1
        vst1.8      {d1}, [r7], r3          @ row2
        vst1.8      {d2}, [r7], r3          @ row3
        vst1.8      {d3}, [r7], r3          @ row4
        vst1.8      {d4}, [r7], r3          @ row5
        vst1.8      {d5}, [r7], r3          @ row6
        vst1.8      {d6}, [r7], r3          @ row7
        vst1.8      {d7}, [r7], r3          @ row8

        @ Switch to the v plane for the second pass. After the second pass
        @ these loads run once more and their results are simply unused.
        ldr         r4, [r0, #8]            @ ptr_v src1
        ldr         r5, [r1, #8]            @ ptr_v src2
        ldr         r7, [r2, #8]            @ ptr_v dst buf

        subs        r12, r12, #1
        bne         interp_chromablocks_stride


        ldmfd       sp!, {r4, r5, r7, r12, pc}




