xref: /aosp_15_r20/external/libmpeg2/common/arm/impeg2_inter_pred.s (revision a97c2a1f0a796dc32bed80d3353c69c5fc07c750)
1*a97c2a1fSXin Li@/******************************************************************************
2*a97c2a1fSXin Li@ *
3*a97c2a1fSXin Li@ * Copyright (C) 2015 The Android Open Source Project
4*a97c2a1fSXin Li@ *
5*a97c2a1fSXin Li@ * Licensed under the Apache License, Version 2.0 (the "License");
6*a97c2a1fSXin Li@ * you may not use this file except in compliance with the License.
7*a97c2a1fSXin Li@ * You may obtain a copy of the License at:
8*a97c2a1fSXin Li@ *
9*a97c2a1fSXin Li@ * http://www.apache.org/licenses/LICENSE-2.0
10*a97c2a1fSXin Li@ *
11*a97c2a1fSXin Li@ * Unless required by applicable law or agreed to in writing, software
12*a97c2a1fSXin Li@ * distributed under the License is distributed on an "AS IS" BASIS,
13*a97c2a1fSXin Li@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*a97c2a1fSXin Li@ * See the License for the specific language governing permissions and
15*a97c2a1fSXin Li@ * limitations under the License.
16*a97c2a1fSXin Li@ *
17*a97c2a1fSXin Li@ *****************************************************************************
18*a97c2a1fSXin Li@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*a97c2a1fSXin Li@*/
20*a97c2a1fSXin Li
21*a97c2a1fSXin Li@/*
22*a97c2a1fSXin Li@//----------------------------------------------------------------------------
23*a97c2a1fSXin Li@// File Name            : impeg2_inter_pred.s
24*a97c2a1fSXin Li@//
25*a97c2a1fSXin Li@// Description          : This file has motion compensation related
26*a97c2a1fSXin Li@//                        interpolation functions on Neon + CortexA-8 platform
27*a97c2a1fSXin Li@//
28*a97c2a1fSXin Li@// Reference Document   :
29*a97c2a1fSXin Li@//
30*a97c2a1fSXin Li@// Revision History     :
31*a97c2a1fSXin Li@//      Date            Author                  Detail Description
32*a97c2a1fSXin Li@//   ------------    ----------------    ----------------------------------
33*a97c2a1fSXin Li@//   18 jun 2010     S Hamsalekha              Created
34*a97c2a1fSXin Li@//
35*a97c2a1fSXin Li@//-------------------------------------------------------------------------
36*a97c2a1fSXin Li@*/
37*a97c2a1fSXin Li
38*a97c2a1fSXin Li@/*
39*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
40*a97c2a1fSXin Li@// Include Files
41*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
42*a97c2a1fSXin Li@*/
@ All routines below are placed in the code section, starting on a
@ 4-byte boundary.
.text
.balign 4
45*a97c2a1fSXin Li
46*a97c2a1fSXin Li
47*a97c2a1fSXin Li@/*
48*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
49*a97c2a1fSXin Li@// Struct/Union Types and Define
50*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
51*a97c2a1fSXin Li@*/
52*a97c2a1fSXin Li
53*a97c2a1fSXin Li
54*a97c2a1fSXin Li@/*
55*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
56*a97c2a1fSXin Li@// Static Global Data section variables
57*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
58*a97c2a1fSXin Li@*/
59*a97c2a1fSXin Li@// -------------------------- NONE --------------------------------------------
60*a97c2a1fSXin Li
61*a97c2a1fSXin Li
62*a97c2a1fSXin Li@/*
63*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
64*a97c2a1fSXin Li@// Static Prototype Functions
65*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
66*a97c2a1fSXin Li@*/
67*a97c2a1fSXin Li@// -------------------------- NONE --------------------------------------------
68*a97c2a1fSXin Li
69*a97c2a1fSXin Li@/*
70*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
71*a97c2a1fSXin Li@// Exported functions
72*a97c2a1fSXin Li@// ----------------------------------------------------------------------------
73*a97c2a1fSXin Li@*/
74*a97c2a1fSXin Li
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_copy_mb_a9q()
@//
@// Detail Description : Copies one macroblock from src to dst: 16 rows of
@//                      16 bytes for the y plane, followed by 8 rows of
@//                      8 bytes for the u plane and 8 rows of 8 bytes for
@//                      the v plane.
@//
@// Inputs             : r0 - src : address of three consecutive plane
@//                                 pointers (y at +0, u at +4, v at +8)
@//                      r1 - dst : same layout as src
@//                      r2 - source row stride (bytes) of the y plane
@//                      r3 - destination row stride (bytes) of the y plane
@//                      Both strides are halved for the u and v planes.
@//
@// Registers Used     : r4, r5, d0, d1 (r2 and r3 are overwritten)
@//
@// Stack Usage        : 12 bytes (r4, r5, lr)
@//
@// Outputs            : dst planes filled with the copied macroblock
@//
@// Return Data        : None
@//
@// Programming Note   : The row loops are fully unrolled by the assembler
@//                      (.rept), so no branch is executed.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_copy_mb_a9q

impeg2_copy_mb_a9q:

    push            {r4, r5, lr}

    @ ---- y plane : 16 rows x 16 bytes ----
    ldr             r4, [r0]            @ r4 = src->y
    ldr             r5, [r1]            @ r5 = dst->y
    .rept           16
    vld1.8          {d0, d1}, [r4], r2  @ fetch one row, step src by stride
    vst1.8          {d0, d1}, [r5], r3  @ write one row, step dst by stride
    .endr

    @ chroma planes use half the luma stride
    lsr             r2, r2, #1          @ src stride /= 2
    lsr             r3, r3, #1          @ dst stride /= 2

    @ ---- u plane : 8 rows x 8 bytes ----
    ldr             r4, [r0, #4]        @ r4 = src->u
    ldr             r5, [r1, #4]        @ r5 = dst->u
    .rept           8
    vld1.8          {d0}, [r4], r2      @ fetch one row, step src by stride
    vst1.8          {d0}, [r5], r3      @ write one row, step dst by stride
    .endr

    @ ---- v plane : 8 rows x 8 bytes ----
    ldr             r4, [r0, #8]        @ r4 = src->v
    ldr             r5, [r1, #8]        @ r5 = dst->v
    .rept           8
    vld1.8          {d0}, [r4], r2      @ fetch one row, step src by stride
    vst1.8          {d0}, [r5], r3      @ write one row, step dst by stride
    .endr

    pop             {r4, r5, pc}
192*a97c2a1fSXin Li
193*a97c2a1fSXin Li
194*a97c2a1fSXin Li
195*a97c2a1fSXin Li
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_fullx_halfy_8x8_a9q()
@//
@// Detail Description : Vertical half-pel prediction of an 8x8 block.
@//                      Nine rows of 8 bytes are read from the reference
@//                      and each output row is the rounded average of two
@//                      vertically adjacent reference rows:
@//                          out[y][x] = (ref[y][x] + ref[y+1][x] + 1) >> 1
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row stride, bytes)
@//                      r3 - out_wid : Current Block Width (row stride, bytes)
@//
@// Registers Used     : r14, d0-d7, d16, d17
@//                      (only caller-saved NEON registers, so nothing has
@//                      to be preserved with vpush/vpop)
@//
@// Stack Usage        : 4 bytes (lr)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : r0-r3 are modified.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_fullx_halfy_8x8_a9q

impeg2_mc_fullx_halfy_8x8_a9q:

    stmfd           sp!, {r14}
    add             r14, r1, r2         @// r14 -> reference row 2; r1 -> row 1
    mov             r2, r2, lsl #1      @// each pointer now steps two rows

@/* Load 8 + 1 rows from the reference block. r1 walks the odd rows and   */
@/* r14 the even rows. VRHADD.U8 computes (a + b + 1) >> 1, so the        */
@/* rounding is part of the instruction and needs no separate step.       */
    vld1.8          {d0}, [r1], r2      @// D0  = row 1
    vld1.8          {d2}, [r14], r2     @// D2  = row 2
    vld1.8          {d4}, [r1], r2      @// D4  = row 3
    vld1.8          {d6}, [r14], r2     @// D6  = row 4
    vld1.8          {d1}, [r1], r2      @// D1  = row 5
    vld1.8          {d3}, [r14], r2     @// D3  = row 6
    vrhadd.u8       d16, d1, d6         @// D16 = output row 4 (rows 4,5);
                                        @// done before D1 is overwritten
    vld1.8          {d5}, [r1], r2      @// D5  = row 7
    vrhadd.u8       q0, q0, q1          @// D0 = output row 1, D1 = output row 5
    vld1.8          {d7}, [r14], r2     @// D7  = row 8
    vrhadd.u8       q1, q1, q2          @// D2 = output row 2, D3 = output row 6
    vld1.8          {d17}, [r1], r2     @// D17 = row 9
    vrhadd.u8       q2, q2, q3          @// D4 = output row 3, D5 = output row 7

    add             r14, r0, r3         @// r14 -> output row 2; r0 -> row 1
    mov             r3, r3, lsl #1      @// each pointer now steps two rows

@/* Store the eight rows calculated above */
    vst1.8          {d2}, [r14], r3     @// output row 2
    vrhadd.u8       d7, d7, d17         @// D7 = output row 8 (rows 8,9)
    vst1.8          {d0}, [r0], r3      @// output row 1
    vst1.8          {d16}, [r14], r3    @// output row 4
    vst1.8          {d4}, [r0], r3      @// output row 3
    vst1.8          {d3}, [r14], r3     @// output row 6
    vst1.8          {d1}, [r0], r3      @// output row 5
    vst1.8          {d7}, [r14], r3     @// output row 8
    vst1.8          {d5}, [r0], r3      @// output row 7

    ldmfd           sp!, {pc}
263*a97c2a1fSXin Li
264*a97c2a1fSXin Li
265*a97c2a1fSXin Li
266*a97c2a1fSXin Li
267*a97c2a1fSXin Li
268*a97c2a1fSXin Li
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_halfx_fully_8x8_a9q()
@//
@// Detail Description : Horizontal half-pel prediction of an 8x8 block.
@//                      Each output byte is the rounded average of a
@//                      reference byte and its right-hand neighbour:
@//                          out[y][x] = (ref[y][x] + ref[y][x+1] + 1) >> 1
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row stride, bytes)
@//                      r3 - out_wid : Current Block Width (row stride, bytes)
@//
@// Registers Used     : r12, r14, d0-d7, d16-d18, d20-d22, d24-d26, d28-d30
@//
@// Stack Usage        : 8 bytes (r12, lr)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : 16 bytes are loaded from every reference row
@//                      although only the first 9 are used.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_halfx_fully_8x8_a9q

impeg2_mc_halfx_fully_8x8_a9q:

    push            {r12, lr}

    @ Rows 1-4 and rows 5-8 are processed as two interleaved streams.
    add             lr, r1, r2, lsl #2          @ lr  -> reference row 5
    add             r12, r0, r3, lsl #2         @ r12 -> output row 5

    vld1.8          {d0, d1}, [r1], r2          @ row 1, bytes 0..15
    vld1.8          {d2, d3}, [lr], r2          @ row 5
    vld1.8          {d4, d5}, [r1], r2          @ row 2
    vld1.8          {d6, d7}, [lr], r2          @ row 6

    @ Shifted copies (bytes 1..8) go to the low half of a partner Q register.
    vext.8          d24, d0, d1, #1             @ row 1 bytes 1..8 -> low q12
    vext.8          d28, d2, d3, #1             @ row 5 bytes 1..8 -> low q14
    vext.8          d16, d4, d5, #1             @ row 2 bytes 1..8 -> low q8
    vext.8          d20, d6, d7, #1             @ row 6 bytes 1..8 -> low q10

    @ The next four rows land directly in the high half of those partners.
    vld1.8          {d25, d26}, [r1], r2        @ row 3 (d25 = bytes 0..7)
    vld1.8          {d29, d30}, [lr], r2        @ row 7
    vld1.8          {d17, d18}, [r1], r2        @ row 4
    vld1.8          {d21, d22}, [lr], r2        @ row 8

    @ ...and their shifted copies replace the now-unneeded d1/d3/d5/d7.
    vext.8          d1, d25, d26, #1            @ row 3 bytes 1..8
    vext.8          d3, d29, d30, #1            @ row 7 bytes 1..8
    vext.8          d5, d17, d18, #1            @ row 4 bytes 1..8
    vext.8          d7, d21, d22, #1            @ row 8 bytes 1..8

    @ Each Q-wide rounding average finishes two output rows at once.
    vrhadd.u8       q0, q0, q12                 @ d0 = row 1, d1 = row 3
    vrhadd.u8       q1, q1, q14                 @ d2 = row 5, d3 = row 7
    vrhadd.u8       q2, q2, q8                  @ d4 = row 2, d5 = row 4
    vrhadd.u8       q3, q3, q10                 @ d6 = row 6, d7 = row 8

    vst1.8          {d0}, [r0], r3              @ output row 1
    vst1.8          {d2}, [r12], r3             @ output row 5
    vst1.8          {d4}, [r0], r3              @ output row 2
    vst1.8          {d6}, [r12], r3             @ output row 6
    vst1.8          {d1}, [r0], r3              @ output row 3
    vst1.8          {d3}, [r12], r3             @ output row 7
    vst1.8          {d5}, [r0], r3              @ output row 4
    vst1.8          {d7}, [r12], r3             @ output row 8

    pop             {r12, pc}
379*a97c2a1fSXin Li
380*a97c2a1fSXin Li
381*a97c2a1fSXin Li
382*a97c2a1fSXin Li
383*a97c2a1fSXin Li
384*a97c2a1fSXin Li
385*a97c2a1fSXin Li
386*a97c2a1fSXin Li
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_halfx_halfy_8x8_a9q()
@//
@// Detail Description : Half-pel prediction of an 8x8 block in both
@//                      directions. Each output byte is the rounded mean
@//                      of a 2x2 neighbourhood of the reference:
@//                          out[y][x] = (ref[y][x]   + ref[y][x+1] +
@//                                       ref[y+1][x] + ref[y+1][x+1] + 2) >> 2
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row stride, bytes)
@//                      r3 - out_wid : Current Block Width (row stride, bytes)
@//
@// Registers Used     : r14, q0-q15
@//
@// Stack Usage        : 68 bytes (lr + callee-saved d8-d15)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Nine reference rows are read, 16 bytes from each,
@//                      although only the first 9 bytes of a row are used.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_halfx_halfy_8x8_a9q

impeg2_mc_halfx_halfy_8x8_a9q:

    stmdb           sp!, {lr}
    vpush           {d8-d15}                    @ q4-q7 must be preserved

    add             lr, r1, r2, lsl #2          @ lr -> reference row 5

    vld1.8          {d0, d1}, [r1], r2          @ row 1, bytes 0..15
    vld1.8          {d2, d3}, [lr], r2          @ row 5
    vld1.8          {d4, d5}, [r1], r2          @ row 2
    vld1.8          {d6, d7}, [lr], r2          @ row 6

    @ For every row: low D = bytes 0..7, high D = bytes 1..8.
    vext.8          d1, d0, d1, #1              @ row 1
    vext.8          d3, d2, d3, #1              @ row 5
    vext.8          d5, d4, d5, #1              @ row 2
    vext.8          d7, d6, d7, #1              @ row 6

    vld1.8          {d8, d9}, [r1], r2          @ row 3
    vld1.8          {d10, d11}, [lr], r2        @ row 7
    vld1.8          {d12, d13}, [r1], r2        @ row 4
    vld1.8          {d14, d15}, [lr], r2        @ row 8

    vext.8          d9, d8, d9, #1              @ row 3

    vld1.8          {d16, d17}, [lr], r2        @ row 9

    vext.8          d11, d10, d11, #1           @ row 7
    vext.8          d13, d12, d13, #1           @ row 4
    vext.8          d15, d14, d15, #1           @ row 8
    vext.8          d17, d16, d17, #1           @ row 9

    @ Horizontal step: widen to 16 bit, qN = ref[x] + ref[x+1] per row.
    vaddl.u8        q0, d0, d1                  @ row 1
    vaddl.u8        q1, d2, d3                  @ row 5
    vaddl.u8        q2, d4, d5                  @ row 2
    vaddl.u8        q3, d6, d7                  @ row 6
    vaddl.u8        q4, d8, d9                  @ row 3
    vaddl.u8        q5, d10, d11                @ row 7
    vaddl.u8        q6, d12, d13                @ row 4
    vaddl.u8        q7, d14, d15                @ row 8
    vaddl.u8        q8, d16, d17                @ row 9

    @ Vertical step: add neighbouring row sums, then (sum + 2) >> 2 and
    @ narrow back to bytes. Stores are interleaved with the arithmetic.
    add             lr, r0, r3, lsl #2          @ lr -> output row 5

    vadd.u16        q9, q0, q2                  @ rows 1 + 2
    vadd.u16        q13, q1, q3                 @ rows 5 + 6
    vadd.u16        q10, q2, q4                 @ rows 2 + 3
    vadd.u16        q14, q3, q5                 @ rows 6 + 7

    vrshrn.u16      d18, q9, #2                 @ output row 1
    vrshrn.u16      d26, q13, #2                @ output row 5
    vrshrn.u16      d20, q10, #2                @ output row 2
    vrshrn.u16      d28, q14, #2                @ output row 6

    vadd.u16        q11, q4, q6                 @ rows 3 + 4
    vst1.8          {d18}, [r0], r3             @ store output row 1
    vadd.u16        q15, q5, q7                 @ rows 7 + 8
    vst1.8          {d26}, [lr], r3             @ store output row 5
    vadd.u16        q12, q6, q1                 @ rows 4 + 5
    vst1.8          {d20}, [r0], r3             @ store output row 2
    vadd.u16        q7, q7, q8                  @ rows 8 + 9
    vst1.8          {d28}, [lr], r3             @ store output row 6

    vrshrn.u16      d22, q11, #2                @ output row 3
    vrshrn.u16      d30, q15, #2                @ output row 7
    vrshrn.u16      d24, q12, #2                @ output row 4
    vrshrn.u16      d14, q7, #2                 @ output row 8

    vst1.8          {d22}, [r0], r3             @ store output row 3
    vst1.8          {d30}, [lr], r3             @ store output row 7
    vst1.8          {d24}, [r0], r3             @ store output row 4
    vst1.8          {d14}, [lr], r3             @ store output row 8

    vpop            {d8-d15}
    ldmia           sp!, {pc}
556*a97c2a1fSXin Li
557*a97c2a1fSXin Li
558*a97c2a1fSXin Li
559*a97c2a1fSXin Li
560*a97c2a1fSXin Li
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_mc_fullx_fully_8x8_a9q()
@//
@// Detail Description : Full-pel prediction of an 8x8 block: copies 8 rows
@//                      of 8 bytes from the reference block to the current
@//                      block without any interpolation.
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row stride, bytes)
@//                      r3 - out_wid : Current Block Width (row stride, bytes)
@//
@// Registers Used     : r12, r14, d0-d3
@//
@// Stack Usage        : 8 bytes (r12, lr)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Rows 1-4 and rows 5-8 are copied as two interleaved
@//                      streams.
@//-----------------------------------------------------------------------------
@*/

        .global impeg2_mc_fullx_fully_8x8_a9q
impeg2_mc_fullx_fully_8x8_a9q:

    push            {r12, lr}

    add             lr, r1, r2, lsl #2  @ lr  -> reference row 5
    add             r12, r0, r3, lsl #2 @ r12 -> output row 5

    @ first half of each stream: rows 1, 2 and rows 5, 6
    vld1.8          {d0}, [r1], r2      @ d0 = row 1
    vld1.8          {d1}, [lr], r2      @ d1 = row 5
    vld1.8          {d2}, [r1], r2      @ d2 = row 2
    vld1.8          {d3}, [lr], r2      @ d3 = row 6

    vst1.8          {d0}, [r0], r3      @ output row 1
    vst1.8          {d1}, [r12], r3     @ output row 5
    vst1.8          {d2}, [r0], r3      @ output row 2
    vst1.8          {d3}, [r12], r3     @ output row 6

    @ second half of each stream: rows 3, 4 and rows 7, 8
    vld1.8          {d0}, [r1], r2      @ d0 = row 3
    vld1.8          {d1}, [lr], r2      @ d1 = row 7
    vld1.8          {d2}, [r1], r2      @ d2 = row 4
    vld1.8          {d3}, [lr], r2      @ d3 = row 8

    vst1.8          {d0}, [r0], r3      @ output row 3
    vst1.8          {d1}, [r12], r3     @ output row 7
    vst1.8          {d2}, [r0], r3      @ output row 4
    vst1.8          {d3}, [r12], r3     @ output row 8

    pop             {r12, pc}
637*a97c2a1fSXin Li
638*a97c2a1fSXin Li
639*a97c2a1fSXin Li
640*a97c2a1fSXin Li
641*a97c2a1fSXin Li
@/*
@//---------------------------------------------------------------------------
@// Function Name      :   impeg2_interpolate_a9q()
@//
@// Detail Description : Writes the rounding byte-wise average (vrhadd.u8) of
@//                      two sources to the destination: 16 luma rows of 16
@//                      bytes, then 8 rows of 8 bytes for each chroma plane.
@//
@// Inputs             : r0 - pointer to the src1 plane pointers
@//                           (y at offset 0, u at offset 4, v at offset 8)
@//                      r1 - pointer to the src2 plane pointers (same layout)
@//                      r2 - pointer to the dest plane pointers (same layout)
@//                      r3 - dst stride in bytes for luma; halved for chroma
@//
@//                      Both sources are read as packed rows (16 bytes per
@//                      luma row, 8 bytes per chroma row); only the
@//                      destination is strided.
@//
@// Registers Used     : r4, r5, r7, r12, d0-d7, d16-d23
@//
@// Stack Usage        : 20 bytes
@//
@// Outputs            : The interpolated block in the dest planes
@//
@// Return Data        : None
@//
@// Programming Note   : src2 data is held in q8-q11 (d16-d23), which the
@//                      AAPCS treats as caller-saved, so no NEON registers
@//                      need to be spilled to the stack.
@//-----------------------------------------------------------------------------
@*/


        .global impeg2_interpolate_a9q


impeg2_interpolate_a9q:

    stmfd           sp!, {r4, r5, r7, r12, r14}

    ldr             r4, [r0, #0]        @ptr_y src1

    ldr             r5, [r1, #0]        @ptr_y src2

    ldr             r7, [r2, #0]        @ptr_y dst buf

    mov             r12, #4             @4 passes of 4 rows = 16 luma rows


interp_lumablocks_stride:

    vld1.8          {d0, d1}, [r4]!     @row1 src1

    vld1.8          {d2, d3}, [r4]!     @row2 src1

    vld1.8          {d4, d5}, [r4]!     @row3 src1

    vld1.8          {d6, d7}, [r4]!     @row4 src1


    vld1.8          {d16, d17}, [r5]!   @row1 src2

    vld1.8          {d18, d19}, [r5]!   @row2 src2

    vld1.8          {d20, d21}, [r5]!   @row3 src2

    vld1.8          {d22, d23}, [r5]!   @row4 src2


    vrhadd.u8       q0, q0, q8          @row1 = (src1 + src2 + 1) >> 1

    vrhadd.u8       q1, q1, q9          @row2

    vrhadd.u8       q2, q2, q10         @row3

    vrhadd.u8       q3, q3, q11         @row4


    vst1.8          {d0, d1}, [r7], r3  @row1

    vst1.8          {d2, d3}, [r7], r3  @row2

    vst1.8          {d4, d5}, [r7], r3  @row3

    vst1.8          {d6, d7}, [r7], r3  @row4

    subs            r12, r12, #1

    bne             interp_lumablocks_stride


    mov             r3, r3, lsr #1      @chroma stride = stride >> 1

    ldr             r4, [r0, #4]        @ptr_u src1

    ldr             r5, [r1, #4]        @ptr_u src2

    ldr             r7, [r2, #4]        @ptr_u dst buf

    mov             r12, #2             @two chroma planes: u, then v


@chroma blocks: each pass handles one complete 8x8 plane

interp_chromablocks_stride:

    vld1.8          {d0, d1}, [r4]!     @row1 & 2 src1

    vld1.8          {d2, d3}, [r4]!     @row3 & 4 src1

    vld1.8          {d4, d5}, [r4]!     @row5 & 6 src1

    vld1.8          {d6, d7}, [r4]!     @row7 & 8 src1


    vld1.8          {d16, d17}, [r5]!   @row1 & 2 src2

    vld1.8          {d18, d19}, [r5]!   @row3 & 4 src2

    vld1.8          {d20, d21}, [r5]!   @row5 & 6 src2

    vld1.8          {d22, d23}, [r5]!   @row7 & 8 src2


    vrhadd.u8       q0, q0, q8          @operate on row1 & 2

    vrhadd.u8       q1, q1, q9          @operate on row3 & 4

    vrhadd.u8       q2, q2, q10         @operate on row5 & 6

    vrhadd.u8       q3, q3, q11         @operate on row7 & 8


    vst1.8          {d0}, [r7], r3      @row1

    vst1.8          {d1}, [r7], r3      @row2

    vst1.8          {d2}, [r7], r3      @row3

    vst1.8          {d3}, [r7], r3      @row4

    vst1.8          {d4}, [r7], r3      @row5

    vst1.8          {d5}, [r7], r3      @row6

    vst1.8          {d6}, [r7], r3      @row7

    vst1.8          {d7}, [r7], r3      @row8


    @ Switch to the v plane for the second pass. The loads are repeated
    @ (harmlessly) after the v pass as well; ldr leaves the flags untouched.

    ldr             r4, [r0, #8]        @ptr_v src1

    ldr             r5, [r1, #8]        @ptr_v src2

    ldr             r7, [r2, #8]        @ptr_v dst buf

    subs            r12, r12, #1

    bne             interp_chromablocks_stride


    ldmfd           sp!, {r4, r5, r7, r12, pc}
803*a97c2a1fSXin Li
804*a97c2a1fSXin Li
805*a97c2a1fSXin Li
806*a97c2a1fSXin Li
807*a97c2a1fSXin Li
808