/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

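// Insert word 0 of each of in0..in3 into word lanes 0..3 of the destination
// vector.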
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

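// SAD for a 4-pixel-wide block: four rows of src and ref are gathered into
// one vector each, then absolute byte differences are accumulated as
// halfwords.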
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

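// SAD for an 8-pixel-wide block: pairs of 8-byte rows are packed into
// 16-byte vectors before the SAD accumulation.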
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

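// SAD for a 16-pixel-wide block, four rows per loop iteration.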
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

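// SAD for a 32-pixel-wide block: each row is loaded as two 16-byte vectors,
// four rows per loop iteration.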
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

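// SAD for a 64-pixel-wide block: each row spans four 16-byte vectors; two
// halfword accumulators keep the per-lane sums within 16 bits for heights
// up to 64.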
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

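// 4-wide SAD against four candidate references at once (x4d): the same
// packed source rows are compared with each reference in turn.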
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

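// 8-wide x4d SAD: the packed source rows are reused against all four
// references loaded in each iteration.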
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

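// 16-wide x4d SAD, two rows per loop iteration.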
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

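// 32-wide x4d SAD, one row (two 16-byte vectors) per loop iteration.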
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

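// 64-wide x4d SAD: two halfword accumulators per reference are widened to
// 32 bits before the final horizontal add.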
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}

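// Averaging SAD for 4-wide blocks (compound prediction): the reference is
// averaged with sec_pred before being compared against the source.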
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

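// Averaging SAD for 8-wide blocks: packed reference rows are averaged with
// sec_pred before the SAD accumulation.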
static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

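// Averaging SAD for 16-wide blocks, eight rows per loop iteration.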
static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

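// Averaging SAD for 32-wide blocks, four rows per loop iteration.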
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

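// Averaging SAD for 64-wide blocks, four rows per loop iteration; the two
// halfword accumulators are widened before the final horizontal add.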
static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

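// Wrappers expanding to the exported vpx_sad<W>x<H>_msa, _x4d and _avg
// entry points for each supported block size.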
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[4],           \
                                  int32_t ref_stride, uint32_t sads[4]) { \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[4],           \
                                  int32_t ref_stride, uint32_t sads[4]) { \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[4],           \
                                   int32_t ref_stride, uint32_t sads[4]) { \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[4],           \
                                   int32_t ref_stride, uint32_t sads[4]) { \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[4],           \
                                   int32_t ref_stride, uint32_t sads[4]) { \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);
805