/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

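// Helper: insert word 0 of each of in0..in3 into word lanes 0..3 of out.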
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)        \
  {                                                         \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0);  \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1);  \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2);  \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3);  \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

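// SAD for 4-pixel-wide blocks: four rows of src and ref are gathered into one
// vector each, so a single absolute-difference/accumulate covers a 4x4 tile
// per loop iteration.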
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

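// SAD for 8-pixel-wide blocks: pairs of 8-byte rows are packed into full
// 16-byte vectors before the SAD accumulation.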
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

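// SAD for 16-pixel-wide blocks, two unrolled 2-row steps (4 rows) per loop
// iteration.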
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

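// SAD for 32-pixel-wide blocks: each row is loaded as two 16-byte vectors,
// four rows per loop iteration.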
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

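// SAD for 64-pixel-wide blocks: each row is loaded as four 16-byte vectors.
// Two 16-bit accumulators are used so the per-lane partial sums stay within
// 16 bits even for 64x64 blocks.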
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

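// 4-wide SAD against four reference blocks at once (x4d): the same packed
// source rows are compared with each of the four references per iteration.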
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

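// 8-wide x4d SAD: source and reference rows are packed into full vectors,
// with a separate accumulator per reference.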
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

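// 16-wide x4d SAD, two rows per loop iteration.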
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

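// 32-wide x4d SAD, one 32-byte row per loop iteration.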
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

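// 64-wide x4d SAD: each reference keeps two 16-bit accumulators that are
// widened to 32 bits before the final horizontal add.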
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}

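// Averaging SAD used by the _avg entry points: the reference is first averaged
// with the second predictor (sec_pred), then the SAD against the source is
// accumulated. 4-wide case.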
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

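// 8-wide averaging SAD.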
static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

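// 16-wide averaging SAD, eight rows per loop iteration.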
static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

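// 32-wide averaging SAD, four rows per loop iteration.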
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

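// 64-wide averaging SAD; two 16-bit accumulators are widened to 32 bits at
// the end.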
static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

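// The macros below stamp out the exported vpx_sadWxH_msa, vpx_sadWxHx4d_msa
// and vpx_sadWxH_avg_msa entry points for each block size, forwarding to the
// width-specific helpers above with the height as a compile-time constant.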
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[4],           \
                                  int32_t ref_stride, uint32_t sads[4]) { \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[4],           \
                                  int32_t ref_stride, uint32_t sads[4]) { \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[4],           \
                                   int32_t ref_stride, uint32_t sads[4]) { \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[4],           \
                                   int32_t ref_stride, uint32_t sads[4]) { \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[4],           \
                                   int32_t ref_stride, uint32_t sads[4]) { \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

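// Instantiations for every supported block size. As a sketch of what one of
// these expands to, VPX_SAD_64xHEIGHT_MSA(64) produces roughly:
//
//   uint32_t vpx_sad64x64_msa(const uint8_t *src, int32_t src_stride,
//                             const uint8_t *ref, int32_t ref_stride) {
//     return sad_64width_msa(src, src_stride, ref, ref_stride, 64);
//   }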
// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);