; xref: /aosp_15_r20/external/libaom/aom_dsp/x86/sad_sse2.asm (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text
17
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
;
; Emits the cglobal prologue for one SAD variant, binds the named argument
; registers (src/src_stride/ref/ref_stride[/second_pred]) and, when %3 == 7,
; precomputes stride*3 helpers for the 4-rows-per-iteration loops.
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
; No spare GPR on x86-32 here: keep the row counter in the caller's
; stack slot for argument 0 (src is already loaded into a register).
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2; skip rows so double the stride
  lea           src_strided, [src_strided*2]
  lea           ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro
67
; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
;
; One iteration handles a full 128-byte row in two 64-byte halves.
; %2 == 1 averages ref with second_pred first; %2 == 2 visits every
; other row (strides were doubled in SAD_FN) and doubles the result.
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0           ; m0 = running SAD accumulator

.loop:
  ; first 64 bytes of the row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  ; second 64 bytes of the row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*4]
  pavgb                 m2, [second_predq+mmsize*5]
  pavgb                 m3, [second_predq+mmsize*6]
  pavgb                 m4, [second_predq+mmsize*7]
  lea         second_predq, [second_predq+mmsize*8]
%endif
  psadbw                m1, [srcq+64]
  psadbw                m2, [srcq+80]
  psadbw                m3, [srcq+96]
  psadbw                m4, [srcq+112]

  add                 refq, ref_strideq
  add                 srcq, src_strideq

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  sub              n_rowsd, 1
  jg .loop

  ; psadbw left one partial sum per 64-bit lane; fold high qword into low.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
135
INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 128, 2  ; sad128x128_skip_sse2
SAD128XN 64      ; sad128x64_sse2
SAD128XN 64, 1   ; sad128x64_avg_sse2
SAD128XN 64, 2   ; sad128x64_skip_sse2
143
144
; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; One iteration handles a full 64-byte row (four 16-byte loads).
; %2 == 1 averages ref with second_pred first; %2 == 2 visits every
; other row (strides were doubled in SAD_FN) and doubles the result.
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0           ; m0 = running SAD accumulator
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; fold the two 64-bit partial sums into the low lane
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
188
INIT_XMM sse2
SAD64XN 128     ; sad64x128_sse2
SAD64XN  64     ; sad64x64_sse2
SAD64XN  32     ; sad64x32_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
SAD64XN  64, 1  ; sad64x64_avg_sse2
SAD64XN  32, 1  ; sad64x32_avg_sse2
SAD64XN 128, 2  ; sad64x128_skip_sse2
SAD64XN  64, 2  ; sad64x64_skip_sse2
SAD64XN  32, 2  ; sad64x32_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD64XN  16     ; sad64x16_sse2
SAD64XN  16, 1  ; sad64x16_avg_sse2
SAD64XN  16, 2  ; sad64x16_skip_sse2
%endif
204
; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; One iteration handles TWO 32-byte rows, so n_rows counts row pairs
; (height/2, or height/4 in skip mode). %2 == 1 averages ref with
; second_pred first; %2 == 2 doubles the final sum.
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0           ; m0 = running SAD accumulator
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; fold the two 64-bit partial sums into the low lane
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
248
INIT_XMM sse2
SAD32XN 64    ; sad32x64_sse2
SAD32XN 32    ; sad32x32_sse2
SAD32XN 16    ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
SAD32XN 64, 2 ; sad32x64_skip_sse2
SAD32XN 32, 2 ; sad32x32_skip_sse2
SAD32XN 16, 2 ; sad32x16_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD32XN  8    ; sad_32x8_sse2
SAD32XN  8, 1 ; sad_32x8_avg_sse2
SAD32XN  8, 2 ; sad_32x8_skip_sse2
%endif
264
; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; One iteration handles FOUR 16-byte rows (using the stride*3 helper
; registers from SAD_FN), so n_rows counts groups of four (height/4,
; or height/8 in skip mode). %2 == 1 averages ref with second_pred
; first; %2 == 2 doubles the final sum.
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0           ; m0 = running SAD accumulator

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; fold the two 64-bit partial sums into the low lane
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
309
INIT_XMM sse2
SAD16XN 32    ; sad16x32_sse2
SAD16XN 16    ; sad16x16_sse2
SAD16XN  8    ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2
SAD16XN 32, 2 ; sad16x32_skip_sse2
SAD16XN 16, 2 ; sad16x16_skip_sse2
SAD16XN  8, 2 ; sad16x8_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD16XN 64    ; sad_16x64_sse2
SAD16XN  4    ; sad_16x4_sse2
SAD16XN 64, 1 ; sad_16x64_avg_sse2
SAD16XN  4, 1 ; sad_16x4_avg_sse2
SAD16XN 64, 2 ; sad_16x64_skip_sse2
%endif
327
; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
;
; One iteration handles FOUR 8-byte rows: movh/movhps pack two rows
; into each xmm register, so n_rows counts groups of four (height/4,
; or height/8 in skip mode). %2 == 1 averages ref with second_pred
; first; %2 == 2 doubles the final sum.
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0           ; m0 = running SAD accumulator

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  ; fold the two 64-bit partial sums into the low lane
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
370
INIT_XMM sse2
SAD8XN 16    ; sad8x16_sse2
SAD8XN  8    ; sad8x8_sse2
SAD8XN  4    ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2
SAD8XN 16, 2 ; sad8x16_skip_sse2
SAD8XN  8, 2 ; sad8x8_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD8XN 32    ; sad_8x32_sse2
SAD8XN 32, 1 ; sad_8x32_avg_sse2
SAD8XN 32, 2 ; sad_8x32_skip_sse2
%endif
385
; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
;
; One iteration handles FOUR 4-byte rows: movd/punpckldq/movlhps pack
; all four rows into a single xmm register, so n_rows counts groups of
; four (height/4, or height/8 in skip mode). %2 == 1 averages ref with
; second_pred first; %2 == 2 doubles the final sum.
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0           ; m0 = running SAD accumulator

.loop:
  ; pack four 4-byte ref rows into m1
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  ; pack four 4-byte src rows into m2
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  ; fold the two 64-bit partial sums into the low lane
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
431
INIT_XMM sse2
SAD4XN  8 ; sad4x8_sse2
SAD4XN  4 ; sad4x4_sse2
SAD4XN  8, 1 ; sad4x8_avg_sse2
SAD4XN  4, 1 ; sad4x4_avg_sse2
SAD4XN  8, 2 ; sad4x8_skip_sse2
%if CONFIG_REALTIME_ONLY==0
SAD4XN 16 ; sad_4x16_sse2
SAD4XN 16, 1 ; sad_4x16_avg_sse2
SAD4XN 16, 2 ; sad_4x16_skip_sse2
%endif
442%endif
443