;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
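; pw_8 is the rounding term for the 4-bit right shift after filtering (the
; filter taps below sum to 16). Each bilin_filter_m_sse2 entry is a pair of
; 16-byte vectors (the first tap replicated across 8 words, then the
; second), so entries are 32 bytes apart; this is why filter_idx_shift
; below is 5.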
pw_8: times  8 dw  8
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 10
                     times  8 dw  6
                     times 16 dw  8
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  2
                     times  8 dw 14

SECTION .text

; int aom_highbd_sub_pixel_varianceNxh(const uint16_t *src,
;                                      ptrdiff_t src_stride,
;                                      int x_offset, int y_offset,
;                                      const uint16_t *dst,
;                                      ptrdiff_t dst_stride,
;                                      int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) as its return value and
; stores the sum of squared errors (SSE) in the given pointer.
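;
; The caller derives the variance from the two results, typically as
; variance = *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)).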

%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  mova                 %4, %3       ; make copies to manipulate to calc sum
  mova                 %2, %1       ; use originals for calc sse
  pmaddwd              %3, %3
  paddw                %4, %2
  pmaddwd              %1, %1
  movhlps              %2, %4
  paddd                %6, %3
  paddw                %4, %2
  pxor                 %2, %2
  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
  punpcklwd            %4, %2       ; sign-extend word to dword
  paddd                %6, %1
  paddd                %5, %4

%endmacro
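; Equivalent C for one SUM_SSE invocation, with each register holding 8
; 16-bit pixels (a documentation sketch, not part of the build):
;   for (int i = 0; i < 8; i++) {
;     const int d1 = src1[i] - dst1[i];
;     const int d2 = src2[i] - dst2[i];
;     sum += d1 + d2;            // accumulated as 4 sign-extended dwords
;     sse += d1 * d1 + d2 * d2;  // accumulated as 4 dwords
;   }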

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
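  ; Horizontally reduce the four dword lanes of m7 (sse) and m6 (sum) to a
  ; single lane: movhlps folds the high qword onto the low qword, then
  ; pshufd 0x1 folds lane 1 onto lane 0.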
  movhlps              m3, m7
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  pshufd               m4, m6, 0x1
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  movd               [r1], m7           ; store sse
  movd                eax, m6           ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE  0
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
  add                srcq, src_stridemp
%else
  lea                srcq, [srcq + src_strideq*2]
%endif
%endmacro
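; INC_SRC_BY_SRC_STRIDE advances srcq by one row. Strides are counted in
; 16-bit pixels, so a row is src_stride*2 bytes. On 32-bit PIC builds
; src_stride is read from memory (src_stridemp) because its register can
; be repurposed for filter pointers, hence the double add instead of a
; scaled lea.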

%macro SUBPEL_VARIANCE 1-2 0 ; W, avg (avg defaults to 0)
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
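; Offsets of 0 and 8 (the "x_offset == 0" and "x_offset == 0.5" cases
; below, and likewise for y) are special-cased and never index the filter
; table; a non-special offset selects the 32-byte filter entry at byte
; offset offset << filter_idx_shift.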


%if AOM_ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      dst, dst_stride, \
                                      sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                  x_offset, y_offset, \
                                  dst, dst_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        dst, dst_stride, \
                                        sec, sec_stride, height, sse
      %define block_height dword heightm
      %define sec_str sec_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; Store the bilin_filter and pw_8 locations on the stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4                ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1         ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        dst, dst_stride, \
                                        sec, sec_stride, height, sse
      %define block_height dword heightm
      %define sec_str sec_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif
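; At this point the filter tables are reachable as follows: on x86-64,
; bilin_filter aliases sseq and is loaded with lea in each bilin branch;
; on 32-bit PIC builds the bilin_filter_m and pw_8 addresses were stashed
; in the x_offset/y_offset argument slots; otherwise the table is
; addressed directly.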

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse

%if %1 < 16
  sar                   block_height, 1
%endif
%if %2 == 1 ; avg
  shl             sec_str, 1
%endif
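  ; Blocks narrower than 16 are processed two rows per loop iteration,
  ; hence the halved row count; the shift converts sec_str from 16-bit
  ; pixels to bytes.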

  ; FIXME(rbultje) replace by jumptable?
  test          x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test          y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + 16]
  mova                 m1, [dstq]
  mova                 m3, [dstq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m2, [secq+16]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + src_strideq*2]
  mova                 m1, [dstq]
  mova                 m3, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m2, [secq]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [dstq]
  mova                 m3, [dstq+16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m2, [dstq]
  mova                 m3, [dstq+dst_strideq*2]
  pavgw                m0, m1
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [dstq]
  mova                 m3, [dstq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). The total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so it might
  ; be slightly faster because of pmullw latency. It would also cut our
  ; rodata tables in half for this function, and save 1-2 registers on
  ; x86-64.
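  ; (The identity holds because (num-x)*in1 + x*in2 = num*in1 + x*(in2-in1)
  ; and num*in1 >> log2(num) == in1 exactly; the catch is that in2-in1 can
  ; be negative, which the current all-unsigned word math avoids.)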
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m4, m1
  mova                 m2, [dstq]
  mova                 m3, [dstq+dst_strideq*2]
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp           x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test          y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  mova                 m2, [dstq]
  mova                 m3, [dstq + 16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  mova                 m2, [dstq]
  mova                 m3, [dstq + dst_strideq*2]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
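  ; The previous row's horizontal average is carried across loop iterations
  ; (in m0/m1 for wide blocks, m0 for narrow ones), so each iteration loads
  ; and averages only the new rows.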
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m1, m3
  mova                 m4, [dstq]
  mova                 m5, [dstq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m2, m3
  mova                 m4, [dstq]
  mova                 m5, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m2, [secq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else  ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m1, filter_rnd
  paddw                m1, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m1, 4
  paddw                m0, m2
  mova                 m2, [dstq]
  psrlw                m0, 4
  mova                 m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m4, filter_rnd
  paddw                m4, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m4, 4
  paddw                m0, m2
  mova                 m2, [dstq]
  psrlw                m0, 4
  mova                 m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m4, [secq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test          y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           x_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  mova                 m4, [dstq]
  mova                 m5, [dstq+16]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+src_strideq*2+2]
  mova                 m4, [dstq]
  mova                 m5, [dstq+dst_strideq*2]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*4]
  lea                dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
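  ; As in the half/half case above, the previous row's horizontally
  ; filtered pixels are carried across iterations in m0 (and m1 for wide
  ; blocks).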
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           x_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [dstq]
  mova                 m5, [dstq+16]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m1, m3
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [dstq]
  mova                 m5, [dstq+dst_strideq*2]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m2, m3
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m2, [secq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                srcq, [srcq+src_strideq*4]
  lea                dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
; Load the filters - the same scheme as in the 8-bit-depth version.
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl           y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [bilin_filter+y_offsetq]
  mova                m11, [bilin_filter+y_offsetq+16]
  mova                m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else   ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; In this case there is NO unused register, so the src_stride register is
; used as a temporary; src_stride is reloaded from the stack later, when
; it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add           x_offsetq, tempq
  add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
  add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
; end of load filter

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
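  ; Filter each row horizontally, then blend consecutive filtered rows
  ; vertically; the carried row lives in m0/m1. Rows are advanced with
  ; INC_SRC_BY_SRC_STRIDE because the stride register may have been
  ; repurposed on 32-bit PIC builds.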
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  movu                 m1, [srcq+16]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  movu                 m3, [srcq+16]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m1, filter_rnd
  mova                 m2, [dstq]
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  mova                 m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq + dst_strideq * 2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu                 m3, [srcq]
  movu                 m5, [srcq+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m4, filter_rnd
  mova                 m2, [dstq]
  paddw                m4, m3
  psrlw                m0, 4
  psrlw                m4, 4
  mova                 m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m4, [secq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq + dst_strideq * 4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

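; Instantiate the SSE2 variants: plain and comp-avg (second argument 1),
; each for 8- and 16-pixel-wide blocks.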
INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1