; xref: /aosp_15_r20/external/libdav1d/src/x86/mc_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2020, VideoLAN and dav1d authors
2; Copyright © 2020, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
SECTION_RODATA 64

; OBMC (overlapped block motion compensation) blending weights. Each entry is
; a byte pair summing to 64 (e.g. 45+19, 39+25), laid out for pmaddubsw-style
; weighted averaging; one run per overlap size, see the "; N" markers.
; obmc_masks and pw_512 share an address: the first 4 bytes are the pw_512
; constant (2x dw 512), reusing the table's otherwise-unused leading slot.
; NOTE(review): consumers presumably index past this 4-byte overlap - confirm
; against the blend_v/blend_h users.
obmc_masks:
pw_512:         times 2 dw 512
                ; 2
                db 45, 19, 64,  0
                ; 4
                db 39, 25, 50, 14, 59,  5, 64,  0
                ; 8
                db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
                ; 16
                db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
                db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
                ; 32
                db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
                db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
                db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
                db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
49
; Byte-lane source patterns for the 8x8 warp filter. permA/permB gather
; sliding 4-byte windows (x..x+3, x+1..x+4, ...) from two interleaved source
; regions, forming the per-phase horizontal filter inputs in place.
warp_8x8_permA: db  4,  5,  6,  7, 16, 17, 18, 19,  5,  6,  7,  8, 17, 18, 19, 20
                db  6,  7,  8,  9, 18, 19, 20, 21,  7,  8,  9, 10, 19, 20, 21, 22
                db  8,  9, 10, 11, 20, 21, 22, 23,  9, 10, 11, 12, 21, 22, 23, 24
                db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26
warp_8x8_permB: db  0,  1,  2,  3, 20, 21, 22, 23,  1,  2,  3,  4, 21, 22, 23, 24
                db  2,  3,  4,  5, 22, 23, 24, 25,  3,  4,  5,  6, 23, 24, 25, 26
                db  4,  5,  6,  7, 24, 25, 26, 27,  5,  6,  7,  8, 25, 26, 27, 28
                db  6,  7,  8,  9, 26, 27, 28, 29,  7,  8,  9, 10, 27, 28, 29, 30
; permC/permD interleave source bytes with -1 lanes; NOTE(review): the -1
; lanes presumably produce zero bytes (pshufb-style zeroing) so each selected
; byte becomes the low half of a 16-bit lane - confirm against the warp kernel.
warp_8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
warp_8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
pd_0to7:        dd  0,  1,  2,  3,  4,  5,  6,  7
warp_8x8_hpack: db  3, 11,  3, 11, 35, 43, 35, 43
pd_16384:       dd 16384
pd_262144:      dd 262144
; Final lane-selection patterns packing the warp accumulators down to the
; 8-bit (warp_8x8_end) and 16-bit intermediate (warp_8x8t_end) outputs.
warp_8x8_end:   db  0,  4, 16, 20, 32, 36, 48, 52,  2,  6, 18, 22, 34, 38, 50, 54
warp_8x8t_end:  db  2,  3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59
                db  6,  7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
; Dword scatter pattern for 4-wide bidirectional averaging output.
bidir_sctr_w4:  dd  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
; w_mask permutation tables: byte indices into a 64-byte (zmm) register pair,
; grouping samples per block width before mask computation. Odd indices in
; the first half, even in the second (per-pair hi/lo separation).
wm_420_perm4:   db  1,  3,  9, 11,  5,  7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
                db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
                db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
                db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
wm_420_perm8:   db  1,  3, 17, 19,  5,  7, 21, 23,  9, 11, 25, 27, 13, 15, 29, 31
                db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
                db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
                db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
wm_420_perm16:  db  1,  3, 33, 35,  5,  7, 37, 39,  9, 11, 41, 43, 13, 15, 45, 47
                db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
                db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
                db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
; Mask-byte extraction patterns (indices 0-127 span two zmm registers, as
; used with vpermi2b-style two-source permutes): 420 takes every 4th byte,
; 422 every 2nd, 444 every byte pair split odd/even.
wm_420_mask:    db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
                db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
                db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
wm_422_mask:    db  2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
                db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
                db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
wm_444_mask:    db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
                db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
                db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
                db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
; vpermb source patterns for the horizontal bilinear filter: each output
; position reads the adjacent byte pair (x, x+1) for pmaddubsw.
bilin_h_perm16: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
                db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
                db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
                db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
bilin_h_perm32: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
                db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
                db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
                db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
; Vertical bilinear patterns: interleave bytes of two consecutive rows so
; pmaddubsw can blend row pairs. Indices >= 64 reach into the second source
; of a two-register permute.
bilin_v_perm8:  db  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23
                db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
                db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39
                db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71
bilin_v_perm16: db  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23
                db  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
                db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71
                db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79
bilin_v_perm32: db  0, 64,  1, 65,  2, 66,  3, 67,  4, 68,  5, 69,  6, 70,  7, 71
                db  8, 72,  9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79
                db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
                db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95
bilin_v_perm64: dd  0,  0,  4,  8,  1,  1,  5,  9,  2,  2,  6, 10,  3,  3,  7, 11
; Subpel horizontal patterns: sliding 4-byte windows (x..x+3, x+1..x+4, ...)
; feeding the 8-tap filter's pmaddubsw pairs.
spel_h_perm16:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
                db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
                db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
spel_h_perm32:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
                db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
                db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
; Subpel vertical patterns: interleave consecutive rows (row n with row n+1)
; for the vertical filter's pmaddubsw pairs; the 16a/16b variants differ only
; in the ordering of the middle two row-pair groups.
spel_v_perm8:   db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
                db  8, 16,  9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23
                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
                db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39
spel_v_perm16a: db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
                db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
                db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm16b: db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
                db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
                db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
                db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm32:  db  0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39
                db  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
                db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
                db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
; 2D (horizontal+vertical) subpel permutation patterns: pair up the 16-bit
; intermediates of successive filtered rows (word pairs, hence the stride-2
; byte indices) for the vertical pmaddwd/pmaddubsw stage.
spel_hv_perm4a: db  8,  9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
                db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
                db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
                db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
spel_hv_perm4d: db 18, 19,  0,  1, 22, 23,  4,  5, 26, 27,  8,  9, 30, 31, 12, 13
                db  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29
spel_hv_perm8a: db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
                db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
                db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
                db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_hv_perm8b: db 34, 35,  0,  1, 38, 39,  4,  5, 42, 43,  8,  9, 46, 47, 12, 13
                db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
                db  0,  1, 32, 33,  4,  5, 36, 37,  8,  9, 40, 41, 12, 13, 44, 45
                db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
spel_hv_perm16a:db  0,  1,  2,  3, 32, 33, 34, 35,  1,  2,  3,  4, 33, 34, 35, 36
                db  2,  3,  4,  5, 34, 35, 36, 37,  3,  4,  5,  6, 35, 36, 37, 38
                db  8,  9, 10, 11, 40, 41, 42, 43,  9, 10, 11, 12, 41, 42, 43, 44
                db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
spel_hv_perm16b:db  0,  1,  2,  3,  1,  2,  3,  4,  4,  5,  6,  7,  5,  6,  7,  8
                db  2,  3,  4,  5,  3,  4,  5,  6,  6,  7,  8,  9,  7,  8,  9, 10
                db  8,  9, 10, 11,  9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
                db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
; Final selection patterns extracting the high (rounded) bytes of the 2D
; filter's word results.
spel_hv_end16:  db  1,  3, 17, 19,  5,  7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
                db  9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
spel_hv_end:    db  1,  3,  5,  7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
; 128-bit pshufb patterns used by the narrow-width (xmm) code paths.
deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
subpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
bilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
bilin_v_shuf4:  db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
; Scaled-MC / resize helper tables: per-lane multipliers and dword lane
; selections for splitting even/odd filter phases.
rescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
resize_permA:   dd  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
resize_permB:   dd  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
resize_permC:   dd  0,  4,  8, 12
; Edge clamp: repeats byte 0 on the left and byte 7 on the right.
resize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
pb_02461357:    db  0,  2,  4,  6,  1,  3,  5,  7
179
180wm_420_perm64:  dq 0xfedcba9876543210
181wm_sign:        dd 0x40804080, 0xc0c0c0c0, 0x40404040
182
183pb_8x0_8x8: times 8 db 0
184            times 8 db 8
185pb_4:       times 4 db 4
186pb_32:      times 4 db 32
187pb_127:     times 4 db 127
188pw_m128     times 2 dw -128
189pw_m256:    times 2 dw -256
190pw_1024:    times 2 dw 1024
191pw_2048:    times 2 dw 2048
192pw_6903:    times 2 dw 6903
193pw_8192:    times 2 dw 8192
194pd_32:              dd 32
195pd_34:              dd 34
196pd_63:              dd 63
197pd_512:             dd 512
198
; Aliases into existing rodata to avoid duplicating constants:
;   pb_m64 -> wm_sign second dword (bytes 0xc0 = -64)
;   pb_64  -> wm_sign third dword  (bytes 0x40 =  64)
;   pd_2   -> third dword of pd_0to7 (= 2)
%define pb_m64 (wm_sign+4)
%define pb_64  (wm_sign+8)
%define pd_2   (pd_0to7+8)

cextern mc_subpel_filters
; Table biased by -8; NOTE(review): presumably so the scaled filter index
; computed by the 8tap code addresses entry 0 correctly - confirm at use site.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter
cextern resize_filter
207
; BASE_JMP_TABLE name, isa_suffix, widths...
; Emits one dw entry per width: the offset of the function's width-specific
; entry point relative to its base entry label (name_isa). Also defines
; name_isa_table biased by the first width, so the table is indexed at
; runtime with wq*2 where wq = tzcnt(width).
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro
217
; HV_JMP_TABLE name, filter, isa_suffix, type_mask, widths...
; Emits up to three dw jump tables for the .h_wN / .v_wN / .hv_wN entry
; points of a mangled 8bpc mc function; bits 0/1/2 of type_mask select which
; of the h/v/hv tables are generated. Each table symbol is biased by the
; first width argument (%5) for tzcnt-based indexing, and offsets are taken
; relative to the name_isa base label (%%base).
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro
249
; BIDIR_JMP_TABLE name, isa_suffix, widths...
; Emits one dd entry per width: the offset of the mangled function's .wN
; entry point relative to the table symbol itself (the table doubles as the
; runtime base address). The exported name_isa_table symbol is biased by
; 2*first-width so indexing with wq*4 (wq = tzcnt(width)) hits entry 0.
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
260
; Base entry labels referenced by the jump tables: the shared .put/.prep
; tails inside the bilin functions serve as the offset origin for all
; width-specific entry points of this ISA level.
%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)

; Assembly-time distance from a function's base label to one of its jump
; tables, e.g. table_offset(put, _bilin_h) inside put_bilin.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

; Jump-table instantiations. put supports w=2..128, prep w=4..128; the
; type_mask argument of HV_JMP_TABLE selects h/v/hv variants (7 = all three,
; 3 = h+v, 2 = v only).
BASE_JMP_TABLE put,  avx512icl,         2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl,            4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  8tap,  avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap,  avx512icl, 3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx512icl,            4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl,          4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl,           4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl,     4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl,     4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl,     4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx512icl,          4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx512icl,     2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx512icl,     2, 4, 8, 16, 32, 64, 128
283
284SECTION .text
285
; Run the wrapped instruction/macro line with 256-bit (ymm) register
; mappings, then switch back to 512-bit (zmm) mode. Lets zmm code reuse a
; macroized sequence at half vector width without duplicating it.
%macro WRAP_YMM 1+
INIT_YMM cpuname
    %1
INIT_ZMM cpuname
%endmacro
291
292INIT_ZMM avx512icl
293cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
294    movifnidn          mxyd, r6m ; mx
295    lea                  r7, [put_avx512icl]
296    tzcnt                wd, wm
297    movifnidn            hd, hm
298    test               mxyd, mxyd
299    jnz .h
300    mov                mxyd, r7m ; my
301    test               mxyd, mxyd
302    jnz .v
303.put:
304    movzx                wd, word [r7+wq*2+table_offset(put,)]
305    add                  wq, r7
306    jmp                  wq
307.put_w2:
308    movzx               r6d, word [srcq+ssq*0]
309    movzx               r7d, word [srcq+ssq*1]
310    lea                srcq, [srcq+ssq*2]
311    mov        [dstq+dsq*0], r6w
312    mov        [dstq+dsq*1], r7w
313    lea                dstq, [dstq+dsq*2]
314    sub                  hd, 2
315    jg .put_w2
316    RET
317.put_w4:
318    mov                 r6d, [srcq+ssq*0]
319    mov                 r7d, [srcq+ssq*1]
320    lea                srcq, [srcq+ssq*2]
321    mov        [dstq+dsq*0], r6d
322    mov        [dstq+dsq*1], r7d
323    lea                dstq, [dstq+dsq*2]
324    sub                  hd, 2
325    jg .put_w4
326    RET
327.put_w8:
328    mov                  r6, [srcq+ssq*0]
329    mov                  r7, [srcq+ssq*1]
330    lea                srcq, [srcq+ssq*2]
331    mov        [dstq+dsq*0], r6
332    mov        [dstq+dsq*1], r7
333    lea                dstq, [dstq+dsq*2]
334    sub                  hd, 2
335    jg .put_w8
336    RET
337.put_w16:
338    movu               xmm0, [srcq+ssq*0]
339    movu               xmm1, [srcq+ssq*1]
340    lea                srcq, [srcq+ssq*2]
341    mova       [dstq+dsq*0], xmm0
342    mova       [dstq+dsq*1], xmm1
343    lea                dstq, [dstq+dsq*2]
344    sub                  hd, 2
345    jg .put_w16
346    RET
347.put_w32:
348    movu                ym0, [srcq+ssq*0]
349    movu                ym1, [srcq+ssq*1]
350    lea                srcq, [srcq+ssq*2]
351    mova       [dstq+dsq*0], ym0
352    mova       [dstq+dsq*1], ym1
353    lea                dstq, [dstq+dsq*2]
354    sub                  hd, 2
355    jg .put_w32
356    RET
357.put_w64:
358    movu                 m0, [srcq+ssq*0]
359    movu                 m1, [srcq+ssq*1]
360    lea                srcq, [srcq+ssq*2]
361    mova       [dstq+dsq*0], m0
362    mova       [dstq+dsq*1], m1
363    lea                dstq, [dstq+dsq*2]
364    sub                  hd, 2
365    jg .put_w64
366    RET
367.put_w128:
368    movu                 m0, [srcq+ssq*0+64*0]
369    movu                 m1, [srcq+ssq*0+64*1]
370    movu                 m2, [srcq+ssq*1+64*0]
371    movu                 m3, [srcq+ssq*1+64*1]
372    lea                srcq, [srcq+ssq*2]
373    mova  [dstq+dsq*0+64*0], m0
374    mova  [dstq+dsq*0+64*1], m1
375    mova  [dstq+dsq*1+64*0], m2
376    mova  [dstq+dsq*1+64*1], m3
377    lea                dstq, [dstq+dsq*2]
378    sub                  hd, 2
379    jg .put_w128
380    RET
381.h:
382    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
383    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
384    imul               mxyd, 255
385    vbroadcasti128       m4, [bilin_h_perm16]
386    add                mxyd, 16
387    vpbroadcastw         m5, mxyd
388    mov                mxyd, r7m ; my
389    test               mxyd, mxyd
390    jnz .hv
391    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
392    vpbroadcastd         m3, [pw_2048]
393    add                  wq, r7
394    jmp                  wq
395.h_w2:
396    movd               xmm0, [srcq+ssq*0]
397    pinsrd             xmm0, [srcq+ssq*1], 1
398    lea                srcq, [srcq+ssq*2]
399    pshufb             xmm0, xm4
400    pmaddubsw          xmm0, xm5
401    pmulhrsw           xmm0, xm3
402    packuswb           xmm0, xmm0
403    pextrw     [dstq+dsq*0], xmm0, 0
404    pextrw     [dstq+dsq*1], xmm0, 2
405    lea                dstq, [dstq+dsq*2]
406    sub                  hd, 2
407    jg .h_w2
408    RET
409.h_w4:
410    mova               xmm4, [bilin_h_shuf4]
411.h_w4_loop:
412    movq               xmm0, [srcq+ssq*0]
413    movhps             xmm0, [srcq+ssq*1]
414    lea                srcq, [srcq+ssq*2]
415    pshufb             xmm0, xmm4
416    pmaddubsw          xmm0, xm5
417    pmulhrsw           xmm0, xm3
418    packuswb           xmm0, xmm0
419    movd       [dstq+dsq*0], xmm0
420    pextrd     [dstq+dsq*1], xmm0, 1
421    lea                dstq, [dstq+dsq*2]
422    sub                  hd, 2
423    jg .h_w4_loop
424    RET
425.h_w8:
426    movu                xm0, [srcq+ssq*0]
427    vinserti32x4        ym0, [srcq+ssq*1], 1
428    lea                srcq, [srcq+ssq*2]
429    pshufb              ym0, ym4
430    pmaddubsw           ym0, ym5
431    pmulhrsw            ym0, ym3
432    vpmovuswb           xm0, ym0
433    movq       [dstq+dsq*0], xm0
434    movhps     [dstq+dsq*1], xm0
435    lea                dstq, [dstq+dsq*2]
436    sub                  hd, 2
437    jg .h_w8
438    RET
439.h_w16:
440    mova                 m4, [bilin_h_perm16]
441.h_w16_loop:
442    movu                ym0, [srcq+ssq*0]
443    vinserti32x8         m0, [srcq+ssq*1], 1
444    lea                srcq, [srcq+ssq*2]
445    vpermb               m0, m4, m0
446    pmaddubsw            m0, m5
447    pmulhrsw             m0, m3
448    vpmovuswb           ym0, m0
449    mova         [dstq+dsq*0], xm0
450    vextracti128 [dstq+dsq*1], ym0, 1
451    lea                dstq, [dstq+dsq*2]
452    sub                  hd, 2
453    jg .h_w16_loop
454    RET
455.h_w32:
456    movu                ym0, [srcq+ssq*0+8*0]
457    vinserti32x8         m0, [srcq+ssq*1+8*0], 1
458    movu                ym1, [srcq+ssq*0+8*1]
459    vinserti32x8         m1, [srcq+ssq*1+8*1], 1
460    lea                srcq, [srcq+ssq*2]
461    pshufb               m0, m4
462    pshufb               m1, m4
463    pmaddubsw            m0, m5
464    pmaddubsw            m1, m5
465    pmulhrsw             m0, m3
466    pmulhrsw             m1, m3
467    packuswb             m0, m1
468    mova          [dstq+dsq*0], ym0
469    vextracti32x8 [dstq+dsq*1], m0, 1
470    lea                dstq, [dstq+dsq*2]
471    sub                  hd, 2
472    jg .h_w32
473    RET
474.h_w64:
475    movu                 m0, [srcq+8*0]
476    movu                 m1, [srcq+8*1]
477    pshufb               m0, m4
478    pshufb               m1, m4
479    pmaddubsw            m0, m5
480    pmaddubsw            m1, m5
481    pmulhrsw             m0, m3
482    pmulhrsw             m1, m3
483    packuswb             m0, m1
484    add                srcq, ssq
485    mova             [dstq], m0
486    add                dstq, dsq
487    dec                  hd
488    jg .h_w64
489    RET
490.h_w128:
491    movu                 m0, [srcq+8*0]
492    movu                 m2, [srcq+8*1]
493    movu                 m1, [srcq+8*8]
494    movu                 m6, [srcq+8*9]
495    add                srcq, ssq
496    REPX  {pshufb    x, m4}, m0, m2, m1, m6
497    REPX  {pmaddubsw x, m5}, m0, m2, m1, m6
498    REPX  {pmulhrsw  x, m3}, m0, m2, m1, m6
499    packuswb             m0, m2
500    packuswb             m1, m6
501    mova        [dstq+64*0], m0
502    mova        [dstq+64*1], m1
503    add                dstq, dsq
504    dec                  hd
505    jg .h_w128
506    RET
507.v:
508    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
509    imul               mxyd, 255
510    vpbroadcastd         m5, [pw_2048]
511    add                mxyd, 16
512    add                  wq, r7
513    vpbroadcastw         m4, mxyd
514    jmp                  wq
515.v_w2:
516    movd               xmm0,       [srcq+ssq*0]
517.v_w2_loop:
518    pinsrw             xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
519    lea                srcq,       [srcq+ssq*2]
520    pinsrw             xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
521    pshuflw            xmm1, xmm1, q2301           ; 1 0
522    punpcklbw          xmm1, xmm0
523    pmaddubsw          xmm1, xm4
524    pmulhrsw           xmm1, xm5
525    packuswb           xmm1, xmm1
526    pextrw     [dstq+dsq*0], xmm1, 1
527    pextrw     [dstq+dsq*1], xmm1, 0
528    lea                dstq, [dstq+dsq*2]
529    sub                  hd, 2
530    jg .v_w2_loop
531    RET
532.v_w4:
533    movd               xmm0, [srcq+ssq*0]
534.v_w4_loop:
535    vpbroadcastd       xmm2, [srcq+ssq*1]
536    lea                srcq, [srcq+ssq*2]
537    vpblendd           xmm1, xmm2, xmm0, 0x01 ; 0 1
538    vpbroadcastd       xmm0, [srcq+ssq*0]
539    vpblendd           xmm2, xmm0, 0x02       ; 1 2
540    punpcklbw          xmm1, xmm2
541    pmaddubsw          xmm1, xm4
542    pmulhrsw           xmm1, xm5
543    packuswb           xmm1, xmm1
544    movd       [dstq+dsq*0], xmm1
545    pextrd     [dstq+dsq*1], xmm1, 1
546    lea                dstq, [dstq+dsq*2]
547    sub                  hd, 2
548    jg .v_w4_loop
549    RET
550.v_w8:
551    movq               xmm0, [srcq+ssq*0]
552.v_w8_loop:
553    movq               xmm2, [srcq+ssq*1]
554    lea                srcq, [srcq+ssq*2]
555    punpcklbw          xmm1, xmm0, xmm2
556    movq               xmm0, [srcq+ssq*0]
557    punpcklbw          xmm2, xmm0
558    pmaddubsw          xmm1, xm4
559    pmaddubsw          xmm2, xm4
560    pmulhrsw           xmm1, xm5
561    pmulhrsw           xmm2, xm5
562    packuswb           xmm1, xmm2
563    movq       [dstq+dsq*0], xmm1
564    movhps     [dstq+dsq*1], xmm1
565    lea                dstq, [dstq+dsq*2]
566    sub                  hd, 2
567    jg .v_w8_loop
568    RET
569.v_w16:
570    movu               xmm0, [srcq+ssq*0]
571.v_w16_loop:
572    vbroadcasti128     ymm3, [srcq+ssq*1]
573    lea                srcq, [srcq+ssq*2]
574    vpblendd           ymm2, ymm3, ymm0, 0x0f ; 0 1
575    vbroadcasti128     ymm0, [srcq+ssq*0]
576    vpblendd           ymm3, ymm0, 0xf0       ; 1 2
577    punpcklbw          ymm1, ymm2, ymm3
578    punpckhbw          ymm2, ymm3
579    pmaddubsw          ymm1, ym4
580    pmaddubsw          ymm2, ym4
581    pmulhrsw           ymm1, ym5
582    pmulhrsw           ymm2, ym5
583    packuswb           ymm1, ymm2
584    mova         [dstq+dsq*0], xmm1
585    vextracti128 [dstq+dsq*1], ymm1, 1
586    lea                dstq, [dstq+dsq*2]
587    sub                  hd, 2
588    jg .v_w16_loop
589    vzeroupper
590    RET
591.v_w32:
592    movu                ym0, [srcq+ssq*0]
593    kxnorb               k1, k1, k1
594.v_w32_loop:
595    vbroadcasti32x8      m3, [srcq+ssq*1]
596    lea                srcq, [srcq+ssq*2]
597    vpblendmd        m2{k1}, m3, m0 ; 0 1
598    vbroadcasti32x8      m0, [srcq+ssq*0]
599    vpblendmd        m3{k1}, m0, m3 ; 1 2
600    punpcklbw            m1, m2, m3
601    punpckhbw            m2, m3
602    pmaddubsw            m1, m4
603    pmaddubsw            m2, m4
604    pmulhrsw             m1, m5
605    pmulhrsw             m2, m5
606    packuswb             m1, m2
607    mova          [dstq+dsq*0], ym1
608    vextracti32x8 [dstq+dsq*1], m1, 1
609    lea                dstq, [dstq+dsq*2]
610    sub                  hd, 2
611    jg .v_w32_loop
612    RET
613.v_w64:
614    movu                 m0, [srcq+ssq*0]
615.v_w64_loop:
616    movu                 m3, [srcq+ssq*1]
617    lea                srcq, [srcq+ssq*2]
618    punpcklbw            m1, m0, m3
619    punpckhbw            m6, m0, m3
620    movu                 m0, [srcq+ssq*0]
621    pmaddubsw            m1, m4
622    pmaddubsw            m6, m4
623    punpcklbw            m2, m3, m0
624    punpckhbw            m3, m0
625    pmaddubsw            m2, m4
626    pmaddubsw            m3, m4
627    REPX   {pmulhrsw x, m5}, m1, m6, m2, m3
628    packuswb             m1, m6
629    packuswb             m2, m3
630    mova       [dstq+dsq*0], m1
631    mova       [dstq+dsq*1], m2
632    lea                dstq, [dstq+dsq*2]
633    sub                  hd, 2
634    jg .v_w64_loop
635    RET
636.v_w128:
637    movu                 m0, [srcq+64*0]
638    movu                 m1, [srcq+64*1]
639.v_w128_loop:
640    add                srcq, ssq
641    movu                 m2, [srcq+64*0]
642    movu                 m3, [srcq+64*1]
643    punpcklbw            m6, m0, m2
644    pmaddubsw            m6, m4
645    punpckhbw            m0, m2
646    pmaddubsw            m0, m4
647    punpcklbw            m7, m1, m3
648    pmaddubsw            m7, m4
649    punpckhbw            m1, m3
650    pmaddubsw            m1, m4
651    REPX   {pmulhrsw x, m5}, m6, m0, m7, m1
652    packuswb             m6, m0
653    mova                 m0, m2
654    packuswb             m7, m1
655    mova                 m1, m3
656    mova        [dstq+64*0], m6
657    mova        [dstq+64*1], m7
658    add                dstq, dsq
659    dec                  hd
660    jg .v_w128_loop
661    RET
662.hv:
663    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
664    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
665    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
666    WIN64_SPILL_XMM       8
    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
    vpbroadcastd         m7, [pw_2048]
    add                  wq, r7
    vpbroadcastw         m6, mxyd     ; m6 = my << 11, m7 = pmulhrsw rounding constant
    jmp                  wq
; Each .hv_w* section below: horizontal bilin via pshufb/vpermb + pmaddubsw
; (producing 4-bit-scaled words), then vertical blend
;   out = prev + (((cur - prev) * 2 * (my << 11)) >> 16)  ==  prev + ((cur-prev)*my >> 4)
; via psubw/paddw/pmulhw, and pmulhrsw with 2048 to round back to pixel range.
.hv_w2:
    vpbroadcastd       xmm0, [srcq+ssq*0]
    pshufb             xmm0, xm4
    pmaddubsw          xmm0, xm5               ; row 0 h-filtered
.hv_w2_loop:
    movd               xmm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pinsrd             xmm1, [srcq+ssq*0], 1
    pshufb             xmm1, xm4
    pmaddubsw          xmm1, xm5               ; 1 _ 2 _
    shufps             xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
    mova               xmm0, xmm1              ; carry newest rows into next iteration
    psubw              xmm1, xmm2
    paddw              xmm1, xmm1
    pmulhw             xmm1, xm6               ; (cur-prev)*my >> 4
    paddw              xmm1, xmm2
    pmulhrsw           xmm1, xm7
    packuswb           xmm1, xmm1
    pextrw     [dstq+dsq*0], xmm1, 0
    pextrw     [dstq+dsq*1], xmm1, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova               xmm4, [bilin_h_shuf4]
    movddup            xmm0, [srcq+ssq*0]
    pshufb             xmm0, xmm4
    pmaddubsw          xmm0, xm5               ; row 0 h-filtered
.hv_w4_loop:
    movq               xmm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps             xmm1, [srcq+ssq*0]
    pshufb             xmm1, xmm4
    pmaddubsw          xmm1, xm5               ; 1 2
    shufps             xmm2, xmm0, xmm1, q1032 ; 0 1
    mova               xmm0, xmm1
    psubw              xmm1, xmm2
    paddw              xmm1, xmm1
    pmulhw             xmm1, xm6
    paddw              xmm1, xmm2
    pmulhrsw           xmm1, xm7
    packuswb           xmm1, xmm1
    movd       [dstq+dsq*0], xmm1
    pextrd     [dstq+dsq*1], xmm1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    vbroadcasti128      ym0, [srcq+ssq*0]
    pshufb              ym0, ym4
    pmaddubsw           ym0, ym5
.hv_w8_loop:
    movu                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti128         ym1, [srcq+ssq*0], 1
    pshufb              ym1, ym4
    pmaddubsw           ym1, ym5            ; 1 2
    valignq             ym2, ym1, ym0, 2    ; 0 1 (shift previous row in)
    mova                ym0, ym1
    psubw               ym1, ym2
    paddw               ym1, ym1
    pmulhw              ym1, ym6
    paddw               ym1, ym2
    pmulhrsw            ym1, ym7
    vpmovuswb           xm1, ym1            ; saturating word->byte narrow
    movq       [dstq+dsq*0], xm1
    movhps     [dstq+dsq*1], xm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
    vbroadcasti32x8      m0, [srcq+ssq*0]
    mova                 m4, [bilin_h_perm16]
    vpermb               m0, m4, m0
    pmaddubsw            m0, m5
.hv_w16_loop:
    movu                ym1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m1, [srcq+ssq*0], 1
    vpermb               m1, m4, m1
    pmaddubsw            m1, m5        ; 1 2
    valignq              m2, m1, m0, 4 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    paddw                m1, m1
    pmulhw               m1, m6
    paddw                m1, m2
    pmulhrsw             m1, m7
    vpmovuswb           ym1, m1
    mova          [dstq+dsq*0], xm1
    vextracti32x4 [dstq+dsq*1], ym1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
    mova                 m4, [bilin_h_perm32]
    vpermb               m0, m4, [srcq+ssq*0]
    pmovzxbq             m8, [pb_02461357]   ; qword permute to undo packuswb lane interleave
    pmaddubsw            m0, m5
.hv_w32_loop:
    vpermb               m2, m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpermb               m3, m4, [srcq+ssq*0]
    pmaddubsw            m2, m5
    psubw                m1, m2, m0
    paddw                m1, m1
    pmulhw               m1, m6
    paddw                m1, m0
    pmaddubsw            m0, m3, m5          ; h-filter row 2; becomes prev for next iter
    psubw                m3, m0, m2
    paddw                m3, m3
    pmulhw               m3, m6
    paddw                m3, m2
    pmulhrsw             m1, m7
    pmulhrsw             m3, m7
    packuswb             m1, m3
    vpermq               m1, m8, m1
    mova          [dstq+dsq*0], ym1
    vextracti32x8 [dstq+dsq*1], m1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w32_loop
    RET
.hv_w64:
    ; one row per iteration, two 64-byte halves (m0/m1 hold previous row)
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
.hv_w64_loop:
    add                srcq, ssq
    movu                 m2, [srcq+8*0]
    movu                 m3, [srcq+8*1]
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    psubw                m8, m2, m0
    psubw                m9, m3, m1
    paddw                m8, m8
    pmulhw               m8, m6
    paddw                m9, m9
    pmulhw               m9, m6
    paddw                m8, m0
    pmulhrsw             m8, m7
    paddw                m9, m1
    pmulhrsw             m9, m7
    mova                 m0, m2
    mova                 m1, m3
    packuswb             m8, m9
    mova             [dstq], m8
    add                dstq, dsq
    dec                  hd
    jg .hv_w64_loop
    RET
.hv_w128:
    ; one row per iteration, four 64-byte quarters (m0-m3 hold previous row)
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    movu                 m2, [srcq+8*8]
    movu                 m3, [srcq+8*9]
    REPX  {pshufb    x, m4}, m0, m1, m2, m3
    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
.hv_w128_loop:
    add                srcq, ssq
    movu                 m8, [srcq+8*0]
    movu                 m9, [srcq+8*1]
    movu                m10, [srcq+8*8]
    movu                m11, [srcq+8*9]
    REPX  {pshufb    x, m4}, m8, m9, m10, m11
    REPX  {pmaddubsw x, m5}, m8, m9, m10, m11
    psubw               m12, m8, m0
    psubw               m13, m9, m1
    psubw               m14, m10, m2
    psubw               m15, m11, m3
    paddw               m12, m12
    pmulhw              m12, m6
    paddw               m13, m13
    pmulhw              m13, m6
    paddw               m14, m14
    pmulhw              m14, m6
    paddw               m15, m15
    pmulhw              m15, m6
    paddw               m12, m0
    pmulhrsw            m12, m7
    paddw               m13, m1
    pmulhrsw            m13, m7
    paddw               m14, m2
    pmulhrsw            m14, m7
    paddw               m15, m3
    pmulhrsw            m15, m7
    mova                 m0, m8
    mova                 m1, m9
    mova                 m2, m10
    mova                 m3, m11
    packuswb            m12, m13
    packuswb            m14, m15
    mova        [dstq+64*0], m12
    mova        [dstq+64*1], m14
    add                dstq, dsq
    dec                  hd
    jg .hv_w128_loop
    RET
879
880DECLARE_REG_TMP 3, 5, 6
881
;-----------------------------------------------------------------------------
; void prep_bilin_8bpc(int16_t *tmp, const pixel *src, ptrdiff_t stride,
;                      int w, int h, int mx, int my)
; Writes (w x h) 16-bit intermediate values to tmp:
;   - no subpel (mx == my == 0): pixels zero-extended and shifted left by 4
;   - .h / .v: one-dimensional bilinear filter via pmaddubsw
;   - .hv: horizontal filter followed by a my-weighted vertical blend
; Width is dispatched through the prep_avx512icl jump tables (t2 = table base).
;-----------------------------------------------------------------------------
cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn          mxyd, r5m ; mx
    lea                  t2, [prep_avx512icl]
    tzcnt                wd, wm
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
.prep:
    ; no filtering: copy with zero-extension and <<4 scaling
    movzx                wd, word [t2+wq*2+table_offset(prep,)]
    add                  wq, t2
    lea            stride3q, [strideq*3]
    jmp                  wq
.prep_w4:
    movd               xmm0, [srcq+strideq*0]
    pinsrd             xmm0, [srcq+strideq*1], 1
    pinsrd             xmm0, [srcq+strideq*2], 2
    pinsrd             xmm0, [srcq+stride3q ], 3
    lea                srcq, [srcq+strideq*4]
    pmovzxbw            ym0, xmm0
    psllw               ym0, 4
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movq               xmm0, [srcq+strideq*0]
    movq               xmm1, [srcq+strideq*1]
    vinserti128         ym0, ymm0, [srcq+strideq*2], 1
    vinserti128         ym1, ymm1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    punpcklqdq          ym0, ym1        ; interleave 4 rows into one register
    pmovzxbw             m0, ym0
    psllw                m0, 4
    mova             [tmpq], m0
    add                tmpq, 32*2
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu               xmm0, [srcq+strideq*0]
    vinserti128         ym0, ymm0, [srcq+strideq*1], 1
    movu               xmm1, [srcq+strideq*2]
    vinserti128         ym1, ymm1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    pmovzxbw             m0, ym0
    pmovzxbw             m1, ym1
    psllw                m0, 4
    psllw                m1, 4
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 32*4
    sub                  hd, 4
    jg .prep_w16
    RET
.prep_w32:
    pmovzxbw             m0, [srcq+strideq*0]
    pmovzxbw             m1, [srcq+strideq*1]
    pmovzxbw             m2, [srcq+strideq*2]
    pmovzxbw             m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    REPX       {psllw x, 4}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 4
    jg .prep_w32
    RET
.prep_w64:
    pmovzxbw             m0, [srcq+strideq*0+32*0]
    pmovzxbw             m1, [srcq+strideq*0+32*1]
    pmovzxbw             m2, [srcq+strideq*1+32*0]
    pmovzxbw             m3, [srcq+strideq*1+32*1]
    lea                srcq, [srcq+strideq*2]
    REPX       {psllw x, 4}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 2
    jg .prep_w64
    RET
.prep_w128:
    pmovzxbw             m0, [srcq+32*0]
    pmovzxbw             m1, [srcq+32*1]
    pmovzxbw             m2, [srcq+32*2]
    pmovzxbw             m3, [srcq+32*3]
    REPX       {psllw x, 4}, m0, m1, m2, m3
    mova    [tmpq+64*0], m0
    mova    [tmpq+64*1], m1
    mova    [tmpq+64*2], m2
    mova    [tmpq+64*3], m3
    add                tmpq, 64*4
    add                srcq, strideq
    dec                  hd
    jg .prep_w128
    RET
.h:
    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
    ; = (16 - mx) * src[x] + mx * src[x + 1]
    ; m5 = packed byte pair {16-mx, mx} for pmaddubsw (mxyd*255+16 == mx<<8 | 16-mx)
    imul               mxyd, 255
    add                mxyd, 16
    vpbroadcastw         m5, mxyd
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
    add                  wq, t2
    lea            stride3q, [strideq*3]
    jmp                  wq
.h_w4:
    vbroadcasti32x4     ym4, [bilin_h_shuf4]
.h_w4_loop:
    movq               xmm0, [srcq+strideq*0]
    movq               xmm1, [srcq+strideq*1]
    vinserti32x4        ym0, ymm0, [srcq+strideq*2], 1
    vinserti32x4        ym1, ymm1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    punpcklqdq          ym0, ym1
    pshufb              ym0, ym4        ; pair up adjacent pixels for pmaddubsw
    pmaddubsw           ym0, ym5
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
.h_w8:
    vbroadcasti32x4      m4, [bilin_h_perm16]
.h_w8_loop:
    movu               xmm0, [srcq+strideq*0]
    vinserti32x4        ym0, ymm0, [srcq+strideq*1], 1
    vinserti32x4         m0, [srcq+strideq*2], 2
    vinserti32x4         m0, [srcq+stride3q ], 3
    lea                srcq, [srcq+strideq*4]
    pshufb               m0, m4
    pmaddubsw            m0, m5
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .h_w8_loop
    RET
.h_w16:
    mova                 m4, [bilin_h_perm16]
.h_w16_loop:
    movu                ym0, [srcq+strideq*0]
    vinserti32x8         m0, [srcq+strideq*1], 1
    movu                ym1, [srcq+strideq*2]
    vinserti32x8         m1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    vpermb               m0, m4, m0     ; cross-lane byte pairing
    vpermb               m1, m4, m1
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    sub                  hd, 4
    jg .h_w16_loop
    RET
.h_w32:
    mova                 m4, [bilin_h_perm32]
.h_w32_loop:
    vpermb               m0, m4, [srcq+strideq*0]
    vpermb               m1, m4, [srcq+strideq*1]
    vpermb               m2, m4, [srcq+strideq*2]
    vpermb               m3, m4, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 4
    jg .h_w32_loop
    RET
.h_w64:
    mova                 m4, [bilin_h_perm32]
.h_w64_loop:
    vpermb               m0, m4, [srcq+strideq*0+32*0]
    vpermb               m1, m4, [srcq+strideq*0+32*1]
    vpermb               m2, m4, [srcq+strideq*1+32*0]
    vpermb               m3, m4, [srcq+strideq*1+32*1]
    lea                srcq, [srcq+strideq*2]
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 2
    jg .h_w64_loop
    RET
.h_w128:
    mova                 m4, [bilin_h_perm32]
.h_w128_loop:
    vpermb               m0, m4, [srcq+32*0]
    vpermb               m1, m4, [srcq+32*1]
    vpermb               m2, m4, [srcq+32*2]
    vpermb               m3, m4, [srcq+32*3]
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    add                srcq, strideq
    dec                  hd
    jg .h_w128_loop
    RET
.v:
    WIN64_SPILL_XMM       7
    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
    imul               mxyd, 255
    add                mxyd, 16       ; m6 = byte pair {16-my, my}, as in .h
    add                  wq, t2
    lea            stride3q, [strideq*3]
    vpbroadcastw         m6, mxyd
    jmp                  wq
.v_w4:
    vpbroadcastd        xm0, [srcq+strideq*0]
    mov                 r3d, 0x29      ; write mask selecting dwords 0,3,5
    vbroadcasti32x4     ym3, [bilin_v_shuf4]
    kmovb                k1, r3d
.v_w4_loop:
    ; build {row0..row4} pairs in a single ymm via masked broadcasts
    vpblendmd       xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
    vpbroadcastd        ym2, [srcq+strideq*2]
    vpbroadcastd    ym2{k1}, [srcq+stride3q ]             ; __2_ 23__
    lea                srcq, [srcq+strideq*4]
    vpbroadcastd        ym0, [srcq+strideq*0]
    punpckhqdq      ym2{k1}, ym1, ym0                     ; 012_ 234_
    pshufb              ym2, ym3
    pmaddubsw           ym2, ym6
    mova             [tmpq], ym2
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    mova                 m5, [bilin_v_perm8]
    vbroadcasti32x4     ym0, [srcq+strideq*0]
.v_w8_loop:
    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1
    vpbroadcastq        ym0, [srcq+strideq*2]
    vinserti32x4         m1, [srcq+stride3q ], 2
    lea                srcq, [srcq+strideq*4]
    vinserti32x4        ym0, [srcq+strideq*0], 0
    vpermt2b             m1, m5, m0    ; interleave consecutive rows for pmaddubsw
    pmaddubsw            m1, m6
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    mova                 m5, [bilin_v_perm16]
    movu                xm0, [srcq+strideq*0]
.v_w16_loop:
    movu                xm2, [srcq+strideq*2]
    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1
    vpermt2b             m1, m5, m2
    vinserti32x4        ym2, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    movu                xm0, [srcq+strideq*0] ; row 4 carries over as next row 0
    vpermt2b             m2, m5, m0
    pmaddubsw            m1, m6
    pmaddubsw            m2, m6
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    mova                 m5, [bilin_v_perm32]
    movu                ym0, [srcq+strideq*0]
.v_w32_loop:
    movu                ym2, [srcq+strideq*1]
    movu                ym3, [srcq+strideq*2]
    movu                ym4, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpermt2b             m0, m5, m2
    vpermt2b             m2, m5, m3
    vpermt2b             m3, m5, m4
    pmaddubsw            m1, m0, m6
    movu                ym0, [srcq+strideq*0]
    vpermt2b             m4, m5, m0
    pmaddubsw            m2, m6
    pmaddubsw            m3, m6
    pmaddubsw            m4, m6
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    mova        [tmpq+64*2], m3
    mova        [tmpq+64*3], m4
    add                tmpq, 64*4
    sub                  hd, 4
    jg .v_w32_loop
    RET
.v_w64:
    mova                 m5, [bilin_v_perm64]
    vpermq               m0, m5, [srcq+strideq*0]
.v_w64_loop:
    vpermq               m1, m5, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m4, m0, m1    ; interleave row pairs byte-wise
    punpckhbw            m2, m0, m1
    vpermq               m0, m5, [srcq+strideq*0]
    punpcklbw            m3, m1, m0
    punpckhbw            m1, m0
    pmaddubsw            m4, m6
    pmaddubsw            m2, m6
    pmaddubsw            m3, m6
    pmaddubsw            m1, m6
    mova        [tmpq+64*0], m4
    mova        [tmpq+64*1], m2
    mova        [tmpq+64*2], m3
    mova        [tmpq+64*3], m1
    add                tmpq, 64*4
    sub                  hd, 2
    jg .v_w64_loop
    RET
.v_w128:
    mova                 m5, [bilin_v_perm64]
    vpermq               m0, m5, [srcq+strideq*0+ 0]
    vpermq               m1, m5, [srcq+strideq*0+64]
.v_w128_loop:
    ; two rows per iteration, each row in two 64-byte halves
    vpermq               m2, m5, [srcq+strideq*1+ 0]
    vpermq               m3, m5, [srcq+strideq*1+64]
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m4, m0, m2
    punpckhbw            m0, m2
    pmaddubsw            m4, m6
    pmaddubsw            m0, m6
    mova        [tmpq+64*0], m4
    mova        [tmpq+64*1], m0
    punpcklbw            m4, m1, m3
    punpckhbw            m1, m3
    pmaddubsw            m4, m6
    pmaddubsw            m1, m6
    mova        [tmpq+64*2], m4
    mova        [tmpq+64*3], m1
    vpermq               m0, m5, [srcq+strideq*0+ 0]
    vpermq               m1, m5, [srcq+strideq*0+64]
    punpcklbw            m4, m2, m0
    punpckhbw            m2, m0
    pmaddubsw            m4, m6
    pmaddubsw            m2, m6
    mova        [tmpq+64*4], m4
    mova        [tmpq+64*5], m2
    punpcklbw            m4, m3, m1
    punpckhbw            m3, m1
    pmaddubsw            m4, m6
    pmaddubsw            m3, m6
    mova        [tmpq+64*6], m4
    mova        [tmpq+64*7], m3
    add                tmpq, 64*8
    sub                  hd, 2
    jg .v_w128_loop
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
    ; m6 = my << 11 so that pmulhrsw computes the rounded my-weighted delta
    WIN64_SPILL_XMM       7
    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
    shl                mxyd, 11
    vpbroadcastw         m6, mxyd
    add                  wq, t2
    lea            stride3q, [strideq*3]
    jmp                  wq
.hv_w4:
    vbroadcasti32x4     ym4, [bilin_h_shuf4]
    vpbroadcastq        ym0, [srcq+strideq*0]
    pshufb              ym0, ym4
    pmaddubsw           ym0, ym5        ; row 0 h-filtered
.hv_w4_loop:
    movq               xmm1, [srcq+strideq*1]
    movq               xmm2, [srcq+strideq*2]
    vinserti32x4        ym1, ymm1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    vinserti32x4        ym2, ymm2, [srcq+strideq*0], 1
    punpcklqdq          ym1, ym2
    pshufb              ym1, ym4
    pmaddubsw           ym1, ym5         ; 1 2 3 4
    valignq             ym2, ym1, ym0, 3 ; 0 1 2 3
    mova                ym0, ym1
    psubw               ym1, ym2
    pmulhrsw            ym1, ym6
    paddw               ym1, ym2
    mova             [tmpq], ym1
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    vbroadcasti32x4      m4, [bilin_h_perm16]
    vbroadcasti32x4      m0, [srcq+strideq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w8_loop:
    movu               xmm1, [srcq+strideq*1]
    vinserti128         ym1, ymm1, [srcq+strideq*2], 1
    vinserti128          m1, [srcq+stride3q ], 2
    lea                srcq, [srcq+strideq*4]
    vinserti128          m1, [srcq+strideq*0], 3
    pshufb               m1, m4
    pmaddubsw            m1, m5        ; 1 2 3 4
    valignq              m2, m1, m0, 6 ; 0 1 2 3
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m6
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 4
    jg .hv_w8_loop
    RET
.hv_w16:
    mova                 m4, [bilin_h_perm16]
    vbroadcasti32x8      m0, [srcq+strideq*0]
    vpermb               m0, m4, m0
    pmaddubsw            m0, m5
.hv_w16_loop:
    movu                ym1, [srcq+strideq*1]
    vinserti32x8         m1, [srcq+strideq*2], 1
    movu                ym2, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vinserti32x8         m2, [srcq+strideq*0], 1
    vpermb               m1, m4, m1
    vpermb               m2, m4, m2
    pmaddubsw            m1, m5            ; 1 2
    vshufi32x4           m3, m0, m1, q1032 ; 0 1
    pmaddubsw            m0, m2, m5        ; 3 4
    vshufi32x4           m2, m1, m0, q1032 ; 2 3
    psubw                m1, m3
    pmulhrsw             m1, m6
    paddw                m1, m3
    psubw                m3, m0, m2
    pmulhrsw             m3, m6
    paddw                m3, m2
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m3
    add                tmpq, 64*2
    sub                  hd, 4
    jg .hv_w16_loop
    RET
.hv_w32:
    mova                 m4, [bilin_h_perm32]
    vpermb               m0, m4, [srcq+strideq*0]
    pmaddubsw            m0, m5
.hv_w32_loop:
    vpermb               m1, m4, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vpermb               m2, m4, [srcq+strideq*0]
    pmaddubsw            m1, m5
    psubw                m3, m1, m0
    pmulhrsw             m3, m6
    paddw                m3, m0
    pmaddubsw            m0, m2, m5     ; row 2 becomes prev for next iteration
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+64*0], m3
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 2
    jg .hv_w32_loop
    RET
.hv_w64:
    mova                 m4, [bilin_h_perm32]
    vpermb               m0, m4, [srcq+32*0]
    vpermb               m1, m4, [srcq+32*1]
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
.hv_w64_loop:
    add                srcq, strideq
    vpermb               m2, m4, [srcq+32*0]
    vpermb               m3, m4, [srcq+32*1]
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    psubw                m7, m2, m0
    psubw                m8, m3, m1
    pmulhrsw             m7, m6
    pmulhrsw             m8, m6
    paddw                m7, m0
    mova                 m0, m2
    paddw                m8, m1
    mova                 m1, m3
    mova        [tmpq+64*0], m7
    mova        [tmpq+64*1], m8
    add                tmpq, 64*2
    dec                  hd
    jg .hv_w64_loop
    RET
.hv_w128:
    mova                 m4, [bilin_h_perm32]
    vpermb               m0, m4, [srcq+32*0]
    vpermb               m1, m4, [srcq+32*1]
    vpermb               m2, m4, [srcq+32*2]
    vpermb               m3, m4, [srcq+32*3]
    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
.hv_w128_loop:
    add                srcq, strideq
    vpermb               m7, m4, [srcq+32*0]
    vpermb               m8, m4, [srcq+32*1]
    vpermb               m9, m4, [srcq+32*2]
    vpermb              m10, m4, [srcq+32*3]
    REPX  {pmaddubsw x, m5}, m7, m8, m9, m10
    psubw               m11, m7, m0
    psubw               m12, m8, m1
    psubw               m13, m9, m2
    psubw               m14, m10, m3
    REPX  {pmulhrsw  x, m6}, m11, m12, m13, m14
    paddw               m11, m0
    mova                 m0, m7
    paddw               m12, m1
    mova                 m1, m8
    paddw               m13, m2
    mova                 m2, m9
    paddw               m14, m3
    mova                 m3, m10
    mova        [tmpq+64*0], m11
    mova        [tmpq+64*1], m12
    mova        [tmpq+64*2], m13
    mova        [tmpq+64*3], m14
    add                tmpq, 64*4
    dec                  hd
    jg .hv_w128_loop
    RET
1425
1426; int8_t subpel_filters[5][15][8]
1427%assign FILTER_REGULAR (0*15 << 16) | 3*15
1428%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
1429%assign FILTER_SHARP   (2*15 << 16) | 3*15
1430
; Emit a filter-combination entry point: loads the packed FILTER_* constants
; for the horizontal (%3) and vertical (%4) filter types into t0d/t1d, then
; (unless %5 is omitted) tail-jumps into the shared implementation %5.
; Omitting %5 makes the stub fall through into the cglobal that follows it.
%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d       ; same type both directions: reuse t0d
%else
    mov                 t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
1443
; 8-tap horizontal filter for one register of pixels using vpdpbusd
; (4 taps per dword dot-product).
;   m%1: in = source pixels, out = filtered words (>> 6, unsigned-saturated)
;   m%2-m%4: temporaries holding the three shifted byte arrangements
;   %5: nonzero = use cross-lane vpermb controls (m6-m8), else in-lane pshufb
; Register assumptions (set up by the caller): m5 = initial accumulator
; (presumably the rounding bias — confirm at the call sites outside this
; chunk), m6-m8 = shuffle/permute controls, m9/m10 = packed filter taps.
%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
%if %5
    vpermb              m%2, m6, m%1
    vpermb              m%3, m7, m%1
    vpermb              m%4, m8, m%1
%else
%if %2 < %4 ; reuse a previous value if possible
    pshufb              m%2, m%1, m6
%endif
    pshufb              m%3, m%1, m7
    pshufb              m%4, m%1, m8
%endif
    mova                m%1, m5        ; seed both accumulators with m5
    vpdpbusd            m%1, m%2, m9
    mova                m%2, m5
    vpdpbusd            m%2, m%3, m9
    vpdpbusd            m%1, m%3, m10
    vpdpbusd            m%2, m%4, m10
    packusdw            m%1, m%2       ; dwords -> unsigned-saturated words
    psrlw               m%1, 6
%endmacro
1465
1466%if WIN64
1467DECLARE_REG_TMP 4, 5
1468%else
1469DECLARE_REG_TMP 7, 8
1470%endif
1471
1472; Due to the use of vpdpbusd (which does 4 pixels per instruction) in
1473; the horizontal filter, 6-tap is only used for the vertical filter.
1474%define PUT_8TAP_FN FN put_8tap,
1475PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_6tap_8bpc
1476PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_6tap_8bpc
1477PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
1478PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
1479PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
1480PUT_8TAP_FN regular,        REGULAR, REGULAR
1481
1482cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
1483%define base r8-put_avx512icl
1484    imul                mxd, mxm, 0x010101
1485    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1486    imul                myd, mym, 0x010101
1487    add                 myd, t1d ; 6tap_v, my, 4tap_v
1488    lea                  r8, [put_avx512icl]
1489    movsxd               wq, wm
1490    movifnidn            hd, hm
1491    test                mxd, 0xf00
1492    jnz .h
1493    test                myd, 0xf00
1494    jnz .v
1495.put:
1496    tzcnt                wd, wd
1497    movzx                wd, word [r8+wq*2+table_offset(put,)]
1498    add                  wq, r8
1499    lea                  r6, [ssq*3]
1500    lea                  r7, [dsq*3]
1501%if WIN64
1502    pop                  r8
1503%endif
1504    jmp                  wq
1505.v:
1506    movzx               mxd, myb
1507    shr                 myd, 16
1508    cmp                  hd, 6
1509    cmovs               myd, mxd
1510    tzcnt               r6d, wd
1511    movzx               r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
1512    vpbroadcastd         m6, [pw_512]
1513    lea                 myq, [base+subpel_filters+1+myq*8]
1514    vpbroadcastw         m7, [myq+0]
1515    add                  r6, r8
1516    vpbroadcastw         m8, [myq+2]
1517    mov                 nsq, ssq
1518    vpbroadcastw         m9, [myq+4]
1519    neg                 nsq
1520    jmp                  r6
; 6-tap vertical, width 2. m7/m8/m9 hold the three filter tap pairs
; (loaded from myq+0/+2/+4); rows are kept as interleaved byte pairs so each
; pmaddubsw applies one tap pair to two output rows (a/b) at once.
.v_w2:
    movd               xmm2, [srcq+nsq*2]
    pinsrw             xmm2, [srcq+nsq*1], 2
    pinsrw             xmm2, [srcq+ssq*0], 4
    pinsrw             xmm2, [srcq+ssq*1], 6  ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    vpbroadcastd       xmm0, [srcq+ssq*0]
    palignr            xmm3, xmm0, xmm2, 4    ; 1 2 3 4
    punpcklbw          xmm1, xmm2, xmm3       ; 01 12
    punpckhbw          xmm2, xmm3             ; 23 34
.v_w2_loop:
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          xmm3, xmm1, xm7        ; a0 b0
    mova               xmm1, xmm2
    pmaddubsw          xmm2, xm8              ; a1 b1
    paddw              xmm3, xmm2
    vpblendd           xmm2, xmm0, xmm4, 0x02 ; 4 5
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm0, 0x02       ; 5 6
    punpcklbw          xmm2, xmm4             ; 45 56
    pmaddubsw          xmm4, xmm2, xm9        ; a2 b2
    paddw              xmm3, xmm4
    pmulhrsw           xmm3, xm6
    packuswb           xmm3, xmm3
    pextrw     [dstq+dsq*0], xmm3, 0
    pextrw     [dstq+dsq*1], xmm3, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4: ; 6-tap vertical, w=4: gather rows 0-4, then filter two rows per iteration
    movd               xmm2, [srcq+nsq*2]
    pinsrd             xmm2, [srcq+nsq*1], 1
    pinsrd             xmm2, [srcq+ssq*0], 2
    pinsrd             xmm2, [srcq+ssq*1], 3  ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    vpbroadcastd       xmm0, [srcq+ssq*0]
    palignr            xmm3, xmm0, xmm2, 4    ; 1 2 3 4
    punpcklbw          xmm1, xmm2, xmm3       ; 01 12
    punpckhbw          xmm2, xmm3             ; 23 34
.v_w4_loop:
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          xmm3, xmm1, xm7        ; a0 b0
    mova               xmm1, xmm2
    pmaddubsw          xmm2, xm8              ; a1 b1
    paddw              xmm3, xmm2
    vpblendd           xmm2, xmm0, xmm4, 0x02 ; 4 5
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm0, 0x02       ; 5 6
    punpcklbw          xmm2, xmm4             ; 45 56
    pmaddubsw          xmm4, xmm2, xm9        ; a2 b2
    paddw              xmm3, xmm4
    pmulhrsw           xmm3, xm6
    packuswb           xmm3, xmm3
    movd       [dstq+dsq*0], xmm3
    pextrd     [dstq+dsq*1], xmm3, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8: ; 6-tap vertical, w=8: ymm lanes hold two adjacent source rows each
    movq               xmm1, [srcq+nsq*2]
    vpbroadcastq       ymm3, [srcq+nsq*1]
    vpbroadcastq       ymm2, [srcq+ssq*0]
    vpbroadcastq       ymm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm1, ymm3, 0x30
    vpblendd           ymm3, ymm2, 0x30
    punpcklbw          ymm1, ymm3      ; 01 12
    vpblendd           ymm2, ymm4, 0x30
    vpblendd           ymm4, ymm0, 0x30
    punpcklbw          ymm2, ymm4      ; 23 34
.v_w8_loop:
    vpbroadcastq       ymm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          ymm3, ymm1, ym7 ; a0 b0
    mova               ymm1, ymm2
    pmaddubsw          ymm2, ym8       ; a1 b1
    paddw              ymm3, ymm2
    vpblendd           ymm2, ymm0, ymm4, 0x30
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm4, ymm0, 0x30
    punpcklbw          ymm2, ymm4      ; 45 56
    pmaddubsw          ymm4, ymm2, ym9 ; a2 b2
    paddw              ymm3, ymm4
    pmulhrsw           ymm3, ym6
    vextracti128       xmm4, ymm3, 1
    packuswb           xmm3, xmm4
    movq       [dstq+dsq*0], xmm3
    movhps     [dstq+dsq*1], xmm3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    vzeroupper ; legacy (non-EVEX) ymm regs used above; clear upper state before returning
    RET
.v_w16: ; 6-tap vertical, w=16: vpermb interleaves row pairs across a zmm reg
    mova                 m5, [spel_v_perm16a]
    vbroadcasti32x4      m1, [srcq+nsq*2]
    vbroadcasti32x4     ym3, [srcq+nsq*1]
    mov                 r6d, 0x0f
    vbroadcasti32x4      m2, [srcq+ssq*0]
    kmovb                k1, r6d           ; k1 = low 4 qword lanes for vshufpd merges
    vbroadcasti32x4     ym4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vshufpd          m1{k1}, m3, m2, 0xcc
    vshufpd          m2{k1}, m4, m0, 0xcc
    vpermb               m1, m5, m1 ; 01 12
    vpermb               m2, m5, m2 ; 23 34
.v_w16_loop:
    vbroadcasti32x4     ym4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m3, m1, m7 ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, m8     ; a1 b1
    paddw                m3, m2
    mova                 m2, m0
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vshufpd          m2{k1}, m4, m0, 0xcc
    vpermb               m2, m5, m2 ; 45 56
    pmaddubsw            m4, m2, m9 ; a2 b2
    paddw                m3, m4
    pmulhrsw             m3, m6
    vextracti32x8       ym4, m3, 1
    packuswb            ym3, ym4
    mova          [dstq+dsq*0], xm3
    vextracti32x4 [dstq+dsq*1], ym3, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
.v_w32: ; 6-tap vertical, w=32: two row-pair streams (m10/m11 perms), two accumulators
    mova                m10, [spel_v_perm32]
    pmovzxbq             m5, [pb_02461357]
    vpshrdw             m11, m10, m10, 8 ; m11 = m10 shifted to pair with the following row
    movu                ym0, [srcq+nsq*2]
    vinserti32x8         m0, [srcq+nsq*1], 1
    vpermb               m1, m10, m0 ; 01
    vinserti32x8         m0, [srcq+ssq*0], 0
    vpermb               m2, m11, m0 ; 12
    vinserti32x8         m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    vpermb               m3, m10, m0 ; 23
    vinserti32x8         m0, [srcq+ssq*0], 0
    vpermb               m4, m11, m0 ; 34
.v_w32_loop:
    vinserti32x8         m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pmaddubsw           m12, m1, m7
    mova                 m1, m3
    pmaddubsw           m13, m2, m7
    mova                 m2, m4
    pmaddubsw           m14, m3, m8
    vpermb               m3, m10, m0 ; 45
    vinserti32x8         m0, [srcq+ssq*0], 0
    pmaddubsw           m15, m4, m8
    vpermb               m4, m11, m0 ; 56
    paddw               m12, m14
    pmaddubsw           m14, m3, m9
    paddw               m13, m15
    pmaddubsw           m15, m4, m9
    paddw               m12, m14
    paddw               m13, m15
    pmulhrsw            m12, m6
    pmulhrsw            m13, m6
    packuswb            m12, m13
    vpermq              m12, m5, m12 ; undo pack's lane interleave
    mova          [dstq+dsq*0], ym12
    vextracti32x8 [dstq+dsq*1], m12, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    RET
.v_w64: ; 6-tap vertical, w>=64: process in 64-pixel columns
.v_w128:
    ; r6d packs the column counter in the high bits and h in the low byte
    ; (restored below via movzx hd, r6b).
    lea                 r6d, [hq+wq*4-256]
.v_loop0:
    movu                 m2, [srcq+nsq*2]
    movu                 m4, [srcq+nsq*1]
    lea                  r4, [srcq+ssq*2]
    movu                m11, [srcq+ssq*0]
    movu                m13, [srcq+ssq*1]
    mov                  r7, dstq
    movu                 m0, [r4  +ssq*0]
    punpcklbw            m1, m2, m4   ; 01l
    punpckhbw            m2, m4       ; 01h
    punpcklbw            m3, m4, m11  ; 12l
    punpckhbw            m4, m11      ; 12h
    punpcklbw           m10, m11, m13 ; 23l
    punpckhbw           m11, m13      ; 23h
    punpcklbw           m12, m13, m0  ; 34l
    punpckhbw           m13, m0       ; 34h
.v_loop:
    movu                 m5, [r4+ssq*1]
    pmaddubsw           m14, m1, m7   ; a0l
    mova                 m1, m10
    pmaddubsw           m10, m8       ; a1l
    lea                  r4, [r4+ssq*2]
    pmaddubsw           m15, m2, m7   ; a0h
    mova                 m2, m11
    pmaddubsw           m11, m8       ; a1h
    paddw               m14, m10
    punpcklbw           m10, m0, m5   ; 45l
    paddw               m15, m11
    punpckhbw           m11, m0, m5   ; 45h
    pmaddubsw            m0, m10, m9  ; a2l
    paddw               m14, m0
    pmaddubsw            m0, m11, m9  ; a2h
    paddw               m15, m0
    movu                 m0, [r4+ssq*0]
    pmulhrsw            m14, m6
    pmulhrsw            m15, m6
    packuswb            m14, m15
    pmaddubsw           m15, m3, m7   ; b0l
    mova                 m3, m12
    pmaddubsw           m12, m8       ; b1l
    mova         [r7+dsq*0], m14
    pmaddubsw           m14, m4, m7   ; b0h
    mova                 m4, m13
    pmaddubsw           m13, m8       ; b1h
    paddw               m15, m12
    punpcklbw           m12, m5, m0   ; 56l
    paddw               m14, m13
    punpckhbw           m13, m5, m0   ; 56h
    pmaddubsw            m5, m12, m9  ; b2l
    paddw               m15, m5
    pmaddubsw            m5, m13, m9  ; b2h
    paddw               m14, m5
    pmulhrsw            m15, m6
    pmulhrsw            m14, m6
    packuswb            m15, m14
    mova         [r7+dsq*1], m15
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .v_loop
    add                srcq, 64      ; advance to next 64-pixel column
    add                dstq, 64
    movzx                hd, r6b    ; reload row count for the next column
    sub                 r6d, 256
    jg .v_loop0
    RET
.h: ; horizontal-only requested; if no vertical subpel, share the 8tap h path
    test                myd, 0xf00
    jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2
.hv: ; combined horizontal+vertical (6-tap vertical), w<=4 handled here
    vpbroadcastd         m9, [pd_34]
    mova               xm10, [spel_hv_end]
    pxor                xm0, xm0
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd         m7, [base+subpel_filters+mxq*8+2]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd        ; h < 6: use the 4-tap variant of the filter
    vpbroadcastq        ym1, [base+subpel_filters+1+myq*8] ; +1: middle 6 of the 8 taps
    mov                 nsq, ssq
    punpcklbw           ym0, ym1
    neg                 nsq
    psraw               ym0, 2 ; << 6
    pshufd             ym11, ym0, q0000
    pshufd             ym12, ym0, q1111
    pshufd             ym13, ym0, q2222
    cmp                  wd, 4
    je .hv_w4
    ; w == 2
    vbroadcasti128      ym5, [subpel_h_shuf4]
    movq               xmm0, [srcq+nsq*2]
    movhps             xmm0, [srcq+nsq*1]
    movq               xmm2, [srcq+ssq*0]
    movhps             xmm2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpbroadcastq       ymm1, [srcq+ssq*0]
    vpblendd           ymm0, ymm1, 0x30
    pshufb             xmm2, xm5        ; 2 3
    pshufb             ymm0, ym5        ; 0 1   4
    mova               xmm1, xm9
    vpdpbusd           xmm1, xmm2, xm7
    mova               ymm2, ym9
    vpdpbusd           ymm2, ymm0, ym7
    packssdw           ymm2, ymm1
    psraw              ymm2, 2
    vextracti128       xmm0, ymm2, 1
    vzeroupper ; done with legacy ymm regs; EVEX xmm-only from here on
    palignr            xmm0, xmm2, 4
    punpcklwd          xmm1, xmm2, xmm0 ; 01 12
    punpckhwd          xmm2, xmm0       ; 23 34
.hv_w2_loop:
    movq               xmm3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps             xmm3, [srcq+ssq*0]
    pmaddwd            xmm4, xmm1, xm11 ; a0 b0
    mova               xmm1, xmm2
    vpdpwssd           xmm4, xmm2, xm12 ; a1 b1
    pshufb             xmm3, xm5
    mova               xmm2, xm9
    vpdpbusd           xmm2, xmm3, xm7
    packssdw           xmm3, xmm2, xmm2
    psraw              xmm3, 2
    palignr            xmm2, xmm3, xmm0, 12
    mova               xmm0, xmm3
    punpcklwd          xmm2, xmm3       ; 45 56
    vpdpwssd           xmm4, xmm2, xm13 ; a2 b2
    packuswb           xmm4, xmm4
    pshufb             xmm4, xm10
    pextrw     [dstq+dsq*0], xmm4, 0
    pextrw     [dstq+dsq*1], xmm4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4: ; 6-tap hv, w=4: horizontal pass via vpdpbusd, vertical via vpdpwssd
    movq                xm2, [srcq+nsq*2]
    vpbroadcastq        ym1, [srcq+nsq*1]
    vinserti32x4        ym2, [srcq+ssq*0], 1
    vinserti32x4         m1, [srcq+ssq*1], 2 ; _ 1 3
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4      m5, [subpel_h_shufA]
    vinserti32x4         m2, [srcq+ssq*0], 2 ; 0 2 4
    pshufb               m1, m5
    mova                 m0, m9
    pshufb               m2, m5
    mova                 m3, m9
    vpdpbusd             m0, m1, m7
    mova                ym1, [spel_hv_perm4a]
    vpdpbusd             m3, m2, m7
    mova                ym2, [spel_hv_perm4b]
    mov                 r6d, 0x5555
    mova                ym6, [spel_hv_perm4d]
    packssdw             m0, m3
    kmovw                k1, r6d       ; k1 selects alternating words for the merge below
    psraw                m0, 2 ; _ 0   1 2   3 4   5 6
    vpermb              ym1, ym1, ym0 ; 01 12
    vpermb               m2, m2, m0   ; 23 34
.hv_w4_loop:
    movq                xm3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym3, [srcq+ssq*0], 1
    pmaddwd             ym4, ym1, ym11 ; a0 b0
    mova                ym1, ym2
    pshufb              ym3, ym5
    mova                ym0, ym9
    vpdpbusd            ym0, ym3, ym7
    vpdpwssd            ym4, ym2, ym12 ; a1 b1
    vpsraw          ym2{k1}, ym0, 2    ; 5 6
    vpermb              ym2, ym6, ym2  ; 45 56
    vpdpwssd            ym4, ym2, ym13 ; a2 b2
    packuswb            ym4, ym4
    vpermb              ym4, ym10, ym4
    movd       [dstq+dsq*0], xm4
    pextrd     [dstq+dsq*1], xm4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8: ; 6-tap hv, w>4: full 8-tap horizontal filter, 6-tap vertical
    shr                 mxd, 16
    sub                srcq, 3
    vpbroadcastd        m11, [base+subpel_filters+mxq*8+0]
    vpbroadcastd        m12, [base+subpel_filters+mxq*8+4]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastq         m1, [base+subpel_filters+1+myq*8] ; middle 6 vertical taps
    mov                 nsq, ssq
    punpcklbw            m0, m1
    neg                 nsq
    psraw                m0, 2 ; << 6
    pshufd              m13, m0, q0000
    pshufd              m14, m0, q1111
    pshufd              m15, m0, q2222
    cmp                  wd, 8
    jne .hv_w16
    movu                xm0, [srcq+nsq*2]
    vinserti32x4        ym0, [srcq+nsq*1], 1
    vbroadcasti32x4      m1, [subpel_h_shufA]
    vinserti32x4         m0, [srcq+ssq*0], 2
    vbroadcasti32x4      m4, [subpel_h_shufB]
    vinserti32x4         m0, [srcq+ssq*1], 3
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4      m7, [subpel_h_shufC]
    vbroadcasti32x4     ym5, [srcq+ssq*0]
    vbroadcasti32x8      m6, [subpel_h_shufA]
    pshufb               m1, m0, m1   ; 0 1 2 3    0123
    mova                 m2, m9
    vpdpbusd             m2, m1, m11
    pshufb               m4, m0, m4   ; 0 1 2 3    4567
    mova                 m1, m9
    vpdpbusd             m1, m4, m11
    pshufb               m0, m7       ; 0 1 2 3    89ab
    pshufb              ym7, ym5, ym6 ; 4     0123 4567
    mova                ym3, ym9
    vpdpbusd            ym3, ym7, ym11
    vbroadcasti32x8      m7, [subpel_h_shufB]
    vpdpbusd             m2, m4, m12
    mova                 m4, [spel_hv_perm8a]
    pshufb              ym5, ym7      ; 4     4567 89ab
    vpdpbusd             m1, m0, m12
    vpaddd               m0, m4, [pb_32] {1to16}
    vpdpbusd            ym3, ym5, ym12
    mova                 m5, [spel_hv_perm8b]
    mov                  r6, 0x55555555ff00
    packssdw             m2, m1
    vpmovsdw            xm3, ym3
    kmovq                k1, r6
    psraw                m2, 2        ; 0 1 2 3
    psraw               xm3, 2        ; 4
    vpermb               m1, m4, m2   ; 01 12
    kshiftrq             k2, k1, 16
    vpermt2b             m2, m0, m3   ; 23 34
.hv_w8_loop:
    vbroadcasti32x4     ym3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4  m3{k1}, [srcq+ssq*0]
    pmaddwd              m0, m1, m13  ; a0 b0
    pshufb               m1, m3, m6   ; 5 6   0123 4567
    mova                 m4, m9
    vpdpbusd             m4, m1, m11
    pshufb               m3, m7       ; 5 6   4567 89ab
    vpdpwssd             m0, m2, m14  ; a1 b1
    mova                 m1, m2
    vpdpbusd             m4, m3, m12
    psraw            m2{k2}, m4, 2    ; 53 64
    vpermb               m2, m5, m2   ; 45 56
    vpdpwssd             m0, m2, m15  ; a2 b2
    packuswb             m0, m0
    vpermb               m0, m10, m0
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w16: ; 6-tap hv, w>=16: 16-pixel columns; r6d packs column count (high) and h (low byte)
    movu                m19, [spel_hv_perm16a]
    vpbroadcastd         m7, [pb_4]
    lea                 r6d, [wq*2-32]
    mova                 m6, [spel_hv_perm16b]
    paddb               m20, m7, m19
    lea                 r6d, [hq+r6*8]
    paddb               m21, m7, m20
    mova               ym10, [spel_hv_end16]
    paddb                m7, m6
.hv_w16_loop0:
    movu               ym16, [srcq+nsq*2]
    vinserti32x8        m16, [srcq+nsq*1], 1
    lea                  r4, [srcq+ssq*2]
    movu               ym17, [srcq+ssq*0]
    vinserti32x8        m17, [srcq+ssq*1], 1
    mov                  r7, dstq
    movu               ym18, [r4  +ssq*0]
    vpermb               m2, m19, m16    ; 0 1   0123   89ab
    mova                 m1, m9
    vpermb               m3, m21, m16    ; 0 1   89ab   ghij
    vpdpbusd             m1, m2, m11
    mova                 m2, m9
    vpermb               m4, m19, m17    ; 2 3   0123   89ab
    vpdpbusd             m2, m3, m12
    mova                 m3, m9
    vpermb               m5, m21, m17    ; 2 3   89ab   ghij
    vpdpbusd             m3, m4, m11
    mova                 m4, m9
    vpermb               m0, m6, m18     ; 4     0145   2367   89cd   abef
    vpdpbusd             m4, m5, m12
    mova                 m5, m9
    vpermb              m16, m20, m16    ; 0 1   4567   cdef
    vpdpbusd             m5, m0, m11
    vpermb              m17, m20, m17    ; 2 3   4567   cdef
    vpdpbusd             m1, m16, m12
    vpermb              m18, m7, m18     ; 4     4589   67ab   cdgh   efij
    vpdpbusd             m2, m16, m11
    vpdpbusd             m3, m17, m12
    vpdpbusd             m4, m17, m11
    vpdpbusd             m5, m18, m12
    packssdw             m1, m2          ; 01
    packssdw             m3, m4          ; 23
    REPX       {psraw x, 2}, m1, m3, m5
    vpshrdd              m2, m1, m3, 16  ; 12
    vpshrdd              m4, m3, m5, 16  ; 34
.hv_w16_loop:
    movu               ym18, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    vinserti32x8        m18, [r4+ssq*0], 1
    pmaddwd             m16, m1, m13     ; a0
    vpermb               m1, m19, m18    ; 5 6   0123   89ab
    pmaddwd             m17, m2, m13     ; b0
    vpermb               m2, m20, m18    ; 5 6   4567   cdef
    mova                 m0, m9
    vpdpbusd             m0, m1, m11
    vpermb              m18, m21, m18
    mova                 m1, m9
    vpdpbusd             m1, m2, m11
    vpdpwssd            m16, m3, m14    ; a1
    vpdpwssd            m17, m4, m14    ; b1
    vpdpbusd             m0, m2, m12
    mova                 m2, m4
    vpdpbusd             m1, m18, m12
    packssdw             m0, m1
    mova                 m1, m3
    psraw                m4, m0, 2      ; 5 6
    vpshrdd              m3, m2, m4, 16 ; 4 5
    vpdpwssd            m17, m4, m15    ; b2
    vpdpwssd            m16, m3, m15    ; a2
    packuswb            m16, m17
    vpermb              m16, m10, m16
    mova         [r7+dsq*0], xm16
    vextracti128 [r7+dsq*1], ym16, 1
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    add                srcq, 16      ; next 16-pixel column
    add                dstq, 16
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .hv_w16_loop0
    vzeroupper
    RET
2043
; Entry points for the 8-tap filter combinations; the last invocation omits
; the target name, presumably so it falls through into the cglobal body below.
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_8bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_8bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP
2047
cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r8, [put_avx512icl]
    movsxd               wq, wm
    movifnidn            hd, hm
    test                mxd, 0xf00 ; horizontal subpel?
    jnz .h
    test                myd, 0xf00 ; vertical subpel?
    jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put ; no filtering: plain copy
.v: ; vertical-only 8-tap: broadcast the four tap pairs, dispatch on width
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd       ; h < 6: use the 4-tap variant
    tzcnt               r6d, wd
    lea                 myq, [base+subpel_filters+myq*8]
    movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
    vpbroadcastd         m7, [pw_512]  ; pmulhrsw rounding constant
    vpbroadcastw         m8, [myq+0]
    add                  r6, r8
    vpbroadcastw         m9, [myq+2]
    lea                ss3q, [ssq*3]
    vpbroadcastw        m10, [myq+4]
    sub                srcq, ss3q      ; rewind to the first of the 7 context rows
    vpbroadcastw        m11, [myq+6]
    jmp                  r6
.v_w2: ; 8-tap vertical, w=2
    movd               xmm2, [srcq+ssq*0]
    pinsrw             xmm2, [srcq+ssq*1], 2
    pinsrw             xmm2, [srcq+ssq*2], 4
    add                srcq, ss3q
    pinsrw             xmm2, [srcq+ssq*0], 6  ; 0 1 2 3
    movd               xmm3, [srcq+ssq*1]
    vpbroadcastd       xmm1, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm3, xmm3, xmm1, 0x02 ; 4 5
    vpblendd           xmm1, xmm1, xmm0, 0x02 ; 5 6
    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
    punpcklbw          xmm3, xmm1             ; 45 56
    punpcklbw          xmm1, xmm2, xmm4       ; 01 12
    punpckhbw          xmm2, xmm4             ; 23 34
.v_w2_loop:
    pmaddubsw          xmm5, xmm1, xm8        ; a0 b0
    mova               xmm1, xmm2
    pmaddubsw          xmm2, xm9              ; a1 b1
    paddw              xmm5, xmm2
    mova               xmm2, xmm3
    pmaddubsw          xmm3, xm10             ; a2 b2
    paddw              xmm5, xmm3
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm4, xmm0, 0x02 ; 7 8
    punpcklbw          xmm3, xmm4             ; 67 78
    pmaddubsw          xmm4, xmm3, xm11       ; a3 b3
    paddw              xmm5, xmm4
    pmulhrsw           xmm5, xm7
    packuswb           xmm5, xmm5
    pextrw     [dstq+dsq*0], xmm5, 0
    pextrw     [dstq+dsq*1], xmm5, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4: ; 8-tap vertical, w=4
    movd               xmm2, [srcq+ssq*0]
    pinsrd             xmm2, [srcq+ssq*1], 1
    pinsrd             xmm2, [srcq+ssq*2], 2
    add                srcq, ss3q
    pinsrd             xmm2, [srcq+ssq*0], 3  ; 0 1 2 3
    movd               xmm3, [srcq+ssq*1]
    vpbroadcastd       xmm1, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm3, xmm3, xmm1, 0x02 ; 4 5
    vpblendd           xmm1, xmm1, xmm0, 0x02 ; 5 6
    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
    punpcklbw          xmm3, xmm1             ; 45 56
    punpcklbw          xmm1, xmm2, xmm4       ; 01 12
    punpckhbw          xmm2, xmm4             ; 23 34
.v_w4_loop:
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          xmm5, xmm1, xm8        ; a0 b0
    mova               xmm1, xmm2
    pmaddubsw          xmm2, xm9              ; a1 b1
    paddw              xmm5, xmm2
    mova               xmm2, xmm3
    pmaddubsw          xmm3, xm10             ; a2 b2
    paddw              xmm5, xmm3
    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm4, xmm0, 0x02 ; 7 8
    punpcklbw          xmm3, xmm4             ; 67 78
    pmaddubsw          xmm4, xmm3, xm11       ; a3 b3
    paddw              xmm5, xmm4
    pmulhrsw           xmm5, xm7
    packuswb           xmm5, xmm5
    movd       [dstq+dsq*0], xmm5
    pextrd     [dstq+dsq*1], xmm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8: ; 8-tap vertical, w=8: ymm lanes hold two adjacent source rows each
    movq               xmm1, [srcq+ssq*0]
    vpbroadcastq       ymm0, [srcq+ssq*1]
    vpbroadcastq       ymm2, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq       ymm5, [srcq+ssq*0]
    vpbroadcastq       ymm3, [srcq+ssq*1]
    vpbroadcastq       ymm4, [srcq+ssq*2]
    add                srcq, ss3q
    vpblendd           ymm1, ymm0, 0x30
    vpblendd           ymm0, ymm2, 0x30
    punpcklbw          ymm1, ymm0 ; 01 12
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm2, ymm5, 0x30
    vpblendd           ymm5, ymm3, 0x30
    punpcklbw          ymm2, ymm5 ; 23 34
    vpblendd           ymm3, ymm4, 0x30
    vpblendd           ymm4, ymm0, 0x30
    punpcklbw          ymm3, ymm4 ; 45 56
.v_w8_loop:
    vpbroadcastq       ymm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          ymm5, ymm1, ym8  ; a0 b0
    mova               ymm1, ymm2
    pmaddubsw          ymm2, ym9        ; a1 b1
    paddw              ymm5, ymm2
    mova               ymm2, ymm3
    pmaddubsw          ymm3, ym10       ; a2 b2
    paddw              ymm5, ymm3
    vpblendd           ymm3, ymm0, ymm4, 0x30
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm4, ymm4, ymm0, 0x30
    punpcklbw          ymm3, ymm4       ; 67 78
    pmaddubsw          ymm4, ymm3, ym11 ; a3 b3
    paddw              ymm5, ymm4
    pmulhrsw           ymm5, ym7
    vextracti128       xmm4, ymm5, 1
    packuswb           xmm5, xmm4
    movq       [dstq+dsq*0], xmm5
    movhps     [dstq+dsq*1], xmm5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    vzeroupper ; legacy (non-EVEX) ymm regs used above
    RET
.v_w16: ; 8-tap vertical, w=16
    mova                m12, [spel_v_perm16a]
    vbroadcasti32x4      m1, [srcq+ssq*0]
    vbroadcasti32x4     ym4, [srcq+ssq*1]
    mov                 r6d, 0x0f
    vbroadcasti32x4      m2, [srcq+ssq*2]
    add                srcq, ss3q
    vbroadcasti32x4     ym5, [srcq+ssq*0]
    kmovb                k1, r6d      ; k1 = low 4 qword lanes for vshufpd merges
    vbroadcasti32x4      m3, [srcq+ssq*1]
    vbroadcasti32x4     ym6, [srcq+ssq*2]
    add                srcq, ss3q
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vshufpd          m1{k1}, m4, m2, 0xcc
    vshufpd          m2{k1}, m5, m3, 0xcc
    vshufpd          m3{k1}, m6, m0, 0xcc
    vpermb               m1, m12, m1 ; 01 12
    vpermb               m2, m12, m2 ; 23 34
    vpermb               m3, m12, m3 ; 45 56
.v_w16_loop:
    pmaddubsw            m4, m1, m8  ; a0 b0
    mova                 m1, m2
    pmaddubsw            m5, m2, m9  ; a1 b1
    mova                 m2, m3
    pmaddubsw            m6, m3, m10 ; a2 b2
    mova                 m3, m0
    paddw                m4, m5
    vbroadcasti32x4     ym5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vshufpd          m3{k1}, m5, m0, 0xcc
    vpermb               m3, m12, m3 ; 67 78
    pmaddubsw            m5, m3, m11 ; a3 b3
    paddw                m4, m6
    paddw                m4, m5
    pmulhrsw             m4, m7
    vextracti32x8       ym5, m4, 1
    packuswb            ym4, ym5
    mova          [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
.v_w32: ; 8-tap vertical, w=32: two row-pair streams, two accumulators
    mova                m12, [spel_v_perm32]
    pmovzxbq            m14, [pb_02461357]
    vpshrdw             m13, m12, m12, 8 ; m13 = m12 shifted to pair with the following row
    movu                ym0, [srcq+ssq*0]
    vinserti32x8         m0, [srcq+ssq*1], 1
    vpermb               m1, m12, m0 ; 01
    vinserti32x8         m0, [srcq+ssq*2], 0
    add                srcq, ss3q
    vpermb               m2, m13, m0 ; 12
    vinserti32x8         m0, [srcq+ssq*0], 1
    vpermb               m3, m12, m0 ; 23
    vinserti32x8         m0, [srcq+ssq*1], 0
    vpermb               m4, m13, m0 ; 34
    vinserti32x8         m0, [srcq+ssq*2], 1
    add                srcq, ss3q
    vpermb               m5, m12, m0 ; 45
    vinserti32x8         m0, [srcq+ssq*0], 0
    vpermb               m6, m13, m0 ; 56
.v_w32_loop:
    vinserti32x8         m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pmaddubsw           m15, m1, m8
    mova                 m1, m3
    pmaddubsw           m16, m2, m8
    mova                 m2, m4
    pmaddubsw           m17, m3, m9
    mova                 m3, m5
    pmaddubsw           m18, m4, m9
    mova                 m4, m6
    pmaddubsw           m19, m5, m10
    vpermb               m5, m12, m0 ; 67
    vinserti32x8         m0, [srcq+ssq*0], 0
    pmaddubsw           m20, m6, m10
    vpermb               m6, m13, m0 ; 78
    paddw               m15, m17
    pmaddubsw           m17, m5, m11
    paddw               m16, m18
    pmaddubsw           m18, m6, m11
    paddw               m15, m19
    paddw               m16, m20
    paddw               m15, m17
    paddw               m16, m18
    pmulhrsw            m15, m7
    pmulhrsw            m16, m7
    packuswb            m15, m16
    vpermq              m15, m14, m15 ; undo pack's lane interleave
    mova          [dstq+dsq*0], ym15
    vextracti32x8 [dstq+dsq*1], m15, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    vzeroupper
    RET
.v_w64: ; 8-tap vertical, w>=64: process in 64-pixel columns
.v_w128:
    ; r6d packs the column counter in the high bits and h in the low byte
    ; (restored below via movzx hd, r6b).
    lea                 r6d, [hq+wq*4-256]
    mov                  r4, srcq
    mov                  r7, dstq
.v_loop0:
    movu                 m2, [srcq+ssq*0]
    movu                 m4, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    add                srcq, ss3q
    movu                m13, [srcq+ssq*0]
    movu                m15, [srcq+ssq*1]
    movu                m17, [srcq+ssq*2]
    add                srcq, ss3q
    movu                 m0, [srcq+ssq*0]
    punpcklbw            m1, m2, m4    ; 01l
    punpckhbw            m2, m4        ; 01h
    punpcklbw            m3, m4, m6    ; 12l
    punpckhbw            m4, m6        ; 12h
    punpcklbw            m5, m6, m13   ; 23l
    punpckhbw            m6, m13       ; 23h
    punpcklbw           m12, m13, m15  ; 34l
    punpckhbw           m13, m15       ; 34h
    punpcklbw           m14, m15, m17  ; 45l
    punpckhbw           m15, m17       ; 45h
    punpcklbw           m16, m17, m0   ; 56l
    punpckhbw           m17, m0        ; 56h
.v_loop:
    pmaddubsw           m18, m1, m8    ; a0l
    mova                 m1, m5
    pmaddubsw           m19, m2, m8    ; a0h
    mova                 m2, m6
    pmaddubsw           m20, m3, m8    ; b0l
    mova                 m3, m12
    pmaddubsw           m21, m4, m8    ; b0h
    mova                 m4, m13
    pmaddubsw            m5, m9        ; a1l
    pmaddubsw            m6, m9        ; a1h
    pmaddubsw           m12, m9        ; b1l
    pmaddubsw           m13, m9        ; b1h
    paddw               m18, m5
    mova                 m5, m14
    pmaddubsw           m14, m10       ; a2l
    paddw               m19, m6
    mova                 m6, m15
    pmaddubsw           m15, m10       ; a2h
    paddw               m20, m12
    mova                m12, m16
    pmaddubsw           m16, m10       ; b2l
    paddw               m21, m13
    mova                m13, m17
    pmaddubsw           m17, m10       ; b2h
    paddw               m18, m14
    paddw               m19, m15
    paddw               m20, m16
    paddw               m21, m17
    movu                m17, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklbw           m14, m0, m17  ; 67l
    punpckhbw           m15, m0, m17  ; 67h
    pmaddubsw           m16, m14, m11 ; a3l
    pmaddubsw            m0, m15, m11 ; a3h
    paddw               m18, m16
    paddw               m19, m0
    movu                 m0, [srcq+ssq*0]
    punpcklbw           m16, m17, m0  ; 78l
    punpckhbw           m17, m0       ; 78h
    pmulhrsw            m18, m7
    pmulhrsw            m19, m7
    packuswb            m18, m19
    mova       [dstq+dsq*0], m18
    pmaddubsw           m18, m16, m11 ; b3l
    pmaddubsw           m19, m17, m11 ; b3h
    paddw               m18, m20
    paddw               m19, m21
    pmulhrsw            m18, m7
    pmulhrsw            m19, m7
    packuswb            m18, m19
    mova       [dstq+dsq*1], m18
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_loop
    add                  r4, 64      ; advance saved pointers to the next column
    add                  r7, 64
    movzx                hd, r6b    ; reload row count for the next column
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 256
    jg .v_loop0
    vzeroupper
    RET
.h:
    ; Horizontal-filter entry: if a vertical fraction is also present,
    ; fall through to the combined horizontal+vertical (.hv) path.
    test                myd, 0xf00
    jnz .hv
.h2:
    vpbroadcastd         m5, [pd_34] ; 2 + (8 << 2)  (vpdpbusd rounding bias)
    cmp                  wd, 4
    jl .h_w2
    vbroadcasti128       m6, [subpel_h_shufA]
    je .h_w4
    ; w >= 8: load the full 8-tap filter (two 4-tap halves in m9/m10) and
    ; dispatch through the per-width jump table.
    tzcnt                wd, wd
    vbroadcasti128       m7, [subpel_h_shufB]
    vbroadcasti128       m8, [subpel_h_shufC]
    shr                 mxd, 16
    sub                srcq, 3             ; back up to the first tap
    movzx                wd, word [r8+wq*2+table_offset(put, _8tap_h)]
    vpbroadcastd         m9, [base+mxq*8+subpel_filters+0]
    vpbroadcastd        m10, [base+mxq*8+subpel_filters+4]
    add                  wq, r8
    jmp                  wq
.h_w2:
    ; Horizontal filter, w == 2: uses only the middle 4 filter taps
    ; (subpel_filters+2), two rows per iteration packed into one xmm.
    movzx               mxd, mxb
    dec                srcq
    mova               xmm4, [subpel_h_shuf4]
    vpbroadcastd       xmm3, [base+mxq*8+subpel_filters+2]
.h_w2_loop:
    movq               xmm0, [srcq+ssq*0]
    movhps             xmm0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb             xmm0, xmm4
    mova               xmm1, xm5           ; start from rounding bias
    vpdpbusd           xmm1, xmm0, xmm3    ; u8*s8 dot-product accumulate (VNNI)
    packssdw           xmm0, xmm1, xmm1
    psraw              xmm0, 6             ; descale to pixel range
    packuswb           xmm0, xm0
    pextrw     [dstq+dsq*0], xmm0, 0
    pextrw     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    ; Horizontal filter, w == 4: middle 4 taps, two rows per iteration,
    ; one vpdpbusd per row.
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd       xmm3, [base+mxq*8+subpel_filters+2]
.h_w4_loop:
    movq               xmm0, [srcq+ssq*0]
    movq               xmm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb             xmm0, xm6           ; gather 4-tap windows per pixel
    pshufb             xmm1, xm6
    mova               xmm2, xm5
    vpdpbusd           xmm2, xmm0, xmm3
    mova               xmm0, xm5
    vpdpbusd           xmm0, xmm1, xmm3
    packssdw           xmm0, xmm2, xmm0
    psraw              xmm0, 6
    packuswb           xmm0, xmm0
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
.h_w8:
    ; Horizontal 8-tap, w == 8: two rows in one ymm, filtered by the
    ; shared PUT_8TAP_H macro (defined above this chunk).
    movu                xm0, [srcq+ssq*0]
    vinserti32x4        ym0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    WRAP_YMM PUT_8TAP_H   0, 1, 2, 3
    vpmovuswb           xm0, ym0          ; saturating word->byte narrow
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    ; Horizontal 8-tap, w == 16: build three byte-permute tables (each
    ; offset by 4 via pb_4) to gather the tap windows, two rows per zmm.
    mova                 m6, [spel_h_perm16]
    vpbroadcastd         m8, [pb_4]
    paddb                m7, m8, m6
    paddb                m8, m7
.h_w16_loop:
    movu                ym0, [srcq+ssq*0]
    vinserti32x8         m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H            0, 1, 2, 3, 1
    vpmovuswb           ym0, m0
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], ym0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16_loop
    RET
.h_w32:
    ; Horizontal 8-tap, w == 32: two rows split into two zmm halves,
    ; filtered separately then re-packed to bytes.
    movu                ym0, [srcq+ssq*0+8*0]
    vinserti32x8         m0, [srcq+ssq*1+8*0], 1
    movu                ym1, [srcq+ssq*0+8*1]
    vinserti32x8         m1, [srcq+ssq*1+8*1], 1
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H            0, 2, 3, 4
    PUT_8TAP_H            1, 4, 3, 2
    packuswb             m0, m1
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w32
    RET
.h_w64:
    ; Horizontal 8-tap, w == 64: one full row per iteration in two zmms.
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    add                srcq, ssq
    PUT_8TAP_H            0, 2, 3, 4
    PUT_8TAP_H            1, 4, 3, 2
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    ; Horizontal 8-tap, w == 128: one row per iteration in four zmms
    ; (two 64-byte stores).
    movu                 m0, [srcq+8*0]
    movu                 m2, [srcq+8*1]
    movu                 m1, [srcq+8*8]
    movu                 m3, [srcq+8*9]
    add                srcq, ssq
    PUT_8TAP_H            0,  4, 11, 12
    PUT_8TAP_H            2, 12, 11,  4
    PUT_8TAP_H            1,  4, 11, 12
    PUT_8TAP_H            3, 12, 11,  4
    packuswb             m0, m2
    packuswb             m1, m3
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    add                dstq, dsq
    dec                  hd
    jg .h_w128
    RET
.hv:
    ; Combined horizontal+vertical subpel filtering. Small widths (<= 4)
    ; are handled here; w >= 8 jumps to .hv_w8.
    vpbroadcastd         m9, [pd_34]      ; horizontal rounding bias
    pxor                xm0, xm0
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd         m7, [base+subpel_filters+mxq*8+2] ; middle 4 horizontal taps
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd          ; short blocks use the 4-tap variant index
    vpbroadcastq        ym1, [base+subpel_filters+myq*8]
    lea                ss3q, [ssq*3]
    mov                  r6, srcq
    punpcklbw           ym0, ym1          ; widen s8 taps to s16 (in high byte)
    sub                  r6, ss3q         ; r6 = 3 rows above src (filter history)
    psraw               ym0, 2 ; << 6
    mova               xm14, [spel_hv_end]
    ; Split vertical taps into four broadcast pair-registers m10-m13.
    pshufd             ym10, ym0, q0000
    pshufd             ym11, ym0, q1111
    pshufd             ym12, ym0, q2222
    pshufd             ym13, ym0, q3333
    cmp                  wd, 4
    je .hv_w4
    ; -- .hv w == 2 path --
    ; Prologue: horizontally filter the 7 history rows (0-6), then keep a
    ; sliding window of word-pair interleaves (01/12, 23/34, 45/56) for
    ; the vertical pass.
    vbroadcasti128      ym6, [subpel_h_shuf4]
    movq               xmm2, [r6+ssq*0]
    movhps             xmm2, [r6+ssq*1]
    movq               xmm0, [r6+ssq*2]
    movhps             xmm0, [srcq+ssq*0]
    vpbroadcastq       ymm3, [srcq+ssq*1]
    vpbroadcastq       ymm4, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq       ymm1, [srcq+ssq*0]
    vpblendd           ymm2, ymm3, 0x30
    vpblendd           ymm0, ymm1, 0x30 ; 2 3   6 _
    vpblendd           ymm2, ymm4, 0xc0 ; 0 1   4 5
    pshufb             ymm2, ym6
    pshufb             ymm0, ym6
    mova               ymm1, ym9
    vpdpbusd           ymm1, ymm2, ym7
    mova               ymm2, ym9
    vpdpbusd           ymm2, ymm0, ym7
    packssdw           ymm2, ymm1, ymm2
    psraw              ymm2, 2            ; keep intermediate precision for vertical pass
    vextracti128       xmm3, ymm2, 1
    palignr            xmm4, xmm3, xmm2, 4
    punpcklwd          xmm1, xmm2, xmm4 ; 01 12
    punpckhwd          xmm2, xmm4       ; 23 34
    pshufd             xmm0, xmm3, q2121
    punpcklwd          xmm3, xmm0       ; 45 56
.hv_w2_loop:
    ; Load rows 7-8, horizontal-filter them, then apply the 4 vertical
    ; tap pairs (m10-m13) to the sliding window; 2 output rows/iteration.
    movq               xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps             xmm4, [srcq+ssq*0]
    pmaddwd            xmm5, xmm1, xm10 ; a0 b0
    mova               xmm1, xmm2
    vpdpwssd           xmm5, xmm2, xm11 ; a1 b1
    pshufb             xmm4, xm6
    mova               xmm2, xmm3
    vpdpwssd           xmm5, xmm3, xm12 ; a2 b2
    mova               xmm3, xm9
    vpdpbusd           xmm3, xmm4, xm7
    packssdw           xmm4, xmm3, xmm3
    psraw              xmm4, 2
    palignr            xmm3, xmm4, xmm0, 12
    mova               xmm0, xmm4
    punpcklwd          xmm3, xmm4       ; 67 78
    vpdpwssd           xmm5, xmm3, xm13 ; a3 b3
    packuswb           xmm5, xmm5
    pshufb             xmm5, xm14       ; final round/shuffle to output order
    pextrw     [dstq+dsq*0], xmm5, 0
    pextrw     [dstq+dsq*1], xmm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    vzeroupper
    RET
.hv_w4:
    ; w == 4 H+V path: horizontally filter rows 0-6 in two zmms (odd rows
    ; in m2, even in m1), then vpermb-gather the row-pair interleaves.
    movq               xmm1, [r6+ssq*0]
    vpbroadcastq        ym2, [r6+ssq*1]
    vinserti32x4        ym1, ymm1, [r6+ssq*2], 1
    vinserti32x4         m2, [srcq+ssq*0], 2
    vinserti32x4         m1, [srcq+ssq*1], 2
    vinserti32x4         m2, [srcq+ssq*2], 3 ; _ 1 3 5
    vbroadcasti32x4      m6, [subpel_h_shufA]
    add                srcq, ss3q
    vinserti32x4         m1, [srcq+ssq*0], 3 ; 0 2 4 6
    pshufb               m2, m6
    pshufb               m1, m6
    mova                 m0, m9
    vpdpbusd             m0, m2, m7
    mova                 m4, m9
    vpdpbusd             m4, m1, m7
    mova                ym1, [spel_hv_perm4a]
    mova                ym2, [spel_hv_perm4b]
    mova                ym3, [spel_hv_perm4c]
    packssdw             m0, m4
    psraw                m0, 2 ; _ 0   1 2   3 4   5 6
    mov                 r6d, 0x5555          ; mask selecting alternate words
    vpermb              ym1, ym1, ym0 ; 01 12
    vpermb               m2, m2, m0   ; 23 34
    vpermb               m3, m3, m0   ; 45 56
    kmovw                k1, r6d
    mova               ym15, [spel_hv_perm4d]
.hv_w4_loop:
    ; Horizontal-filter rows 7-8, merge into the window under mask k1,
    ; then run the 4 vertical tap pairs; 2 output rows per iteration.
    movq               xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym4, ymm4, [srcq+ssq*0], 1
    pmaddwd             ym5, ym1, ym10 ; a0 b0
    mova                ym1, ym2
    pshufb              ym4, ym6
    mova                ym0, ym9
    vpdpbusd            ym0, ym4, ym7
    vpdpwssd            ym5, ym2, ym11 ; a1 b1
    mova                ym2, ym3
    vpdpwssd            ym5, ym3, ym12 ; a2 b2
    vpsraw          ym3{k1}, ym0, 2    ; 7 8
    vpermb              ym3, ym15, ym3 ; 67 78
    vpdpwssd            ym5, ym3, ym13 ; a3 b3
    packuswb            ym5, ym5
    vpermb              ym5, ym14, ym5
    movd       [dstq+dsq*0], xm5
    pextrd     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    ; H+V for w >= 8: full 8-tap horizontal filter (m10/m11 = two 4-tap
    ; halves), vertical taps split into pairs m12-m15. w != 8 falls
    ; through to .hv_w16.
    shr                 mxd, 16
    sub                srcq, 3
    vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
    vpbroadcastd        m11, [base+subpel_filters+mxq*8+4]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastq         m1, [base+subpel_filters+myq*8]
    punpcklbw            m0, m1
    lea                ss3q, [ssq*3]
    psraw                m0, 2 ; << 6
    pshufd              m12, m0, q0000
    pshufd              m13, m0, q1111
    pshufd              m14, m0, q2222
    pshufd              m15, m0, q3333
    cmp                  wd, 8
    jne .hv_w16
    ; Prologue: horizontally filter history rows 0-6 (m6 = rows 0-3,
    ; m0 = rows 4-6) using the A/B/C shuffles for the 8-tap windows.
    mov                  r6, srcq
    sub                  r6, ss3q
    movu               xmm1, [r6+ssq*0]
    vinserti128        ymm1, [r6+ssq*1], 1
    movu               xmm2, [srcq+ssq*1]
    vinserti32x4         m6, zmm1, [r6+ssq*2], 2
    vinserti128        ymm2, [srcq+ssq*2], 1
    vinserti32x4         m6, [srcq+ssq*0], 3 ; 0 1 2 3
    add                srcq, ss3q
    vbroadcasti32x4      m4, [subpel_h_shufA]
    vinserti32x4         m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _
    vbroadcasti32x4      m7, [subpel_h_shufB]
    vbroadcasti32x4      m8, [subpel_h_shufC]
    pshufb               m1, m6, m4  ; 0 1 2 3   0123
    mova                 m2, m9
    vpdpbusd             m2, m1, m10
    pshufb               m5, m6, m7  ; 0 1 2 3   4567
    mova                 m1, m9
    vpdpbusd             m1, m5, m10
    pshufb               m4, m0, m4  ; 4 5 6 _   0123
    mova                 m3, m9
    vpdpbusd             m3, m4, m10
    pshufb               m7, m0, m7  ; 4 5 6 _   4567
    mova                 m4, m9
    vpdpbusd             m4, m7, m10
    pshufb               m6, m8
    vpdpbusd             m2, m5, m11
    vpdpbusd             m1, m6, m11
    pshufb               m6, m0, m8
    vpdpbusd             m3, m7, m11
    vpdpbusd             m4, m6, m11
    ; Build the row-pair interleaves for the vertical filter via vpermb.
    mova                 m5, [spel_hv_perm8a]
    vpaddd               m0, m5, [pb_32] {1to16}
    mov                  r6, 0x55555555ff00 ; k1 = merge mask; k2 = k1>>16 for psraw merge
    packssdw             m2, m1
    packssdw             m3, m4
    mova                 m8, [spel_hv_perm8b]
    psraw                m2, 2 ; 0 1 2 3
    psraw                m3, 2 ; 4 5 6 _
    vpermb               m1, m5, m2 ; 01 12
    vbroadcasti32x8      m6, [subpel_h_shufA]
    kmovq                k1, r6
    vpermt2b             m2, m0, m3 ; 23 34
    vbroadcasti32x8      m7, [subpel_h_shufB]
    kshiftrq             k2, k1, 16
    mova               xm16, [spel_hv_end]
    vpermb               m3, m5, m3 ; 45 56
.hv_w8_loop:
    ; Horizontal-filter rows 7-8, merge into the sliding window, then
    ; accumulate the 4 vertical tap pairs; 2 output rows per iteration.
    vbroadcasti32x4     ym4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4  m4{k1}, [srcq+ssq*0]
    pmaddwd              m0, m1, m12 ; a0 b0
    pshufb               m1, m4, m6  ; 7 8   0123 4567
    mova                 m5, m9
    vpdpbusd             m5, m1, m10
    pshufb               m4, m7      ; 7 8   4567 89ab
    vpdpwssd             m0, m2, m13 ; a1 b1
    mova                 m1, m2
    vpdpbusd             m5, m4, m11
    mova                 m2, m3
    vpdpwssd             m0, m3, m14 ; a2 b2
    psraw            m3{k2}, m5, 2   ; 75 86
    vpermb               m3, m8, m3  ; 67 78
    vpdpwssd             m0, m3, m15 ; a3 b3
    packuswb             m0, m0
    vpermb             zmm1, m16, m0
    movq       [dstq+dsq*0], xmm1
    movhps     [dstq+dsq*1], xmm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    vzeroupper
    RET
.hv_w16:
    ; H+V for w >= 16: process 16-pixel-wide column strips. m20-m22 are
    ; byte-permute tables (offset copies, step pb_4) gathering the 8-tap
    ; windows; r6d packs h (low byte) with the strip counter (upper bits).
    WIN64_SPILL_XMM      23
    movu                m22, [spel_hv_perm16a]
    sub                srcq, ss3q
    vpbroadcastd         m8, [pb_4]
    lea                 r6d, [wq*2-32]
    mova                 m7, [spel_hv_perm16b]
    paddb               m20, m8, m22
    mova               ym16, [spel_hv_end16]
    paddb               m21, m8, m20
    lea                 r6d, [hq+r6*8]
    paddb                m8, m7
.hv_w16_loop0:
    ; Prologue per strip: horizontally filter history rows 0-6 and build
    ; the six row-pair word interleaves 01..56 (via packs + vpshrdd).
    movu               ym17, [srcq+ssq*0]
    vinserti32x8        m17, [srcq+ssq*1], 1 ; 0 1
    lea                  r4, [srcq+ss3q]
    movu               ym18, [srcq+ssq*2]
    vinserti32x8        m18, [r4  +ssq*0], 1 ; 2 3
    mov                  r7, dstq
    movu               ym19, [r4  +ssq*1]
    vinserti32x8        m19, [r4  +ssq*2], 1 ; 4 5
    add                  r4, ss3q
    vpermb               m2, m22, m17    ; 0 1   0123   89ab
    mova                 m1, m9
    vpermb               m3, m21, m17    ; 0 1   89ab   ghij
    vpdpbusd             m1, m2, m10
    mova                 m2, m9
    vpermb               m4, m22, m18    ; 2 3   0123   89ab
    vpdpbusd             m2, m3, m11
    mova                 m3, m9
    vpermb               m5, m21, m18    ; 2 3   89ab   ghij
    vpdpbusd             m3, m4, m10
    mova                 m4, m9
    vpermb               m6, m22, m19    ; 4 5   0123   89ab
    vpdpbusd             m4, m5, m11
    mova                 m5, m9
    vpermb              m17, m20, m17    ; 0 1   4567   cdef
    vpdpbusd             m5, m6, m10
    mova                 m6, m9
    vpermb               m0, m21, m19    ; 4 5   89ab   ghij
    vpdpbusd             m1, m17, m11
    vpdpbusd             m2, m17, m10
    movu               ym17, [r4+ssq*0]  ; 6
    vpermb              m18, m20, m18    ; 2 3   4567   cdef
    vpdpbusd             m6, m0, m11
    vpermb               m0, m7, m17     ; 6     0145   2367   89cd   abef
    vpdpbusd             m3, m18, m11
    vpermb              m19, m20, m19    ; 4 5   4567   cdef
    vpdpbusd             m4, m18, m10
    mova                m18, m9
    vpermb              m17, m8, m17     ; 6     4589   67ab   cdgh   efij
    vpdpbusd            m18, m0, m10
    packssdw             m1, m2
    vpdpbusd             m5, m19, m11
    vpdpbusd             m6, m19, m10
    packssdw             m3, m4
    vpdpbusd            m18, m17, m11
    psraw                m1, 2           ; 01
    psraw                m3, 2           ; 23
    packssdw             m5, m6
    vpshrdd              m2, m1, m3, 16  ; 12
    psraw                m5, 2           ; 45
    vpshrdd              m4, m3, m5, 16  ; 34
    psraw               m18, 2
    vpshrdd              m6, m5, m18, 16 ; 56
.hv_w16_loop:
    ; Two output rows (a/b) per iteration: horizontally filter rows 7-8,
    ; derive the 67/78 interleaves, accumulate vertical taps m12-m15.
    movu               ym19, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    vinserti32x8        m19, [r4+ssq*0], 1
    pmaddwd             m17, m1, m12     ; a0
    vpermb               m1, m22, m19    ; 7 8   0123   89ab
    pmaddwd             m18, m2, m12     ; b0
    mova                 m0, m9
    vpermb               m2, m21, m19    ; 7 8   89ab   ghij
    vpdpbusd             m0, m1, m10
    mova                 m1, m9
    vpermb              m19, m20, m19    ; 7 8   4567   cdef
    vpdpbusd             m1, m2, m11
    mova                 m2, m4
    vpdpwssd            m17, m3, m13     ; a1
    vpdpwssd            m18, m4, m13     ; b1
    mova                 m4, m6
    vpdpbusd             m0, m19, m11
    vpdpbusd             m1, m19, m10
    vpdpwssd            m17, m5, m14     ; a2
    vpdpwssd            m18, m6, m14     ; b2
    packssdw             m0, m1
    mova                 m1, m3
    psraw                m6, m0, 2       ; 78
    mova                 m3, m5
    vpshrdd              m5, m4, m6, 16  ; 67
    vpdpwssd            m18, m6, m15     ; b3
    vpdpwssd            m17, m5, m15     ; a3
    packuswb            m17, m18
    vpermb              m17, m16, m17
    mova         [r7+dsq*0], xm17
    vextracti128 [r7+dsq*1], ym17, 1
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    ; Next 16-pixel column strip; restore h from the low byte of r6d.
    add                srcq, 16
    add                dstq, 16
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .hv_w16_loop0
    RET
2854
; Temp-register aliases differ per ABI (t0/t1 feed the mx/my filter-type
; adjustment in prep_6tap_8bpc below).
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; Declare the prep_8tap filter-combination entry points; combinations not
; requiring the sharp 8-tap path alias into prep_6tap_8bpc.
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_6tap_8bpc
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_8bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR
2868
; prep_6tap_8bpc: 6-tap subpel prep (intermediate 16-bit output to tmp).
; Dispatches on whether a horizontal (mx) and/or vertical (my) subpel
; fraction is present; with neither, falls through to plain copy (.prep).
cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3
%define base r7-prep_avx512icl
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 6tap_v, my, 4tap_v
    lea                  r7, [prep_avx512icl]
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jnz .v
.prep:
    ; No subpel filtering: jump through the per-width copy table.
    tzcnt                wd, wd
    movzx                wd, word [r7+wq*2+table_offset(prep,)]
    add                  wq, r7
    lea                  r6, [ssq*3]
%if WIN64
    pop                  r7
%endif
    jmp                  wq
.v:
    ; Vertical-only 6-tap: broadcast the three 16-bit coefficient pairs
    ; into m8/m9/m10, back src up two rows (filter history), dispatch on
    ; width via the _6tap_v jump table.
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd          ; h <= 4 selects the 4-tap variant index
    tzcnt               r5d, wd
    lea                 myq, [base+subpel_filters+1+myq*8] ; +1 skips the unused outer tap
    movzx               r5d, word [r7+r5*2+table_offset(prep, _6tap_v)]
    vpbroadcastd         m7, [pw_8192]     ; pmulhrsw scaling constant
    sub                srcq, ssq
    vpbroadcastw         m8, [myq+0]
    add                  r5, r7
    vpbroadcastw         m9, [myq+2]
    lea                ss3q, [ssq*3]
    vpbroadcastw        m10, [myq+4]
    sub                srcq, ssq
    jmp                  r5
.v_w4:
    ; Vertical 6-tap, w == 4: four rows per iteration in one ymm, with
    ; row-pair byte interleaves built via blends + deint_shuf4.
    movd               xmm2, [srcq+ssq*0]
    pinsrd             xmm2, [srcq+ssq*1], 1
    vpbroadcastd       ymm1, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastd       ymm3, [srcq+ssq*0]
    vpbroadcastd       ymm0, [srcq+ssq*1]
    vbroadcasti128     ymm5, [deint_shuf4]
    vpblendd           ymm1, ymm2, 0xeb
    punpcklqdq         ymm3, ymm0
    vpblendd           ymm1, ymm3, 0x60 ; 0 1 2 _   2 3 4 _
    pshufb             ymm1, ymm5       ; 01 12 23 34
.v_w4_loop:
    pinsrd             xmm0, [srcq+ssq*2], 1
    vpbroadcastd       ymm2, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    vpbroadcastd       ymm3, [srcq+ssq*0]
    vpblendd           ymm2, ymm0, 0xeb
    vpbroadcastd       ymm0, [srcq+ssq*1]
    punpcklqdq         ymm3, ymm0
    vpblendd           ymm2, ymm3, 0x60 ; 4 5 6 _   6 7 8 _
    pshufb             ymm2, ymm5       ; 45 56 67 78
    pmaddubsw          ymm3, ymm1, ym8  ; a0 b0 c0 d0
    vperm2i128         ymm1, ymm2, 0x21 ; 23 34 45 56
    pmaddubsw          ymm4, ymm2, ym10 ; a2 b2 c2 d2
    pmaddubsw          ymm1, ym9        ; a1 b1 c1 d1
    paddw              ymm3, ymm4
    paddw              ymm3, ymm1
    pmulhrsw           ymm3, ym7        ; scale to prep intermediate range
    mova               ymm1, ymm2
    mova             [tmpq], ymm3
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    vzeroupper
    RET
.v_w8:
    ; Vertical 6-tap, w == 8: four rows per iteration; rows are merged
    ; with masked broadcast-unpacks (k1/k2) then vpermb'd into the
    ; 01/12/23/34 row-pair layout.
    mova                 m6, [spel_v_perm8]
    movq                xm1, [srcq+ssq*0]
    mov                 r6d, 0x3e
    movq                xm2, [srcq+ssq*1]
    kmovb                k1, r6d
    vpbroadcastq        ym3, [srcq+ssq*2]
    add                srcq, ss3q
    vpunpcklqdq         ym2, [srcq+ssq*0] {1to4}
    vpunpcklqdq      m1{k1}, m3, [srcq+ssq*1] {1to8}
    movq                xm0, [srcq+ssq*1]
    kshiftlb             k2, k1, 2
    shufpd               m1, m2, 0x18  ; 0 1   2 3   4
    vpermb               m1, m6, m1    ; 01 12 23 34
.v_w8_loop:
    vpbroadcastq        ym3, [srcq+ss3q ]
    vpunpcklqdq     ym0{k1}, ym3, [srcq+ssq*2] {1to4}
    lea                srcq, [srcq+ssq*4]
    vpbroadcastq         m3, [srcq+ssq*1]
    vpunpcklqdq      m0{k2}, m3, [srcq+ssq*0] {1to8}
    pmaddubsw            m4, m1, m8    ; a0 b0 c0 d0
    vpermb               m2, m6, m0    ; 45 56 67 78
    mova                xm0, xm3
    vshufi32x4           m1, m2, q1032 ; 23 34 45 56
    pmaddubsw            m3, m2, m10   ; a3 b3 c3 d3
    pmaddubsw            m5, m1, m9    ; a2 b2 c2 d2
    mova                 m1, m2
    paddw                m4, m3
    paddw                m4, m5
    pmulhrsw             m4, m7
    mova             [tmpq], m4
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    ; Vertical 6-tap, w == 16: four rows per iteration across two zmms
    ; (a/b pairs in m3, c/d pairs in m4); masked vshufpd merges two
    ; broadcast rows before the vpermb interleave.
    mova                m11, [spel_v_perm16b]
    vbroadcasti32x4      m1, [srcq+ssq*0]
    mov                 r6d, 0x0f
    vbroadcasti32x4     ym3, [srcq+ssq*1]
    vbroadcasti32x4      m2, [srcq+ssq*2]
    kmovb                k1, r6d
    add                srcq, ss3q
    vbroadcasti32x4     ym4, [srcq+ssq*0]
    vbroadcasti32x4      m0, [srcq+ssq*1]
    vshufpd          m1{k1}, m3, m2, 0xcc
    vshufpd          m2{k1}, m4, m0, 0xcc
    vpermb               m1, m11, m1 ; 01 12
    vpermb               m2, m11, m2 ; 23 34
.v_w16_loop:
    pmaddubsw            m3, m1, m8  ; a0 b0
    pmaddubsw            m5, m2, m9  ; a1 b1
    vbroadcasti32x4     ym6, [srcq+ssq*2]
    pmaddubsw            m4, m2, m8  ; c0 d0
    vbroadcasti32x4      m2, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    vshufpd          m0{k1}, m6, m2, 0xcc
    vbroadcasti32x4     ym6, [srcq+ssq*0]
    vpermb               m1, m11, m0 ; 45 56
    vbroadcasti32x4      m0, [srcq+ssq*1]
    vshufpd          m2{k1}, m6, m0, 0xcc
    pmaddubsw            m6, m1, m9  ; c1 d1
    vpermb               m2, m11, m2 ; 67 78
    paddw                m3, m5
    pmaddubsw            m5, m1, m10 ; a2 b2
    paddw                m4, m6
    pmaddubsw            m6, m2, m10 ; c2 d2
    paddw                m3, m5
    paddw                m4, m6
    pmulhrsw             m3, m7
    pmulhrsw             m4, m7
    mova          [tmpq+ 0], m3
    mova          [tmpq+64], m4
    add                tmpq, 64*2
    sub                  hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    ; Vertical 6-tap, w == 32: each zmm holds two 32-pixel rows (paired
    ; via vpermt2q with bilin_v_perm64); two output rows per iteration.
    movshdup             m6, [bilin_v_perm64]
    movu               ym16, [srcq+ssq*0]
    movu               ym17, [srcq+ssq*1]
    movu               ym18, [srcq+ssq*2]
    add                srcq, ss3q
    movu               ym19, [srcq+ssq*0]
    add                srcq, ssq
    movu               ym20, [srcq+ssq*0]
    vpermt2q            m16, m6, m18   ; 0 2
    vpermt2q            m17, m6, m19   ; 1 3
    vpermt2q            m18, m6, m20   ; 2 4
    punpcklbw            m0, m16, m17  ; 01
    punpcklbw            m1, m17, m18  ; 12
    punpckhbw            m2, m16, m17  ; 23
    punpckhbw            m3, m17, m18  ; 34
.v_w32_loop:
    movu               ym16, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movu               ym17, [srcq+ssq*0]
    pmaddubsw            m4, m0, m8    ; a0
    mova                 m0, m2
    pmaddubsw            m2, m9        ; a1
    vpermt2q            m16, m6, m17   ; 5 6
    pmaddubsw            m5, m1, m8    ; b0
    mova                 m1, m3
    pmaddubsw            m3, m9        ; b1
    shufpd              m18, m16, 0x55 ; 4 5
    paddw                m4, m2
    punpcklbw            m2, m18, m16  ; 45
    paddw                m5, m3
    punpckhbw            m3, m18, m16  ; 56
    mova                m18, m16
    pmaddubsw           m16, m2, m10   ; a2
    pmaddubsw           m17, m3, m10   ; b2
    paddw                m4, m16
    paddw                m5, m17
    pmulhrsw             m4, m7
    pmulhrsw             m5, m7
    mova          [tmpq+ 0], m4
    mova          [tmpq+64], m5
    add                tmpq, 64*2
    sub                  hd, 2
    jg .v_w32_loop
    vzeroupper
    RET
.v_w64:
.v_w128:
    ; Vertical 6-tap for w >= 64: 64-pixel column strips, two rows per
    ; inner iteration with low/high byte halves handled in parallel.
    ; r6d packs h (low byte) with the strip counter (upper bits).
    mova                 m6, [bilin_v_perm64]
    add                  wd, wd           ; wd = output row pitch in bytes (2*w)
    lea                 r6d, [hq+wq]
.v_loop0:
    ; Load rows 0-4 of this strip and form the initial row-pair
    ; interleaves (01..34, low halves in m0-m3, high in m12-m15).
    vpermq              m12, m6, [srcq+ssq*0]
    vpermq              m13, m6, [srcq+ssq*1]
    lea                  r5, [srcq+ssq*2]
    vpermq              m14, m6, [r5  +ssq*0]
    vpermq              m15, m6, [r5  +ssq*1]
    lea                  r5, [r5+ssq*2]
    vpermq              m16, m6, [r5  +ssq*0]
    mov                  r7, tmpq
    punpcklbw            m0, m12, m13 ; 01
    punpckhbw           m12, m13
    punpcklbw            m1, m13, m14 ; 12
    punpckhbw           m13, m14
    punpcklbw            m2, m14, m15 ; 23
    punpckhbw           m14, m15
    punpcklbw            m3, m15, m16 ; 34
    punpckhbw           m15, m16
.v_loop:
    pmaddubsw           m17, m0, m8   ; a0
    vpermq               m5, m6, [r5+ssq*1]
    pmaddubsw           m18, m12, m8
    mova                 m0, m2
    pmaddubsw            m2, m9       ; a1
    mova                m12, m14
    pmaddubsw           m14, m9
    lea                  r5, [r5+ssq*2]
    pmaddubsw           m19, m1, m8   ; b0
    pmaddubsw           m20, m13, m8
    mova                 m1, m3
    pmaddubsw            m3, m9       ; b1
    mova                m13, m15
    pmaddubsw           m15, m9
    paddw               m17, m2
    punpcklbw            m2, m16, m5  ; 67
    paddw               m18, m14
    punpckhbw           m14, m16, m5
    vpermq              m16, m6, [r5+ssq*0]
    paddw               m19, m3
    pmaddubsw            m3, m2, m10  ; a3
    paddw               m20, m15
    pmaddubsw           m15, m14, m10
    paddw               m17, m3
    punpcklbw            m3, m5, m16  ; 78
    pmaddubsw            m4, m3, m10  ; b3
    paddw               m18, m15
    punpckhbw           m15, m5, m16
    pmaddubsw            m5, m15, m10
    paddw               m19, m4
    paddw               m20, m5
    REPX   {pmulhrsw x, m7}, m17, m18, m19, m20
    mova       [r7+wq*0+ 0], m17
    mova       [r7+wq*0+64], m18
    mova       [r7+wq*1+ 0], m19
    mova       [r7+wq*1+64], m20
    lea                  r7, [r7+wq*2]
    sub                  hd, 2
    jg .v_loop
    ; Next 64-pixel column strip.
    add                srcq, 64
    add                tmpq, 128
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .v_loop0
    vzeroupper
    RET
3135.h:
3136    test                myd, 0xf00
3137    jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2
3138.hv:
3139    vpbroadcastd         m8, [pd_2]
3140    vpbroadcastd         m9, [pd_32]
3141    cmp                  wd, 4
3142    jg .hv_w8
3143    movzx               mxd, mxb
3144    vpbroadcastd        m11, [base+subpel_filters+mxq*8+2]
3145    movzx               mxd, myb
3146    shr                 myd, 16
3147    cmp                  hd, 4
3148    cmove               myd, mxd
3149    vpbroadcastq         m3, [base+subpel_filters+1+myq*8]
3150    vbroadcasti128      m10, [subpel_h_shufA]
3151    lea                  r6, [ssq*2+1]
3152    mov                 r3d, 0x30
3153    sub                srcq, r6
3154    kmovb                k1, r3d
3155    vpbroadcastq        ym2, [srcq+ssq*0]
3156    lea                ss3q, [ssq*3]
3157    vpbroadcastq         m1, [srcq+ssq*1]
3158    kaddb                k2, k1, k1
3159    vpbroadcastq     m2{k1}, [srcq+ssq*2]
3160    add                srcq, ss3q
3161    vpbroadcastq     m1{k2}, [srcq+ssq*0] ; _ _ 1 3
3162    punpcklbw            m3, m3
3163    vpbroadcastq     m2{k2}, [srcq+ssq*1] ; _ 0 2 4
3164    psraw                m3, 8 ; sign-extend
3165    mova                 m6, [spel_hv_perm4a]
3166    kshiftrb             k1, k1, 2
3167    movu                 m7, [spel_hv_perm4b]
3168    pshufb               m1, m10
3169    mova                 m0, m8
3170    vpdpbusd             m0, m1, m11
3171    pshufb               m2, m10
3172    mova                 m1, m8
3173    vpdpbusd             m1, m2, m11
3174    pshufd              m12, m3, q0000
3175    pshufd              m13, m3, q1111
3176    pshufd              m14, m3, q2222
3177    packssdw             m0, m1           ; _ _   _ 0   1 2   3 4
3178    psraw                m0, 2
3179    vpermb               m1, m7, m0       ; 01 12 23 34
3180.hv_w4_loop:
3181    movq                xm3, [srcq+ssq*2]
3182    movq                xm4, [srcq+ss3q ]
3183    lea                srcq, [srcq+ssq*4]
3184    vpbroadcastq    ym3{k1}, [srcq+ssq*0] ; 5 7
3185    vpbroadcastq    ym4{k1}, [srcq+ssq*1] ; 6 8
3186    pshufb              ym3, ym10
3187    mova                ym2, ym8
3188    vpdpbusd            ym2, ym3, ym11
3189    pshufb              ym4, ym10
3190    mova                ym3, ym8
3191    vpdpbusd            ym3, ym4, ym11
3192    mova                 m4, m9
3193    vpdpwssd             m4, m1, m12      ; a0 b0 c0 d0
3194    packssdw            ym2, ym3          ; 5 6   7 8
3195    psraw               ym2, 2
3196    vshufi32x4           m0, m2, q1032    ; _ 2   3 4   5 6   7 8
3197    vpermb               m2, m6, m0       ; 23 34 45 56
3198    vpermb               m1, m7, m0       ; 45 56 67 78
3199    vpdpwssd             m4, m2, m13      ; a1 b1 c1 d1
3200    vpdpwssd             m4, m1, m14      ; a2 b2 c2 d2
3201    psrad                m4, 6
3202    vpmovdw          [tmpq], m4
3203    add                tmpq, 32
3204    sub                  hd, 4
3205    jg .hv_w4_loop
3206    RET
3207.hv_w8:
3208    shr                 mxd, 16
3209    vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
3210    vpbroadcastd        m11, [base+subpel_filters+mxq*8+4]
3211    movzx               mxd, myb
3212    shr                 myd, 16
3213    cmp                  hd, 4
3214    cmove               myd, mxd
3215    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
3216    lea                  r6, [ssq*2+3]
3217    punpcklbw            m0, m0
3218    sub                srcq, r6
3219    psraw                m0, 8 ; sign-extend
3220    lea                ss3q, [ssq*3]
3221    pshufd              m12, m0, q0000
3222    pshufd              m13, m0, q1111
3223    pshufd              m14, m0, q2222
3224    cmp                  wd, 8
3225    jg .hv_w16
3226    movu               xm16, [srcq+ssq*0]
3227    vbroadcasti32x4     m19, [subpel_h_shufA]
3228    vinserti128        ym16, [srcq+ssq*1], 1
3229    vbroadcasti32x4     m21, [subpel_h_shufC]
3230    vinserti32x4        m16, [srcq+ssq*2], 2
3231    add                srcq, ss3q
3232    vinserti32x4        m16, [srcq+ssq*0], 3
3233    movu               xm17, [srcq+ssq*1]
3234    vbroadcasti32x4     m20, [subpel_h_shufB]
3235    pshufb               m3, m16, m19   ; 0 1 2 3   0123
3236    mova                 m2, m8
3237    pshufb               m0, m16, m21   ; 0 1 2 3   89ab
3238    vpdpbusd             m2, m3, m10
3239    mova                 m3, m8
3240    pshufb              xm1, xm17, xm19 ; 3 4 5 6   0123
3241    vpdpbusd             m3, m0, m11
3242    mova                xm0, xm8
3243    pshufb             xm18, xm17, xm21 ; 3 4 5 6   89ab
3244    vpdpbusd            xm0, xm1, xm10
3245    mova                xm1, xm8
3246    pshufb              m16, m20        ; 0 1 2 3   4567
3247    vpdpbusd            xm1, xm18, xm11
3248    pshufb             xm17, xm20       ; 3 4 5 6   4567
3249    vpdpbusd             m2, m16, m11
3250    vpdpbusd             m3, m16, m10
3251    vpdpbusd            xm0, xm17, xm11
3252    vpdpbusd            xm1, xm17, xm10
3253    packssdw             m2, m3
3254    packssdw            xm0, xm1
3255    psraw                m2, 2          ; 0 1 2 3
3256    psraw               xm0, 2          ; 4
3257    valignq              m0, m2, 2      ; 1 2 3 4
3258    punpcklwd            m1, m2, m0     ; 01 12 23 34
3259    punpckhwd            m2, m0
3260.hv_w8_loop:
3261    movu               xm16, [srcq+ssq*2]
3262    vinserti128        ym16, [srcq+ss3q ], 1
3263    lea                srcq, [srcq+ssq*4]
3264    vinserti32x4        m16, [srcq+ssq*0], 2
3265    vinserti32x4        m16, [srcq+ssq*1], 3
3266    pshufb               m6, m16, m19   ; 5 6 7 8   0123
3267    mova                 m5, m8
3268    pshufb               m3, m16, m21   ; 5 6 7 8   89ab
3269    vpdpbusd             m5, m6, m10
3270    mova                 m6, m8
3271    pshufb              m16, m20        ; 5 6 7 8   4567
3272    vpdpbusd             m6, m3, m11
3273    mova                 m3, m9
3274    vpdpwssd             m3, m1, m12    ; a0 b0 c0 d0
3275    mova                 m4, m9
3276    vpdpwssd             m4, m2, m12
3277    vpdpbusd             m5, m16, m11
3278    vpdpbusd             m6, m16, m10
3279    mova                m16, m1
3280    packssdw             m5, m6
3281    mova                 m6, m2
3282    psraw                m5, 2          ; 5 6 7 8
3283    valignq              m2, m5, m0, 6  ; 4 5 6 7
3284    mova                 m0, m5
3285    punpcklwd            m1, m2, m5     ; 45 56 67 78
3286    punpckhwd            m2, m5
3287    vpdpwssd             m3, m1, m14    ; a2 b2 c2 d2
3288    vpdpwssd             m4, m2, m14
3289    vshufi32x4          m16, m1, q1032  ; 23 34 45 56
3290    vshufi32x4           m6, m2, q1032
3291    vpdpwssd             m3, m16, m13   ; a1 b1 c1 d1
3292    vpdpwssd             m4, m6, m13
3293    psrad                m3, 6
3294    psrad                m4, 6
3295    packssdw             m3, m4
3296    mova             [tmpq], m3
3297    add                tmpq, 64
3298    sub                  hd, 4
3299    jg .hv_w8_loop
3300    vzeroupper
3301    RET
3302.hv_w16:
3303    mova                m16, [spel_h_perm16]
3304    vpbroadcastd        m18, [pb_4]
3305    add                  wd, wd
3306    paddb               m17, m18, m16
3307    lea                 r6d, [hq+wq*8-256]
3308    paddb               m18, m17
3309.hv_w16_loop0:
3310    movu               ym19, [srcq+ssq*0]
3311    vinserti32x8        m19, [srcq+ssq*1], 1
3312    lea                  r5, [srcq+ssq*2]
3313    movu               ym20, [r5  +ssq*0]
3314    vinserti32x8        m20, [r5  +ssq*1], 1
3315    lea                  r5, [r5  +ssq*2]
3316    movu               ym21, [r5  +ssq*0]
3317    mov                  r7, tmpq
3318    vpermb               m3, m16, m19      ; 0 1   0123   89ab
3319    mova                 m2, m8
3320    vpermb               m4, m18, m19      ; 0 1   89ab   ghij
3321    vpdpbusd             m2, m3, m10
3322    mova                 m3, m8
3323    vpermb               m5, m16, m20      ; 2 3   0123   89ab
3324    vpdpbusd             m3, m4, m11
3325    mova                 m4, m8
3326    vpermb               m0, m18, m20      ; 2 3   89ab   ghij
3327    vpdpbusd             m4, m5, m10
3328    mova                 m5, m8
3329    vpermb              ym1, ym16, ym21    ; 4     0123   89ab
3330    vpdpbusd             m5, m0, m11
3331    mova                ym0, ym8
3332    vpermb              ym6, ym18, ym21    ; 4     89ab   ghij
3333    vpdpbusd            ym0, ym1, ym10
3334    mova                ym1, ym8
3335    vpermb              m19, m17, m19      ; 0 1   4567   cdef
3336    vpdpbusd            ym1, ym6, ym11
3337    vpermb              m20, m17, m20      ; 2 3   4567   cdef
3338    vpdpbusd             m2, m19, m11
3339    vpdpbusd             m3, m19, m10
3340    vpermb             ym21, ym17, ym21    ; 4     4567   cdef
3341    vpdpbusd             m4, m20, m11
3342    vpdpbusd             m5, m20, m10
3343    vpdpbusd            ym0, ym21, ym11
3344    vpdpbusd            ym1, ym21, ym10
3345    packssdw             m2, m3            ; 0 1
3346    packssdw             m4, m5            ; 2 3
3347    packssdw            ym0, ym1           ; 4
3348    REPX       {psraw x, 2}, m2, m4, ym0
3349    vshufi32x4           m3, m2, m4, q1032 ; 1 2
3350    vshufi32x4           m0, m4, m0, q1032 ; 3 4
3351    punpcklwd            m1, m2, m3        ; 01 12
3352    punpckhwd            m2, m3
3353    punpcklwd            m3, m4, m0        ; 23 34
3354    punpckhwd            m4, m0
3355.hv_w16_loop:
3356    movu               ym19, [r5+ssq*1]
3357    lea                  r5, [r5+ssq*2]
3358    vinserti32x8        m19, [r5+ssq*0], 1
3359    vpermb               m6, m16, m19      ; 5 6   0123   89ab
3360    mova                 m5, m8
3361    vpermb              m20, m18, m19      ; 5 6   89ab   ghij
3362    vpdpbusd             m5, m6, m10
3363    mova                 m6, m8
3364    vpermb              m19, m17, m19      ; 5 6   4567   cdef
3365    vpdpbusd             m6, m20, m11
3366    mova                m20, m9
3367    vpdpwssd            m20, m1, m12       ; a0 b0
3368    mova                m21, m9
3369    vpdpwssd            m21, m2, m12
3370    vpdpbusd             m5, m19, m11
3371    vpdpbusd             m6, m19, m10
3372    vpdpwssd            m20, m3, m13       ; a1 b1
3373    vpdpwssd            m21, m4, m13
3374    packssdw             m5, m6
3375    mova                 m1, m3
3376    psraw                m5, 2             ; 5 6
3377    mova                 m2, m4
3378    vshufi32x4           m4, m0, m5, q1032 ; 4 5
3379    mova                 m0, m5
3380    punpcklwd            m3, m4, m0        ; 45 56
3381    punpckhwd            m4, m0
3382    vpdpwssd            m20, m3, m14       ; a2 b2
3383    vpdpwssd            m21, m4, m14
3384    psrad               m20, 6
3385    psrad               m21, 6
3386    packssdw            m20, m21
3387    mova          [r7+wq*0], ym20
3388    vextracti32x8 [r7+wq*1], m20, 1
3389    lea                  r7, [r7+wq*2]
3390    sub                  hd, 2
3391    jg .hv_w16_loop
3392    add                srcq, 16
3393    add                tmpq, 32
3394    movzx                hd, r6b
3395    sub                 r6d, 1<<8
3396    jg .hv_w16_loop0
3397    vzeroupper
3398    RET
3399
; Horizontal 8-tap filter for two zmm rows of packed pixels using AVX-512
; VNNI dot products.
; In:  m0, m1 = source pixel rows (pre-gathered by the caller)
;      m5/m6/m7 = byte-permute masks selecting the three overlapping 4-pixel
;                 windows needed per output (caller loads these from
;                 spel_h_perm16/spel_h_perm32 plus pb_4 offsets)
;      m8/m9 = low/high halves of the 8-tap subpel filter, broadcast as dwords
;      m4    = dword rounding bias (pd_2, for the >> 2 below)
;      tmpq  = output pointer (intermediate 16-bit predictor buffer)
; Out: [tmpq+0..127] = filtered words; clobbers m0-m3, m10-m15
%macro PREP_8TAP_H 0
    vpermb              m10, m5, m0    ; row0, window for taps 0-3
    vpermb              m11, m5, m1    ; row1, window for taps 0-3
    vpermb              m12, m6, m0    ; row0, window for taps 4-7 (also taps 0-3 of m2/m3's result)
    vpermb              m13, m6, m1
    vpermb              m14, m7, m0    ; row0, trailing window
    vpermb              m15, m7, m1
    mova                 m0, m4        ; seed accumulators with rounding bias
    vpdpbusd             m0, m10, m8   ; += u8 pixels * s8 taps 0-3
    mova                 m2, m4
    vpdpbusd             m2, m12, m8
    mova                 m1, m4
    vpdpbusd             m1, m11, m8
    mova                 m3, m4
    vpdpbusd             m3, m13, m8
    vpdpbusd             m0, m12, m9   ; += pixels * taps 4-7
    vpdpbusd             m2, m14, m9
    vpdpbusd             m1, m13, m9
    vpdpbusd             m3, m15, m9
    packssdw             m0, m2        ; dwords -> saturated words
    packssdw             m1, m3
    psraw                m0, 2         ; drop 2 bits of intermediate precision
    psraw                m1, 2
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
%endmacro
3426
; Entry points for the (horizontal, vertical) filter-type combinations that
; resolve to the 8-tap prep path. NOTE(review): PREP_8TAP_FN is defined
; earlier in the file (outside this view); presumably the 4-argument forms
; emit a stub jumping to the named base function, while the final 3-argument
; form omits the target because the prep_8tap_8bpc implementation follows
; immediately below — confirm against the macro definition.
PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_8bpc
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_8bpc
PREP_8TAP_FN sharp,          SHARP,   SHARP
3430
3431cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3
3432    imul                mxd, mxm, 0x010101
3433    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
3434    imul                myd, mym, 0x010101
3435    add                 myd, t1d ; 8tap_v, my, 4tap_v
3436    lea                  r7, [prep_avx512icl]
3437    movifnidn            hd, hm
3438    test                mxd, 0xf00
3439    jnz .h
3440    test                myd, 0xf00
3441    jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep
3442.v:
3443    movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
3444    shr                 myd, 16  ; Note that the code is 8-tap only, having
3445    cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
3446    cmove               myd, mxd ; had a negligible effect on performance.
3447    tzcnt               r5d, wd
3448    lea                 myq, [base+subpel_filters+myq*8]
3449    movzx               r5d, word [r7+r5*2+table_offset(prep, _8tap_v)]
3450    vpbroadcastd         m7, [pw_8192]
3451    vpbroadcastw         m8, [myq+0]
3452    add                  r5, r7
3453    vpbroadcastw         m9, [myq+2]
3454    lea            stride3q, [strideq*3]
3455    vpbroadcastw        m10, [myq+4]
3456    sub                srcq, stride3q
3457    vpbroadcastw        m11, [myq+6]
3458    jmp                  r5
3459.v_w4:
3460    movd               xmm0, [srcq+strideq*0]
3461    vpbroadcastd       ymm1, [srcq+strideq*2]
3462    vpbroadcastd       xmm2, [srcq+strideq*1]
3463    vpbroadcastd       ymm3, [srcq+stride3q ]
3464    lea                srcq, [srcq+strideq*4]
3465    vpblendd           ymm1, ymm0, 0x01       ; 0 2 2 _   2 _ _ _
3466    vpblendd           ymm3, ymm2, 0x03       ; 1 1 3 3   3 3 _ _
3467    vpbroadcastd       ymm0, [srcq+strideq*0]
3468    vpbroadcastd       ymm2, [srcq+strideq*1]
3469    vpblendd           ymm1, ymm0, 0x68       ; 0 2 2 4   2 4 4 _
3470    vpbroadcastd       ymm0, [srcq+strideq*2]
3471    vbroadcasti128     ymm5, [deint_shuf4]
3472    vpblendd           ymm3, ymm2, 0xc0       ; 1 1 3 3   3 3 5 5
3473    vpblendd           ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3   2 3 4 5
3474    vpblendd           ymm3, ymm1, 0xaa       ; 1 2 3 4   3 4 5 _
3475    punpcklbw          ymm1, ymm2, ymm3       ; 01  12    23  34
3476    vpblendd           ymm3, ymm0, 0x80       ; 1 2 3 4   3 4 5 6
3477    punpckhbw          ymm2, ymm3             ; 23  34    45  56
3478.v_w4_loop:
3479    pinsrd             xmm0, [srcq+stride3q ], 1
3480    lea                srcq, [srcq+strideq*4]
3481    vpbroadcastd       ymm3, [srcq+strideq*0]
3482    vpbroadcastd       ymm4, [srcq+strideq*1]
3483    vpblendd           ymm3, ymm4, 0x20       ; _ _ 8 _   8 9 _ _
3484    vpblendd           ymm3, ymm0, 0x03       ; 6 7 8 _   8 9 _ _
3485    vpbroadcastd       ymm0, [srcq+strideq*2]
3486    vpblendd           ymm3, ymm0, 0x40       ; 6 7 8 _   8 9 a _
3487    pshufb             ymm3, ymm5             ; 67  78    89  9a
3488    pmaddubsw          ymm4, ymm1, ym8
3489    vperm2i128         ymm1, ymm2, ymm3, 0x21 ; 45  56    67  78
3490    pmaddubsw          ymm2, ym9
3491    paddw              ymm4, ymm2
3492    mova               ymm2, ymm3
3493    pmaddubsw          ymm3, ym11
3494    paddw              ymm3, ymm4
3495    pmaddubsw          ymm4, ymm1, ym10
3496    paddw              ymm3, ymm4
3497    pmulhrsw           ymm3, ym7
3498    mova             [tmpq], ymm3
3499    add                tmpq, 32
3500    sub                  hd, 4
3501    jg .v_w4_loop
3502    vzeroupper
3503    RET
3504.v_w8:
3505    mova                 m6, [spel_v_perm8]
3506    movq                xm1, [srcq+strideq*0]
3507    mov                 r6d, 0x3e
3508    movq                xm2, [srcq+strideq*1]
3509    vpbroadcastq        ym3, [srcq+strideq*2]
3510    kmovb                k1, r6d
3511    vpbroadcastq        ym4, [srcq+stride3q ]
3512    lea                srcq, [srcq+strideq*4]
3513    vpunpcklqdq      m1{k1}, m3, [srcq+strideq*0] {1to8}
3514    vpunpcklqdq      m2{k1}, m4, [srcq+strideq*1] {1to8}
3515    movq                xm0, [srcq+strideq*2]
3516    kshiftlb             k2, k1, 2
3517    shufpd               m1, m2, 0x30      ; 0 1   2 3   4 5
3518    vshufi32x4           m2, m1, m0, q0021 ; 2 3   4 5   6 _
3519    vpermb               m1, m6, m1        ; 01 12 23 34
3520    vpermb               m2, m6, m2        ; 23 34 45 56
3521.v_w8_loop:
3522    vpbroadcastq        ym3, [srcq+strideq*4]
3523    vpunpcklqdq     ym0{k1}, ym3, [srcq+stride3q] {1to4}
3524    lea                srcq, [srcq+strideq*4]
3525    vpbroadcastq         m3, [srcq+strideq*2]
3526    vpunpcklqdq      m0{k2}, m3, [srcq+strideq*1] {1to8}
3527    pmaddubsw            m4, m1, m8        ; a0 b0 c0 d0
3528    mova                 m1, m2
3529    pmaddubsw            m5, m2, m9        ; a1 b1 c1 d1
3530    vpermb               m2, m6, m0        ; 67 78 89 9a
3531    mova                xm0, xm3
3532    vshufi32x4           m1, m2, q1032     ; 45 56 67 78
3533    pmaddubsw            m3, m2, m11       ; a3 b3 c3 d3
3534    paddw                m4, m5
3535    pmaddubsw            m5, m1, m10       ; a2 b2 c2 d2
3536    paddw                m4, m3
3537    paddw                m4, m5
3538    pmulhrsw             m4, m7
3539    mova             [tmpq], m4
3540    add                tmpq, 64
3541    sub                  hd, 4
3542    jg .v_w8_loop
3543    RET
3544.v_w16:
3545    mova                m12, [spel_v_perm16b]
3546    vbroadcasti32x4      m1, [srcq+strideq*0]
3547    mov                 r6d, 0x0f
3548    vbroadcasti32x4     ym4, [srcq+strideq*1]
3549    vbroadcasti32x4      m2, [srcq+strideq*2]
3550    kmovb                k1, r6d
3551    vbroadcasti32x4     ym5, [srcq+stride3q ]
3552    lea                srcq, [srcq+strideq*4]
3553    vbroadcasti32x4      m3, [srcq+strideq*0]
3554    vbroadcasti32x4     ym6, [srcq+strideq*1]
3555    vbroadcasti32x4      m0, [srcq+strideq*2]
3556    vshufpd          m1{k1}, m4, m2, 0xcc
3557    vshufpd          m2{k1}, m5, m3, 0xcc
3558    vshufpd          m3{k1}, m6, m0, 0xcc
3559    vpermb               m1, m12, m1 ; 01 12
3560    vpermb               m2, m12, m2 ; 23 34
3561    vpermb               m3, m12, m3 ; 45 56
3562.v_w16_loop:
3563    pmaddubsw            m4, m1, m8  ; a0 b0
3564    mova                 m1, m3
3565    pmaddubsw           m13, m2, m9  ; a1 b1
3566    vbroadcasti32x4     ym6, [srcq+stride3q ]
3567    pmaddubsw            m5, m2, m8  ; c0 d0
3568    lea                srcq, [srcq+strideq*4]
3569    pmaddubsw           m14, m3, m9  ; c1 d1
3570    vbroadcasti32x4      m3, [srcq+strideq*0]
3571    vshufpd          m0{k1}, m6, m3, 0xcc
3572    vbroadcasti32x4     ym6, [srcq+strideq*1]
3573    vpermb               m2, m12, m0 ; 67 78
3574    vbroadcasti32x4      m0, [srcq+strideq*2]
3575    vshufpd          m3{k1}, m6, m0, 0xcc
3576    paddw                m4, m13
3577    pmaddubsw           m13, m1, m10 ; a2 b2
3578    vpermb               m3, m12, m3 ; 89 9a
3579    paddw                m5, m14
3580    pmaddubsw           m14, m2, m10 ; c2 d2
3581    pmaddubsw           m15, m2, m11 ; a3 b3
3582    pmaddubsw            m6, m3, m11 ; c3 d3
3583    paddw                m4, m13
3584    paddw                m5, m14
3585    paddw                m4, m15
3586    paddw                m5, m6
3587    pmulhrsw             m4, m7
3588    pmulhrsw             m5, m7
3589    mova          [tmpq+ 0], m4
3590    mova          [tmpq+64], m5
3591    add                tmpq, 64*2
3592    sub                  hd, 4
3593    jg .v_w16_loop
3594    RET
3595.v_w32:
3596    movshdup            m21, [bilin_v_perm64]
3597    movu               ym16, [srcq+strideq*0]
3598    movu               ym17, [srcq+strideq*1]
3599    movu               ym18, [srcq+strideq*2]
3600    add                srcq, stride3q
3601    movu               ym19, [srcq+strideq*0]
3602    vpermt2q            m16, m21, m19  ; 0 3
3603    movu               ym20, [srcq+strideq*1]
3604    vpermt2q            m17, m21, m20  ; 1 4
3605    movu               ym20, [srcq+strideq*2]
3606    add                srcq, stride3q
3607    vpermt2q            m18, m21, m20  ; 2 5
3608    movu               ym20, [srcq+strideq*0]
3609    vpermt2q            m19, m21, m20  ; 3 6
3610    punpcklbw            m0, m16, m17  ; 01
3611    punpcklbw            m1, m17, m18  ; 12
3612    punpcklbw            m2, m18, m19  ; 23
3613    punpckhbw            m3, m16, m17  ; 34
3614    punpckhbw            m4, m17, m18  ; 45
3615    punpckhbw            m5, m18, m19  ; 56
3616.v_w32_loop:
3617    movu               ym16, [srcq+strideq*1]
3618    lea                srcq, [srcq+strideq*2]
3619    movu               ym17, [srcq+strideq*0]
3620    pmaddubsw           m14, m0, m8
3621    mova                 m0, m2
3622    pmaddubsw           m15, m1, m8
3623    mova                 m1, m3
3624    pmaddubsw            m2, m9
3625    vpermt2q            m16, m21, m17  ; 7 8
3626    pmaddubsw            m3, m9
3627    pmaddubsw           m12, m4, m10
3628    pmaddubsw           m13, m5, m10
3629    shufpd              m19, m16, 0x55 ; 6 7
3630    paddw               m14, m2
3631    mova                 m2, m4
3632    punpcklbw            m4, m19, m16  ; 67
3633    paddw               m15, m3
3634    mova                 m3, m5
3635    punpckhbw            m5, m19, m16  ; 78
3636    paddw               m14, m12
3637    paddw               m15, m13
3638    pmaddubsw           m12, m4, m11
3639    pmaddubsw           m13, m5, m11
3640    mova                m19, m16
3641    paddw               m14, m12
3642    paddw               m15, m13
3643    pmulhrsw            m14, m7
3644    pmulhrsw            m15, m7
3645    mova          [tmpq+ 0], m14
3646    mova          [tmpq+64], m15
3647    add                tmpq, 64*2
3648    sub                  hd, 2
3649    jg .v_w32_loop
3650    vzeroupper
3651    RET
3652.v_w64:
3653.v_w128:
3654    WIN64_SPILL_XMM      24
3655    mova                m23, [bilin_v_perm64]
3656    add                  wd, wd
3657    lea                 r6d, [hq+wq]
3658.v_loop0:
3659    vpermq              m12, m23, [srcq+strideq*0]
3660    vpermq              m13, m23, [srcq+strideq*1]
3661    lea                  r5, [srcq+strideq*2]
3662    vpermq              m14, m23, [r5  +strideq*0]
3663    vpermq              m15, m23, [r5  +strideq*1]
3664    lea                  r5, [r5+strideq*2]
3665    vpermq              m16, m23, [r5  +strideq*0]
3666    vpermq              m17, m23, [r5  +strideq*1]
3667    lea                  r5, [r5+strideq*2]
3668    vpermq              m18, m23, [r5  +strideq*0]
3669    mov                  r7, tmpq
3670    punpcklbw            m0, m12, m13 ; 01
3671    punpckhbw           m12, m13
3672    punpcklbw            m1, m13, m14 ; 12
3673    punpckhbw           m13, m14
3674    punpcklbw            m2, m14, m15 ; 23
3675    punpckhbw           m14, m15
3676    punpcklbw            m3, m15, m16 ; 34
3677    punpckhbw           m15, m16
3678    punpcklbw            m4, m16, m17 ; 45
3679    punpckhbw           m16, m17
3680    punpcklbw            m5, m17, m18 ; 56
3681    punpckhbw           m17, m18
3682.v_loop:
3683    pmaddubsw           m19, m0, m8   ; a0
3684    vpermq               m6, m23, [r5+strideq*1]
3685    pmaddubsw           m20, m12, m8
3686    mova                 m0, m2
3687    pmaddubsw            m2, m9       ; a1
3688    mova                m12, m14
3689    pmaddubsw           m14, m9
3690    lea                  r5, [r5+strideq*2]
3691    pmaddubsw           m21, m1, m8   ; b0
3692    pmaddubsw           m22, m13, m8
3693    mova                 m1, m3
3694    pmaddubsw            m3, m9       ; b1
3695    mova                m13, m15
3696    pmaddubsw           m15, m9
3697    paddw               m19, m2
3698    mova                 m2, m4
3699    pmaddubsw            m4, m10      ; a2
3700    paddw               m20, m14
3701    mova                m14, m16
3702    pmaddubsw           m16, m10
3703    paddw               m21, m3
3704    mova                 m3, m5
3705    pmaddubsw            m5, m10      ; b2
3706    paddw               m22, m15
3707    mova                m15, m17
3708    pmaddubsw           m17, m10
3709    paddw               m19, m4
3710    punpcklbw            m4, m18, m6  ; 67
3711    paddw               m20, m16
3712    punpckhbw           m16, m18, m6
3713    vpermq              m18, m23, [r5+strideq*0]
3714    paddw               m21, m5
3715    pmaddubsw            m5, m4, m11  ; a3
3716    paddw               m22, m17
3717    pmaddubsw           m17, m16, m11
3718    paddw               m19, m5
3719    punpcklbw            m5, m6, m18  ; 78
3720    paddw               m20, m17
3721    punpckhbw           m17, m6, m18
3722    pmaddubsw            m6, m5, m11  ; b3
3723    paddw               m21, m6
3724    pmaddubsw            m6, m17, m11
3725    paddw               m22, m6
3726    REPX   {pmulhrsw x, m7}, m19, m20, m21, m22
3727    mova       [r7+wq*0+ 0], m19
3728    mova       [r7+wq*0+64], m20
3729    mova       [r7+wq*1+ 0], m21
3730    mova       [r7+wq*1+64], m22
3731    lea                  r7, [r7+wq*2]
3732    sub                  hd, 2
3733    jg .v_loop
3734    add                srcq, 64
3735    add                tmpq, 128
3736    movzx                hd, r6b
3737    sub                 r6d, 1<<8
3738    jg .v_loop0
3739    RET
3740.h:
3741    RESET_STACK_STATE
3742    test                myd, 0xf00
3743    jnz .hv
3744.h2:
3745    vpbroadcastd         m4, [pd_2]
3746    cmp                  wd, 4
3747    je .h_w4
3748    tzcnt                wd, wd
3749    shr                 mxd, 16
3750    sub                srcq, 3
3751    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
3752    vpbroadcastd         m8, [base+subpel_filters+mxq*8+0]
3753    vpbroadcastd         m9, [base+subpel_filters+mxq*8+4]
3754    add                  wq, r7
3755    jmp                  wq
3756.h_w4:
3757    movzx               mxd, mxb
3758    vbroadcasti128      ym5, [subpel_h_shufA]
3759    mov                 r3d, 0x4
3760    dec                srcq
3761    vpbroadcastd        ym6, [base+subpel_filters+mxq*8+2]
3762    kmovb                k1, r3d
3763    lea            stride3q, [strideq*3]
3764.h_w4_loop:
3765    movq                xm2, [srcq+strideq*0]
3766    movq                xm3, [srcq+strideq*1]
3767    vpbroadcastq    ym2{k1}, [srcq+strideq*2]
3768    vpbroadcastq    ym3{k1}, [srcq+stride3q ]
3769    lea                srcq, [srcq+strideq*4]
3770    pshufb              ym2, ym5
3771    pshufb              ym3, ym5
3772    mova                ym0, ym4
3773    vpdpbusd            ym0, ym2, ym6
3774    mova                ym1, ym4
3775    vpdpbusd            ym1, ym3, ym6
3776    packssdw            ym0, ym1
3777    psraw               ym0, 2
3778    mova             [tmpq], ym0
3779    add                tmpq, 32
3780    sub                  hd, 4
3781    jg .h_w4_loop
3782    RET
3783.h_w8:
3784    vbroadcasti128       m5, [subpel_h_shufA]
3785    vbroadcasti128       m6, [subpel_h_shufB]
3786    vbroadcasti128       m7, [subpel_h_shufC]
3787    lea            stride3q, [strideq*3]
3788.h_w8_loop:
3789    movu               xmm3, [srcq+strideq*0]
3790    vinserti128         ym3, ymm3, [srcq+strideq*1], 1
3791    vinserti128          m3, [srcq+strideq*2], 2
3792    vinserti128          m3, [srcq+stride3q ], 3
3793    lea                srcq, [srcq+strideq*4]
3794    pshufb               m1, m3, m5
3795    pshufb               m2, m3, m6
3796    mova                 m0, m4
3797    vpdpbusd             m0, m1, m8
3798    mova                 m1, m4
3799    vpdpbusd             m1, m2, m8
3800    pshufb               m3, m7
3801    vpdpbusd             m0, m2, m9
3802    vpdpbusd             m1, m3, m9
3803    packssdw             m0, m1
3804    psraw                m0, 2
3805    mova             [tmpq], m0
3806    add                tmpq, 64
3807    sub                  hd, 4
3808    jg .h_w8_loop
3809    RET
3810.h_w16:
3811    mova                 m5, [spel_h_perm16]
3812    vpbroadcastd         m7, [pb_4]
3813    lea            stride3q, [strideq*3]
3814    paddb                m6, m7, m5
3815    paddb                m7, m6
3816.h_w16_loop:
3817    movu                ym0, [srcq+strideq*0]
3818    movu                ym1, [srcq+strideq*2]
3819    vinserti32x8         m0, [srcq+strideq*1], 1
3820    vinserti32x8         m1, [srcq+stride3q ], 1
3821    lea                srcq, [srcq+strideq*4]
3822    PREP_8TAP_H
3823    add                tmpq, 64*2
3824    sub                  hd, 4
3825    jg .h_w16_loop
3826    RET
3827.h_w32:
3828    mova                 m5, [spel_h_perm32]
3829    vpbroadcastd         m7, [pb_4]
3830    paddb                m6, m7, m5
3831    paddb                m7, m6
3832.h_w32_loop:
3833    movu                 m0, [srcq+strideq*0]
3834    movu                 m1, [srcq+strideq*1]
3835    lea                srcq, [srcq+strideq*2]
3836    PREP_8TAP_H
3837    add                tmpq, 64*2
3838    sub                  hd, 2
3839    jg .h_w32_loop
3840    RET
3841.h_w64:
3842    xor                 r6d, r6d
3843    jmp .h_start
3844.h_w128:
3845    mov                  r6, -64*1
3846.h_start:
3847    mova                 m5, [spel_h_perm32]
3848    vpbroadcastd         m7, [pb_4]
3849    sub                srcq, r6
3850    paddb                m6, m7, m5
3851    paddb                m7, m6
3852.h_loop0:
3853    mov                  r5, r6
3854.h_loop:
3855    movu                 m0, [srcq+r5+32*0]
3856    movu                 m1, [srcq+r5+32*1]
3857    PREP_8TAP_H
3858    add                tmpq, 64*2
3859    add                  r5, 64
3860    jle .h_loop
3861    add                srcq, strideq
3862    dec                  hd
3863    jg .h_loop0
3864    RET
3865.hv:
3866    RESET_STACK_STATE
3867    vpbroadcastd         m8, [pd_2]
3868    vpbroadcastd         m9, [pd_32]
3869    cmp                  wd, 4
3870    jg .hv_w8
3871    movzx               mxd, mxb
3872    dec                srcq
3873    vpbroadcastd        m11, [base+subpel_filters+mxq*8+2]
3874    movzx               mxd, myb
3875    shr                 myd, 16
3876    cmp                  hd, 4
3877    cmove               myd, mxd
3878    vpbroadcastq         m0, [base+subpel_filters+myq*8]
3879    lea            stride3q, [strideq*3]
3880    sub                srcq, stride3q
3881    mov                 r3d, 0x04
3882    kmovb                k1, r3d
3883    kshiftlb             k2, k1, 2
3884    kshiftlb             k3, k1, 4
3885    vbroadcasti128      m10, [subpel_h_shufA]
3886    punpcklbw            m0, m0
3887    psraw                m0, 8 ; sign-extend
3888    pshufd              m12, m0, q0000
3889    pshufd              m13, m0, q1111
3890    pshufd              m14, m0, q2222
3891    pshufd              m15, m0, q3333
3892    movq                xm3, [srcq+strideq*0]
3893    vpbroadcastq        ym2, [srcq+strideq*1]
3894    vpbroadcastq    ym3{k1}, [srcq+strideq*2]
3895    vpbroadcastq     m2{k2}, [srcq+stride3q ]
3896    lea                srcq, [srcq+strideq*4]
3897    vpbroadcastq     m3{k2}, [srcq+strideq*0]
3898    vpbroadcastq     m2{k3}, [srcq+strideq*1]
3899    vpbroadcastq     m3{k3}, [srcq+strideq*2]
3900    mova                 m6, [spel_hv_perm4a]
3901    movu                 m7, [spel_hv_perm4b]
3902    mova                 m0, m8
3903    mova                 m1, m8
3904    pshufb               m2, m10
3905    pshufb               m3, m10
3906    vpdpbusd             m0, m2, m11
3907    vpdpbusd             m1, m3, m11
3908    packssdw             m0, m1        ; _ 0  1 2  3 4  5 6
3909    psraw                m0, 2
3910    vpermb               m1, m6, m0    ; 01 12 23 34
3911    vpermb               m2, m7, m0    ; 23 34 45 56
3912.hv_w4_loop:
3913    movq                xm3, [srcq+stride3q ]
3914    lea                srcq, [srcq+strideq*4]
3915    movq                xm4, [srcq+strideq*0]
3916    vpbroadcastq    ym3{k1}, [srcq+strideq*1]
3917    vpbroadcastq    ym4{k1}, [srcq+strideq*2]
3918    mova                 m5, m9
3919    pshufb              ym3, ym10
3920    vpdpwssd             m5, m1, m12   ; a0 b0 c0 d0
3921    mova                ym1, ym8
3922    pshufb              ym4, ym10
3923    vpdpbusd            ym1, ym3, ym11
3924    mova                ym3, ym8
3925    vpdpbusd            ym3, ym4, ym11
3926    vpdpwssd             m5, m2, m13   ; a1 b1 c1 d1
3927    packssdw            ym1, ym3       ; 7 8  9 a
3928    psraw               ym1, 2
3929    vshufi32x4           m0, m1, q1032 ; _ 4  5 6  7 8  9 a
3930    vpermb               m1, m6, m0    ; 45 56 67 78
3931    vpermb               m2, m7, m0    ; 67 78 89 9a
3932    vpdpwssd             m5, m1, m14   ; a2 b2 c2 d2
3933    vpdpwssd             m5, m2, m15   ; a3 b3 c3 d3
3934    psrad                m5, 6
3935    vpmovdw          [tmpq], m5
3936    add                tmpq, 32
3937    sub                  hd, 4
3938    jg .hv_w4_loop
3939    RET
3940.hv_w8:
3941    shr                 mxd, 16
3942    sub                srcq, 3
3943    vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
3944    vpbroadcastd        m11, [base+subpel_filters+mxq*8+4]
3945    movzx               mxd, myb
3946    shr                 myd, 16
3947    cmp                  hd, 4
3948    cmove               myd, mxd
3949    vpbroadcastq         m0, [base+subpel_filters+myq*8]
3950    lea            stride3q, [strideq*3]
3951    sub                srcq, stride3q
3952    punpcklbw            m0, m0
3953    psraw                m0, 8 ; sign-extend
3954    pshufd              m12, m0, q0000
3955    pshufd              m13, m0, q1111
3956    pshufd              m14, m0, q2222
3957    pshufd              m15, m0, q3333
3958    cmp                  wd, 8
3959    jg .hv_w16
3960    vbroadcasti32x4     m17, [srcq+stride3q ]
3961    vinserti32x4        m16, m17, [srcq+strideq*0], 0
3962    vbroadcasti32x4     m19, [subpel_h_shufA]
3963    vinserti32x4        m16, [srcq+strideq*1], 1
3964    vbroadcasti32x4     m21, [subpel_h_shufC]
3965    vinserti32x4        m16, [srcq+strideq*2], 2
3966    lea                srcq, [srcq+strideq*4]
3967    vinserti128        ym17, [srcq+strideq*0], 1
3968    vbroadcasti32x4     m20, [subpel_h_shufB]
3969    vinserti32x4        m17, [srcq+strideq*1], 2
3970    vinserti32x4        m17, [srcq+strideq*2], 3
3971    pshufb               m3, m16, m19      ; 0 1 2 3   0123
3972    mova                 m2, m8
3973    pshufb               m0, m16, m21      ; 0 1 2 3   89ab
3974    vpdpbusd             m2, m3, m10
3975    mova                 m3, m8
3976    pshufb               m1, m17, m19      ; 3 4 5 6   0123
3977    vpdpbusd             m3, m0, m11
3978    mova                 m0, m8
3979    pshufb               m4, m17, m21      ; 3 4 5 6   89ab
3980    vpdpbusd             m0, m1, m10
3981    mova                 m1, m8
3982    pshufb              m16, m20           ; 0 1 2 3   4567
3983    vpdpbusd             m1, m4, m11
3984    pshufb              m17, m20           ; 3 4 5 6   4567
3985    vpdpbusd             m2, m16, m11
3986    vpdpbusd             m3, m16, m10
3987    vpdpbusd             m0, m17, m11
3988    vpdpbusd             m1, m17, m10
3989    packssdw             m2, m3
3990    packssdw             m0, m1
3991    psraw                m2, 2             ; 0 1 2 3
3992    psraw                m0, 2             ; 3 4 5 6
3993    vshufi32x4           m4, m2, m0, q2132 ; 2 3 4 5
3994    vshufi32x4           m5, m2, m0, q1021 ; 1 2 3 4
3995    punpcklwd            m3, m4, m0        ; 23 34 45 56
3996    punpckhwd            m4, m0
3997    punpcklwd            m1, m2, m5        ; 01 12 23 34
3998    punpckhwd            m2, m5
3999.hv_w8_loop:
4000    movu               xm18, [srcq+stride3q ]
4001    lea                srcq, [srcq+strideq*4]
4002    vinserti128        ym18, [srcq+strideq*0], 1
4003    vinserti32x4        m18, [srcq+strideq*1], 2
4004    vinserti32x4        m18, [srcq+strideq*2], 3
4005    pshufb              m17, m18, m19      ; 7 8 9 a   0123
4006    mova                m16, m8
4007    pshufb               m5, m18, m21      ; 7 8 9 a   89ab
4008    vpdpbusd            m16, m17, m10
4009    mova                m17, m8
4010    pshufb              m18, m20           ; 7 8 9 a   4567
4011    vpdpbusd            m17, m5, m11
4012    mova                 m5, m9
4013    vpdpwssd             m5, m3, m13       ; a1 b1 c1 d1
4014    mova                 m6, m9
4015    vpdpwssd             m6, m4, m13
4016    vpdpbusd            m16, m18, m11
4017    vpdpbusd            m17, m18, m10
4018    vpdpwssd             m5, m1, m12       ; a0 b0 c0 d0
4019    mova                 m1, m3
4020    vpdpwssd             m6, m2, m12
4021    mova                 m2, m4
4022    packssdw            m16, m17
4023    psraw               m16, 2             ; 7 8 9 a
4024    valignq              m4, m16, m0, 6    ; 6 7 8 9
4025    mova                 m0, m16
4026    punpcklwd            m3, m4, m16       ; 67 78 89 9a
4027    punpckhwd            m4, m16
4028    vpdpwssd             m5, m3, m15       ; a3 b3 c3 d3
4029    vpdpwssd             m6, m4, m15
4030    vshufi32x4           m1, m3, q1032     ; 45 56 67 78
4031    vshufi32x4           m2, m4, q1032
4032    vpdpwssd             m5, m1, m14       ; a2 b2 c2 d2
4033    vpdpwssd             m6, m2, m14
4034    psrad                m5, 6
4035    psrad                m6, 6
4036    packssdw             m5, m6
4037    mova             [tmpq], m5
4038    add                tmpq, 64
4039    sub                  hd, 4
4040    jg .hv_w8_loop
4041    vzeroupper
4042    RET
4043.hv_w16:
4044    WIN64_SPILL_XMM      23
4045    mova                m16, [spel_h_perm16]
4046    vpbroadcastd        m18, [pb_4]
4047    add                  wd, wd
4048    paddb               m17, m18, m16
4049    lea                 r6d, [hq+wq*8-256]
4050    paddb               m18, m17
4051.hv_w16_loop0:
4052    movu               ym19, [srcq+strideq*0]
4053    vinserti32x8        m19, [srcq+strideq*1], 1
4054    lea                  r5, [srcq+strideq*2]
4055    movu               ym20, [r5  +strideq*0]
4056    vinserti32x8        m20, [r5  +strideq*1], 1
4057    lea                  r5, [r5  +strideq*2]
4058    movu               ym21, [r5  +strideq*0]
4059    vinserti32x8        m21, [r5  +strideq*1], 1
4060    lea                  r5, [r5  +strideq*2]
4061    movu               ym22, [r5  +strideq*0]
4062    mov                  r7, tmpq
4063    vpermb               m3, m16, m19      ; 0 1   0123   89ab
4064    mova                 m2, m8
4065    vpermb               m4, m18, m19      ; 0 1   89ab   ghij
4066    vpdpbusd             m2, m3, m10
4067    mova                 m3, m8
4068    vpermb               m5, m16, m20      ; 2 3   0123   89ab
4069    vpdpbusd             m3, m4, m11
4070    mova                 m4, m8
4071    vpermb               m6, m18, m20      ; 2 3   89ab   ghij
4072    vpdpbusd             m4, m5, m10
4073    mova                 m5, m8
4074    vpermb               m7, m16, m21      ; 4 5   0123   89ab
4075    vpdpbusd             m5, m6, m11
4076    mova                 m6, m8
4077    vpermb               m0, m18, m21      ; 4 5   89ab   ghij
4078    vpdpbusd             m6, m7, m10
4079    mova                 m7, m8
4080    vpermb              ym1, ym16, ym22    ; 6     0123   89ab
4081    vpdpbusd             m7, m0, m11
4082    mova                ym0, ym8
4083    vpermb              m19, m17, m19      ; 0 1   4567   cdef
4084    vpdpbusd            ym0, ym1, ym10
4085    vpermb              ym1, ym18, ym22    ; 6     89ab   ghij
4086    vpdpbusd             m2, m19, m11
4087    vpdpbusd             m3, m19, m10
4088    mova               ym19, ym8
4089    vpermb              m20, m17, m20      ; 2 3   4567   cdef
4090    vpdpbusd           ym19, ym1, ym11
4091    vpermb              m21, m17, m21      ; 4 5   4567   cdef
4092    vpdpbusd             m4, m20, m11
4093    vpdpbusd             m5, m20, m10
4094    vpermb             ym22, ym17, ym22    ; 6     4567   cdef
4095    vpdpbusd             m6, m21, m11
4096    vpdpbusd             m7, m21, m10
4097    packssdw             m2, m3            ; 0 1
4098    vpdpbusd            ym0, ym22, ym11
4099    packssdw             m4, m5            ; 2 3
4100    vpdpbusd           ym19, ym22, ym10
4101    packssdw             m6, m7            ; 4 5
4102    packssdw            ym0, ym19          ; 6
4103    REPX       {psraw x, 2}, m2, m4, m6, ym0
4104    vshufi32x4           m3, m2, m4, q1032 ; 1 2
4105    vshufi32x4           m5, m4, m6, q1032 ; 3 4
4106    vshufi32x4           m0, m6, m0, q1032 ; 5 6
4107    punpcklwd            m1, m2, m3  ; 01 12
4108    punpckhwd            m2, m3
4109    punpcklwd            m3, m4, m5  ; 23 34
4110    punpckhwd            m4, m5
4111    punpcklwd            m5, m6, m0  ; 45 56
4112    punpckhwd            m6, m0
4113.hv_w16_loop:
4114    movu               ym19, [r5+strideq*1]
4115    lea                  r5, [r5+strideq*2]
4116    vinserti32x8        m19, [r5+strideq*0], 1
4117    mova                m20, m9
4118    vpdpwssd            m20, m1, m12 ; a0
4119    vpermb               m1, m16, m19
4120    mova                m21, m9
4121    vpdpwssd            m21, m2, m12 ; b0
4122    vpermb               m2, m17, m19
4123    mova                m22, m8
4124    vpdpbusd            m22, m1, m10
4125    mova                 m1, m8
4126    vpermb              m19, m18, m19
4127    vpdpbusd             m1, m2, m10
4128    vpdpwssd            m20, m3, m13 ; a1
4129    vpdpwssd            m21, m4, m13 ; b1
4130    vpdpbusd            m22, m2, m11
4131    mova                 m2, m4
4132    vpdpbusd             m1, m19, m11
4133    mova                 m4, m6
4134    vpdpwssd            m20, m5, m14 ; a2
4135    vpdpwssd            m21, m6, m14 ; b2
4136    packssdw            m22, m1
4137    mova                 m1, m3
4138    psraw               m22, 2              ; 7 8
4139    mova                 m3, m5
4140    vshufi32x4           m6, m0, m22, q1032 ; 6 7
4141    mova                 m0, m22
4142    punpcklwd            m5, m6, m0  ; 67 78
4143    punpckhwd            m6, m0
4144    vpdpwssd            m20, m5, m15 ; a3
4145    vpdpwssd            m21, m6, m15 ; b3
4146    psrad               m20, 6
4147    psrad               m21, 6
4148    packssdw            m20, m21
4149    mova          [r7+wq*0], ym20
4150    vextracti32x8 [r7+wq*1], m20, 1
4151    lea                  r7, [r7+wq*2]
4152    sub                  hd, 2
4153    jg .hv_w16_loop
4154    add                srcq, 16
4155    add                tmpq, 32
4156    movzx                hd, r6b
4157    sub                 r6d, 1<<8
4158    jg .hv_w16_loop0
4159    RET
4160
;-----------------------------------------------------------------------
; warp_affine_8x8t ("prep"/temp variant): produces an 8x8 warped block as
; intermediate 16-bit data in tmp (stride ts, in units of int16_t) instead
; of 8-bit pixels. Shares the filter core with warp_affine_8x8 below via
; its .main/.main2 entry points; .main also initializes r6d (0x5555, the
; k-mask constant) which is reused here as the row-loop counter.
;-----------------------------------------------------------------------
cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
    vpbroadcastd         m9, [pd_16384]   ; vertical rounding bias for the tmp path (dst path uses pd_262144)
    mova               ym15, [warp_8x8t_end]
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main
    jmp .start
.loop:
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2
    lea                tmpq, [tmpq+tsq*4]
.start:
    ; Each iteration stores two output rows (rows interleaved across zmm
    ; lanes); warp_8x8t_end permutes the doubled dwords into packed words.
    paddd               m16, m16
    vpermb              m16, m15, m16
    mova         [tmpq+tsq*0], xm16
    vextracti128 [tmpq+tsq*2], ym16, 1
    sub                 r6d, 0x1800       ; 0x5555 - 3*0x1800 > 0 -> 4 iterations = 8 rows total
    jg .loop
    RET
4177
;-----------------------------------------------------------------------
; warp_affine_8x8: 8-tap separable warp of an 8x8 block.
;   dst/ds = destination pixels + stride, src/ss = source + stride,
;   abcd   = packed affine deltas (alpha/beta at +0, gamma/delta at +4),
;   mx/my  = start filter positions (stack args r6m/r7m).
; .main performs setup plus the first 4 horizontal rows (2 per .h call,
; rows packed in zmm lanes); .main2 filters one more row pair and applies
; the 8-tap vertical filter, accumulating into m16. r6d doubles as the
; k1-mask constant (0x5555) and the caller's loop counter.
;-----------------------------------------------------------------------
cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
    vpbroadcastd         m9, [pd_262144]  ; vertical rounding bias; output >> 19 below
    mova               xm15, [warp_8x8_end]
    call .main
    jmp .start
.loop:
    call .main2
    lea                dstq, [dstq+dsq*2]
.start:
    ; Two rows per iteration: round/shift, clamp to u8, reorder, store.
    psrad               m16, 19
    packuswb            m16, m16
    vpermb              m16, m15, m16
    movq       [dstq+dsq*0], xm16
    movhps     [dstq+dsq*1], xm16
    sub                 r6d, 0x1800       ; 4 iterations (r6d starts at 0x5555)
    jg .loop
    RET
ALIGN function_align
.main:
    vpbroadcastd         m1, [pd_512]     ; horizontal rounding bias (mx/my are scaled by 1024)
%if WIN64
    mov               abcdq, r5mp
    vpaddd             ym18, ym1, r6m {1to8} ; mx
%else
    add                 r5d, 512
    vpbroadcastd       ym18, r5d
%endif
    vpaddd             ym20, ym1, r7m {1to8} ; my
    mova               ym16, [pd_0to7]
    vpbroadcastd       ym19, [abcdq+4*0]
    vpbroadcastd       ym21, [abcdq+4*1]
    lea                  r4, [ssq*3+3]
    mova                m10, [warp_8x8_permA]
    mov                 r6d, 0x5555           ; k1 mask; also reused as loop counter by callers
    mova                m11, [warp_8x8_permB]
    lea             filterq, [mc_warp_filter+64*8]
    vpbroadcastq        m12, [warp_8x8_hpack]
    sub                srcq, r4               ; src -= src_stride*3 + 3
    vbroadcasti32x4     m13, [warp_8x8_permC]
    kxnorb               k2, k2, k2           ; all-ones gather masks (vpgatherdq clears its mask)
    vbroadcasti32x4     m14, [warp_8x8_permD]
    ; per-column filter positions: mx + alpha*[0..7], my + gamma*[0..7]
    vpdpwssd           ym18, ym19, ym16       ; alpha
    vpdpwssd           ym20, ym21, ym16       ; gamma
    vbroadcasti32x4      m0, [srcq]
    psrad              ym19, 16               ; beta
    psrad              ym21, 16               ; delta
    kmovw                k1, r6d
    psrad              ym16, ym18, 10         ; >> 10: filter table index
    kmovb                k3, k2
    paddd              ym18, ym19             ; advance positions by beta for the next row
    vpgatherdq       m2{k2}, [filterq+ym16*8] ; filter_x0
    psrld                m1, 8                ; pd_2
    pshufb               m0, m11
    paddd                m8, m1, m1           ; pd_4 (horizontal bias used by .h)
    vpdpbusd             m1, m0, m2           ; 8-tap horizontal dot product, row 0
    call .h
    ; merge row 0 with rows 1-2 into the 01/12 pair layout
    psllq                m2, m1, 45
    pslld                m1, 13
    paddd                m1, m2
    vpshrdq              m1, m0, 48           ; 01 12
    call .h
    vpshrdq              m2, m1, m0, 48       ; 23 34
    call .h
    vpshrdq              m3, m2, m0, 48       ; 45 56
.main2:
    call .h
    ; gather two vertical filter rows (positions advance by delta each row)
    psrad              ym17, ym20, 10
    kmovb                k2, k3
    paddd              ym20, ym21
    vpgatherdq       m7{k3}, [filterq+ym17*8] ; filter_y0
    psrad              ym16, ym20, 10
    kmovb                k3, k2
    paddd              ym20, ym21
    vpgatherdq      m17{k2}, [filterq+ym16*8] ; filter_y1
    shufps               m5, m7, m17, q2020   ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
    mova                m16, m9               ; start vertical accumulator at rounding bias
    pshufb               m4, m5, m13          ;    a0    a1    A0    A1    b0    b1    B0    B1
    vpdpwssd            m16, m1, m4
    pshufb               m5, m14              ;    a2    a3    A2    A3    b2    b3    B2    B3
    mova                 m1, m2               ; rotate the row-pair history (01..78) down by one
    vpdpwssd            m16, m2, m5
    shufps               m5, m7, m17, q3131   ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
    mova                 m2, m3
    pshufb               m4, m5, m13          ;    a4    a5    A4    A5    b4    b5    B4    B5
    vpdpwssd            m16, m3, m4
    vpshrdq              m3, m0, 48           ; 67 78
    pshufb               m5, m14              ;    a6    a7    A6    A7    b6    b7    B6    B7
    vpdpwssd            m16, m3, m5
    ret
ALIGN function_align
.h:
    ; Horizontal 8-tap filter for the next two source rows (a and b),
    ; with per-column filters gathered from mc_warp_filter.
    movu                xm5, [srcq+ssq*1]
    psrad              ym16, ym18, 10
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym5, [srcq+ssq*0], 1
    kmovb                k2, k3
    paddd              ym18, ym19
    vpgatherdq       m6{k3}, [filterq+ym16*8] ; filter_x1
    psrad              ym17, ym18, 10
    kmovb                k3, k2
    paddd              ym18, ym19
    vpgatherdq      m16{k2}, [filterq+ym17*8] ; filter_x2
    mova                 m0, m8               ; pd_4 bias
    vpermb               m4, m10, m5          ; a4 b0 a5 b1   a6 b2 a7 b3   a8 b4 a9 b5   aa b6 ab b7
    vpshldq             m17, m16, m6, 32      ; a4 a5 a6 a7   b0 b1 b2 b3
    vpdpbusd             m0, m4, m17
    vpermb               m5, m11, m5          ; a0 b4 a1 b5   a2 b6 a3 b7   a4 b8 a5 b9   a6 ba a7 bb
    vmovdqa32       m16{k1}, m6               ; a0 a1 a2 a3   b4 b5 b6 b7
    vpdpbusd             m0, m5, m16
    vpmultishiftqb       m0, m12, m0          ; 1 1 2 2 (>> 3)
    ret
4289
; Store skeleton shared by the bidirectional averaging functions
; (avg/w_avg/mask). %1 is the per-block compute macro (AVG/W_AVG/MASK):
; "%1 N" fills m0 with 64 output pixels read at tmp offset N, and
; "%1_INC_PTR N" advances the tmp (and mask) pointers. Dispatch is via a
; per-width jump table already loaded into wq by the caller.
%macro BIDIR_FN 1 ; op
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    cmp                  hd, 8
    jg .w4_h16
    WRAP_YMM %1           0
    vextracti32x4       xm1, ym0, 1
    movd   [dstq          ], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    jl .w4_ret                            ; flags still from "cmp hd, 8": h < 8 means only 4 rows
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq          ], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.w4_ret:
    RET
.w4_h16:
    ; w4/h16 fits exactly in one zmm; scatter 16 dwords, one per row.
    vpbroadcastd         m7, strided
    pmulld               m7, [bidir_sctr_w4]
    %1                    0
    kxnorw               k1, k1, k1
    vpscatterdd [dstq+m7]{k1}, m0
    RET
.w8:
    cmp                  hd, 4
    jne .w8_h8
    WRAP_YMM %1           0
    vextracti32x4       xm1, ym0, 1
    movq   [dstq          ], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    RET
.w8_loop:
    %1_INC_PTR            2
    lea                dstq, [dstq+strideq*4]
.w8_h8:
    ; 8 rows of 8 pixels per zmm iteration
    %1                    0
    vextracti32x4       xm1, ym0, 1
    vextracti32x4       xm2, m0, 2
    vextracti32x4       xm3, m0, 3
    movq   [dstq          ], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm2
    movq   [dstq+stride3q ], xm3
    lea                dstq, [dstq+strideq*4]
    movhps [dstq          ], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    sub                  hd, 8
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR            2
    lea                dstq, [dstq+strideq*4]
.w16:
    ; packuswb interleaved the two source halves; q3120 restores row order
    %1                    0
    vpermq               m0, m0, q3120
    mova          [dstq          ], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                  hd, 4
    jg .w16_loop
    RET
.w32:
    pmovzxbq             m7, [pb_02461357]    ; de-interleave qword permutation
.w32_loop:
    %1                    0
    %1_INC_PTR            2
    vpermq               m0, m7, m0
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
.w64:
    pmovzxbq             m7, [pb_02461357]
.w64_loop:
    %1                    0
    %1_INC_PTR            2
    vpermq               m0, m7, m0
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
.w128:
    pmovzxbq             m7, [pb_02461357]
.w128_loop:
    ; two 64-pixel halves per row
    %1                    0
    vpermq               m6, m7, m0
    %1                    2
    mova        [dstq+64*0], m6
    %1_INC_PTR            4
    vpermq               m6, m7, m0
    mova        [dstq+64*1], m6
    add                dstq, strideq
    dec                  hd
    jg .w128_loop
    RET
%endmacro
4398
; Plain bidirectional average of two intermediate (prep) buffers.
; Reads 2*mmsize bytes from each of tmp1/tmp2 at offset %1 and packs
; the result into m0. With m4 = pw_1024 (set by avg_8bpc), pmulhrsw
; computes (x*1024*2 + 0x8000) >> 16 = (x + 16) >> 5, so the output is
; (t1 + t2 + 16) >> 5, clamped to u8 by packuswb.
%macro AVG 1 ; src_offset
    mova                 m0, [tmp1q+(%1+0)*mmsize]
    paddw                m0, [tmp2q+(%1+0)*mmsize]
    mova                 m1, [tmp1q+(%1+1)*mmsize]
    paddw                m1, [tmp2q+(%1+1)*mmsize]
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    packuswb             m0, m1
%endmacro
4408
; Advance both intermediate-buffer pointers by %1 vector registers' worth.
%macro AVG_INC_PTR 1
    add               tmp1q, %1*mmsize
    add               tmp2q, %1*mmsize
%endmacro
4413
; avg: dst = (tmp1 + tmp2 + 16) >> 5 (see AVG macro). wq is turned into a
; jump-table target (tzcnt of the width indexes the table); BIDIR_FN
; supplies the per-width store loops.
cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea                  r6, [avg_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m4, [base+pw_1024]  ; pmulhrsw factor: (x+16)>>5
    add                  wq, r6
    BIDIR_FN            AVG
4423
; Weighted average of two intermediate buffers. m4 holds the weight term
; prepared by w_avg_8bpc ((weight-16)<<12, or -weight<<12 with the
; buffers swapped for weight <= 7), m5 = pw_2048 for the final
; (x + 8) >> 4 rounding via pmulhrsw.
%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
    mova                 m0,     [tmp1q+(%1+0)*mmsize]
    psubw                m2, m0, [tmp2q+(%1+0)*mmsize]
    mova                 m1,     [tmp1q+(%1+1)*mmsize]
    psubw                m3, m1, [tmp2q+(%1+1)*mmsize]
    pmulhw               m2, m4
    pmulhw               m3, m4
    paddw                m0, m2
    paddw                m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

; w_avg reads the same buffer layout as avg, so it can share the pointer
; advance.
%define W_AVG_INC_PTR AVG_INC_PTR
4443
; w_avg: weighted blend of tmp1/tmp2 (see W_AVG macro for the math).
; For weight <= 7 the operands are swapped and the weight negated so a
; single pmulhw form covers the whole weight range.
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea                  r6, [w_avg_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m4, r6m ; weight
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m5, [base+pw_2048]
    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
    add                  wq, r6
    cmp           dword r6m, 7
    jg .weight_gt7
    ; weight <= 7: swap tmp1/tmp2 and use -weight << 12 instead
    mov                  r6, tmp1q
    pxor                 m0, m0
    mov               tmp1q, tmp2q
    psubw                m4, m0, m4 ; -weight
    mov               tmp2q, r6
.weight_gt7:
    BIDIR_FN          W_AVG
4463
; Per-pixel masked blend of two intermediate buffers using an external
; 6-bit mask (0..64). m4 = 0, m5 = pw_2048, and for the 64-byte path
; m8 = bilin_v_perm64 (all set by mask_8bpc). The mask bytes are negated
; (m4 - m) and widened via punpck against zero so pmulhw sees -m << 9;
; the derivation is in the comments below.
%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
%if mmsize == 64
    vpermq               m3, m8, [maskq+%1*32]
%else
    vpermq               m3,     [maskq+%1*16], q3120
%endif
    mova                 m0,     [tmp2q+(%1+0)*mmsize]
    psubw                m1, m0, [tmp1q+(%1+0)*mmsize]
    psubb                m3, m4, m3
    paddw                m1, m1     ; (b - a) << 1
    paddb                m3, m3
    punpcklbw            m2, m4, m3 ; -m << 9
    pmulhw               m1, m2
    paddw                m0, m1
    mova                 m1,     [tmp2q+(%1+1)*mmsize]
    psubw                m2, m1, [tmp1q+(%1+1)*mmsize]
    paddw                m2, m2
    punpckhbw            m3, m4, m3
    pmulhw               m2, m3
    paddw                m1, m2
    pmulhrsw             m0, m5     ; (x + 8) >> 4
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro
4491
; Advance the mask pointer (1 byte/pixel) and both intermediate-buffer
; pointers (2 bytes/pixel) past %1 processed vectors.
%macro MASK_INC_PTR 1
    add               maskq, %1*32
    add               tmp2q, %1*64
    add               tmp1q, %1*64
%endmacro
4497
; mask: blend tmp1/tmp2 with an explicit per-pixel mask (see MASK macro).
cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
    lea                  r7, [mask_avx512icl_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    mov               maskq, maskmp
    movsxd               wq, dword [r7+wq*4]
    pxor                 m4, m4
    ; m8: permutation aligning 32 mask bytes with the zmm pixel layout
    ; (used by MASK when mmsize == 64)
    mova                 m8, [base+bilin_v_perm64]
    vpbroadcastd         m5, [base+pw_2048]
    add                  wq, r7
    BIDIR_FN           MASK
4510
; w_mask: derive the blend mask from the difference of the two prep
; buffers, then blend. Produces the packed pixels in m%1 and the per-pixel
; "64 - m" values (two per word, merged by vpshldw) in m%2 for the caller
; to downsample into the output mask. m6 = pw_6903, m7 = pw_2048 (set by
; the w_mask_4xx entry points). With %5 set the mask is inverted via m5
; for the 4:4:4 path.
%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
    mova                m%1, [tmp1q+mmsize*%3]
    mova                 m1, [tmp2q+mmsize*%3]
    psubw                m1, m%1
    pabsw               m%2, m1
    psubusw             m%2, m6, m%2
    psrlw               m%2, 8 ; 64 - m
    psllw                m2, m%2, 10
    pmulhw               m1, m2          ; (t2 - t1) * ((64 - m) << 10) >> 16
    paddw               m%1, m1
    mova                 m1, [tmp1q+mmsize*%4]
    mova                 m2, [tmp2q+mmsize*%4]
    psubw                m2, m1
    pabsw                m3, m2
    psubusw              m3, m6, m3
    vpshldw             m%2, m3, 8       ; pack both halves' mask bytes into m%2
    psllw                m3, m%2, 10
%if %5
    psubb               m%2, m5, m%2
%endif
    pmulhw               m2, m3
    paddw                m1, m2
    pmulhrsw            m%1, m7          ; (x + 8) >> 4
    pmulhrsw             m1, m7
    packuswb            m%1, m1
%endmacro
4537
;-----------------------------------------------------------------------
; w_mask_420: blend tmp1/tmp2 with a mask derived from their difference
; (W_MASK macro) and also emit that mask downsampled 2x2 to 4:2:0.
; The 2x2 reduction is done with vpdpbusd: four mask bytes are gathered
; per dword lane (wm_420_perm* permutations), dotted against pb_m64, and
; added to m8 = (258 - sign) << 6, giving (sum + 2 - sign) >> 2 after the
; wm_420_mask byte-select. r7m is the sign argument.
;-----------------------------------------------------------------------
cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
    lea                  r7, [w_mask_420_avx512icl_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    movifnidn            hd, hm
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    vpbroadcastd         m9, [base+pb_m64]             ; -1 << 6
    mova               ym10, [base+wm_420_mask+32]
    vpbroadcastd         m8, [base+wm_sign+r6*8] ; (258 - sign) << 6
    add                  wq, r7
    mov               maskq, maskmp
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    mova                 m5, [wm_420_perm4]
    cmp                  hd, 8
    jg .w4_h16
    WRAP_YMM W_MASK       0, 4, 0, 1
    vinserti128         ym5, [wm_420_perm4+32], 1
    vpermb              ym4, ym5, ym4
    vpdpbusd            ym8, ym4, ym9
    vextracti32x4       xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    jl .w4_end                           ; flags from "cmp hd, 8": h < 8 -> only 4 rows
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.w4_end:
    vpermb              ym8, ym10, ym8   ; select the high (shifted) mask bytes
    movq            [maskq], xm8
    RET
.w4_h16:
    ; one zmm covers w4/h16; scatter dwords row by row
    vpbroadcastd        m11, strided
    pmulld              m11, [bidir_sctr_w4]
    W_MASK                0, 4, 0, 1
    vpermb               m4, m5, m4
    vpdpbusd             m8, m4, m9
    kxnorw               k1, k1, k1
    vpermb               m8, m10, m8
    mova            [maskq], xm8
    vpscatterdd [dstq+m11]{k1}, m0
    RET
.w8:
    mova                 m5, [wm_420_perm8]
    cmp                  hd, 4
    jne .w8_h8
    WRAP_YMM W_MASK       0, 4, 0, 1
    vinserti128         ym5, [wm_420_perm8+32], 1
    vpermb              ym4, ym5, ym4
    vpdpbusd            ym8, ym4, ym9
    vpermb               m8, m10, m8
    mova            [maskq], xm8
    vextracti32x4       xm1, ym0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    RET
.w8_loop:
    add               tmp1q, 128
    add               tmp2q, 128
    add               maskq, 16
    lea                dstq, [dstq+strideq*4]
.w8_h8:
    ; 8 rows per iteration; keep m8 (the bias) intact by accumulating
    ; into a copy
    W_MASK                0, 4, 0, 1
    vpermb               m4, m5, m4
    mova                 m1, m8
    vpdpbusd             m1, m4, m9
    vpermb               m1, m10, m1
    mova            [maskq], xm1
    vextracti32x4       xm1, ym0, 1
    vextracti32x4       xm2, m0, 2
    vextracti32x4       xm3, m0, 3
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movq   [dstq+strideq*2], xm2
    movq   [dstq+stride3q ], xm3
    lea                dstq, [dstq+strideq*4]
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    sub                  hd, 8
    jg .w8_loop
    RET
.w16:
    mova                 m5, [wm_420_perm16]
.w16_loop:
    W_MASK                0, 4, 0, 1
    vpermb               m4, m5, m4
    mova                 m1, m8
    vpdpbusd             m1, m4, m9
    add               tmp1q, 128
    add               tmp2q, 128
    vpermb               m1, m10, m1
    vpermq               m0, m0, q3120    ; undo packuswb interleave
    mova            [maskq], xm1
    add               maskq, 16
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16_loop
    RET
.w32:
    pmovzxbq             m5, [pb_02461357]
.w32_loop:
    ; W_MASK's vpshldw already pairs the 2x1 mask bytes per word, so no
    ; extra byte permute is needed before the dot product here
    W_MASK                0, 4, 0, 1
    mova                 m1, m8
    vpdpbusd             m1, m4, m9
    add               tmp1q, 128
    add               tmp2q, 128
    vpermb               m1, m10, m1
    vpermq               m0, m5, m0
    mova            [maskq], xm1
    add               maskq, 16
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
.w64:
    pmovzxbq            m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
    psrlq               m13, m12, 4          ; 1, 3, 5, 7, 9, 11, 13, 15
.w64_loop:
    ; two vectors per row pair; vpermt2b merges both partial mask sums
    W_MASK                0, 4, 0, 2
    W_MASK               11, 5, 1, 3
    mova                 m2, m8
    vpdpbusd             m2, m4, m9
    mova                 m3, m8
    vpdpbusd             m3, m5, m9
    add               tmp1q, 256
    add               tmp2q, 256
    vpermt2b             m2, m10, m3
    mova                 m1, m0
    vpermt2q             m0, m12, m11     ; even qwords -> row 0
    vpermt2q             m1, m13, m11     ; odd  qwords -> row 1
    mova            [maskq], ym2
    add               maskq, 32
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w64_loop
    RET
.w128:
    pmovzxbq            m14, [wm_420_perm64]
    mova                m10, [wm_420_mask]   ; full-width byte select for the 64-byte mask store
    psrlq               m15, m14, 4
.w128_loop:
    ; first 64 columns: compute pixels now, keep mask sums (m4/m5) pending
    W_MASK                0, 12, 0, 4
    W_MASK               11, 13, 1, 5
    mova                 m4, m8
    vpdpbusd             m4, m12, m9
    mova                 m5, m8
    vpdpbusd             m5, m13, m9
    mova                 m1, m0
    vpermt2q             m0, m14, m11
    vpermt2q             m1, m15, m11
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*1+64*0], m1
    ; second 64 columns: rotate the pending sums into the high word so
    ; both halves share one dword accumulator per lane
    W_MASK                0, 12, 2, 6
    W_MASK               11, 13, 3, 7
    vprold               m4, 16
    vprold               m5, 16
    vpdpbusd             m4, m12, m9
    vpdpbusd             m5, m13, m9
    add               tmp1q, 512
    add               tmp2q, 512
    vpermt2b             m4, m10, m5
    mova                 m1, m0
    vpermt2q             m0, m14, m11
    vpermt2q             m1, m15, m11
    mova            [maskq], m4
    add               maskq, 64
    mova [dstq+strideq*0+64*1], m0
    mova [dstq+strideq*1+64*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w128_loop
    RET
4730
;-----------------------------------------------------------------------
; void w_mask_422_8bpc(pixel *dst, ptrdiff_t stride,
;                      const int16_t *tmp1, const int16_t *tmp2,
;                      int w, int h, uint8_t *mask, int sign)
; Bidirectional weighted blend of two prep buffers into dst, emitting a
; 4:2:2 blend mask: one mask byte per horizontal pair of pixels.
; NOTE(review): the per-pixel blend math and the per-pixel mask values
; in m4 are produced by the W_MASK macro defined earlier in this file —
; confirm details there. The horizontal 2:1 mask subsampling is done
; here via vpdpwssd (adjacent 16-bit lanes multiplied by -128 and
; summed into dwords), offset by a sign-dependent rounding constant.
;-----------------------------------------------------------------------
4731cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
4732%define base r7-w_mask_422_avx512icl_table
4733    lea                  r7, [w_mask_422_avx512icl_table]
4734    tzcnt                wd, wm
4735    mov                 r6d, r7m ; sign
4736    movifnidn            hd, hm
4737    movsxd               wq, dword [r7+wq*4]
4738    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
4739    vpbroadcastd         m7, [base+pw_2048]
4740    vpbroadcastd         m9, [base+pw_m128]
4741    mova                m10, [base+wm_422_mask]
4742    vpbroadcastd        m11, [base+pb_127]  ; clears the top bit of each mask byte
4743    add                  wq, r7
4744    vpbroadcastd         m8, [base+wm_sign+4+r6*4] ; rounding bias selected by 'sign'
4745    mov               maskq, maskmp
4746    lea            stride3q, [strideq*3]
4747    jmp                  wq                 ; dispatch on log2(w) via jump table
; 4x4/4x8: single (WRAP_YMM) 256-bit pass covers the whole block
4748.w4:
4749    cmp                  hd, 8
4750    jg .w4_h16
4751    WRAP_YMM W_MASK       0, 4, 0, 1
4752    movhps             xm10, [wm_422_mask+16]
4753    vpdpwssd            ym8, ym4, ym9       ; pair adjacent mask words (x -128) -> dwords
4754    vpermb              ym8, ym10, ym8      ; gather subsampled mask bytes
4755    vextracti32x4       xm1, m0, 1
4756    movd   [dstq+strideq*0], xm0
4757    pextrd [dstq+strideq*1], xm0, 1
4758    movd   [dstq+strideq*2], xm1
4759    pextrd [dstq+stride3q ], xm1, 1
4760    jl .w4_end                              ; h == 4: skip second group of rows
4761    lea                dstq, [dstq+strideq*4]
4762    pextrd [dstq+strideq*0], xm0, 2
4763    pextrd [dstq+strideq*1], xm0, 3
4764    pextrd [dstq+strideq*2], xm1, 2
4765    pextrd [dstq+stride3q ], xm1, 3
4766.w4_end:
4767    pand                xm8, xm11
4768    mova            [maskq], xm8
4769    RET
; 4x16: one full 512-bit pass, rows scattered to dst via per-dword offsets
4770.w4_h16:
4771    vpbroadcastd         m5, strided
4772    pmulld               m5, [bidir_sctr_w4] ; stride * scatter pattern = row offsets
4773    W_MASK                0, 4, 0, 1
4774    vpdpwssd             m8, m4, m9
4775    kxnorw               k1, k1, k1          ; all-ones write mask for the scatter
4776    vpermb               m8, m10, m8
4777    pand                ym8, ym11
4778    mova            [maskq], ym8
4779    vpscatterdd [dstq+m5]{k1}, m0
4780    RET
; 8x4: special-cased single ymm pass
4781.w8:
4782    cmp                  hd, 4
4783    jne .w8_h8
4784    WRAP_YMM W_MASK       0, 4, 0, 1
4785    movhps             xm10, [wm_422_mask+16]
4786    vpdpwssd            ym8, ym4, ym9
4787    vpermb              ym8, ym10, ym8
4788    pand                xm8, xm11
4789    mova            [maskq], xm8
4790    vextracti32x4       xm1, ym0, 1
4791    movq   [dstq+strideq*0], xm0
4792    movq   [dstq+strideq*1], xm1
4793    movhps [dstq+strideq*2], xm0
4794    movhps [dstq+stride3q ], xm1
4795    RET
; w8, h >= 8: 8 rows (64 pixels) per iteration
4796.w8_loop:
4797    add               tmp1q, 128
4798    add               tmp2q, 128
4799    add               maskq, 32
4800    lea                dstq, [dstq+strideq*4]
4801.w8_h8:
4802    W_MASK                0, 4, 0, 1
4803    mova                 m1, m8              ; start from the sign bias ...
4804    vpdpwssd             m1, m4, m9          ; ... accumulate paired mask words
4805    vpermb               m1, m10, m1
4806    pand                ym1, ym11
4807    mova            [maskq], ym1
4808    vextracti32x4       xm1, ym0, 1
4809    vextracti32x4       xm2, m0, 2
4810    vextracti32x4       xm3, m0, 3
4811    movq   [dstq+strideq*0], xm0
4812    movq   [dstq+strideq*1], xm1
4813    movq   [dstq+strideq*2], xm2
4814    movq   [dstq+stride3q ], xm3
4815    lea                dstq, [dstq+strideq*4]
4816    movhps [dstq+strideq*0], xm0
4817    movhps [dstq+strideq*1], xm1
4818    movhps [dstq+strideq*2], xm2
4819    movhps [dstq+stride3q ], xm3
4820    sub                  hd, 8
4821    jg .w8_loop
4822    RET
; w16: 4 rows per iteration
4823.w16_loop:
4824    add               tmp1q, 128
4825    add               tmp2q, 128
4826    add               maskq, 32
4827    lea                dstq, [dstq+strideq*4]
4828.w16:
4829    W_MASK                0, 4, 0, 1
4830    mova                 m1, m8
4831    vpdpwssd             m1, m4, m9
4832    vpermb               m1, m10, m1
4833    vpermq               m0, m0, q3120       ; put rows into store order
4834    pand                ym1, ym11
4835    mova            [maskq], ym1
4836    mova          [dstq+strideq*0], xm0
4837    vextracti32x4 [dstq+strideq*1], m0, 2
4838    vextracti32x4 [dstq+strideq*2], ym0, 1
4839    vextracti32x4 [dstq+stride3q ], m0, 3
4840    sub                  hd, 4
4841    jg .w16_loop
4842    RET
; w32: 2 rows per iteration
4843.w32:
4844    pmovzxbq             m5, [pb_02461357]   ; qword permute: de-interleave rows
4845.w32_loop:
4846    W_MASK                0, 4, 0, 1
4847    mova                 m1, m8
4848    vpdpwssd             m1, m4, m9
4849    add               tmp1q, 128
4850    add               tmp2q, 128
4851    vpermb               m1, m10, m1
4852    vpermq               m0, m5, m0
4853    pand                ym1, ym11
4854    mova            [maskq], ym1
4855    add               maskq, 32
4856    mova          [dstq+strideq*0], ym0
4857    vextracti32x8 [dstq+strideq*1], m0, 1
4858    lea                dstq, [dstq+strideq*2]
4859    sub                  hd, 2
4860    jg .w32_loop
4861    RET
; w64: 1 row (one full zmm) per iteration
4862.w64:
4863    pmovzxbq             m5, [pb_02461357]
4864.w64_loop:
4865    W_MASK                0, 4, 0, 1
4866    mova                 m1, m8
4867    vpdpwssd             m1, m4, m9
4868    add               tmp1q, 128
4869    add               tmp2q, 128
4870    vpermb               m1, m10, m1
4871    vpermq               m0, m5, m0
4872    pand                ym1, ym11
4873    mova            [maskq], ym1
4874    add               maskq, 32
4875    mova             [dstq], m0
4876    add                dstq, strideq
4877    dec                  hd
4878    jg .w64_loop
4879    RET
; w128: 1 row = two zmm halves per iteration; masks merged with vpermt2b
4880.w128:
4881    pmovzxbq            m13, [pb_02461357]
4882.w128_loop:
4883    W_MASK                0, 4, 0, 1
4884    W_MASK               12, 5, 2, 3
4885    mova                 m2, m8
4886    vpdpwssd             m2, m4, m9
4887    mova                 m3, m8
4888    vpdpwssd             m3, m5, m9
4889    add               tmp1q, 256
4890    add               tmp2q, 256
4891    vpermt2b             m2, m10, m3         ; combine both mask halves into one zmm
4892    vpermq               m0, m13, m0
4893    vpermq               m1, m13, m12
4894    pand                 m2, m11
4895    mova            [maskq], m2
4896    add               maskq, 64
4897    mova        [dstq+64*0], m0
4898    mova        [dstq+64*1], m1
4899    add                dstq, strideq
4900    dec                  hd
4901    jg .w128_loop
4902    RET
4903
;-----------------------------------------------------------------------
; void w_mask_444_8bpc(pixel *dst, ptrdiff_t stride,
;                      const int16_t *tmp1, const int16_t *tmp2,
;                      int w, int h, uint8_t *mask)
; Bidirectional weighted blend of two prep buffers into dst, emitting a
; full-resolution (4:4:4) blend mask: one mask byte per pixel.
; NOTE(review): the blend math and the raw mask values in m4 come from
; the W_MASK macro (extra trailing argument = 1 selects the 444 variant
; presumably; confirm against the macro definition earlier in the file).
; Here the mask bytes only need reordering (vpermb with wm_444_mask)
; before being stored — no horizontal subsampling is performed.
;-----------------------------------------------------------------------
4904cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
4905%define base r7-w_mask_444_avx512icl_table
4906    lea                  r7, [w_mask_444_avx512icl_table]
4907    tzcnt                wd, wm
4908    movifnidn            hd, hm
4909    movsxd               wq, dword [r7+wq*4]
4910    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
4911    vpbroadcastd         m5, [base+pb_64]
4912    vpbroadcastd         m7, [base+pw_2048]
4913    mova                 m8, [base+wm_444_mask] ; byte-permute for mask output order
4914    add                  wq, r7
4915    mov               maskq, maskmp
4916    lea            stride3q, [strideq*3]
4917    jmp                  wq                 ; dispatch on log2(w) via jump table
; 4x4/4x8: single (WRAP_YMM) 256-bit pass covers the whole block
4918.w4:
4919    cmp                  hd, 8
4920    jg .w4_h16
4921    WRAP_YMM W_MASK       0, 4, 0, 1, 1
4922    vinserti128         ym8, [wm_444_mask+32], 1
4923    vpermb              ym4, ym8, ym4       ; reorder mask bytes for storage
4924    mova            [maskq], ym4
4925    vextracti32x4      xm1, m0, 1
4926    movd   [dstq+strideq*0], xm0
4927    pextrd [dstq+strideq*1], xm0, 1
4928    movd   [dstq+strideq*2], xm1
4929    pextrd [dstq+stride3q ], xm1, 1
4930    jl .w4_end                              ; h == 4: skip second group of rows
4931    lea                dstq, [dstq+strideq*4]
4932    pextrd [dstq+strideq*0], xm0, 2
4933    pextrd [dstq+strideq*1], xm0, 3
4934    pextrd [dstq+strideq*2], xm1, 2
4935    pextrd [dstq+stride3q ], xm1, 3
4936.w4_end:
4937    RET
; 4x16: one full 512-bit pass, rows scattered to dst via per-dword offsets
4938.w4_h16:
4939    vpbroadcastd         m9, strided
4940    pmulld               m9, [bidir_sctr_w4] ; stride * scatter pattern = row offsets
4941    W_MASK                0, 4, 0, 1, 1
4942    vpermb               m4, m8, m4
4943    kxnorw               k1, k1, k1          ; all-ones write mask for the scatter
4944    mova            [maskq], m4
4945    vpscatterdd [dstq+m9]{k1}, m0
4946    RET
; 8x4: special-cased single ymm pass
4947.w8:
4948    cmp                  hd, 4
4949    jne .w8_h8
4950    WRAP_YMM W_MASK       0, 4, 0, 1, 1
4951    vinserti128         ym8, [wm_444_mask+32], 1
4952    vpermb              ym4, ym8, ym4
4953    mova            [maskq], ym4
4954    vextracti32x4       xm1, ym0, 1
4955    movq   [dstq+strideq*0], xm0
4956    movq   [dstq+strideq*1], xm1
4957    movhps [dstq+strideq*2], xm0
4958    movhps [dstq+stride3q ], xm1
4959    RET
; w8, h >= 8: 8 rows (64 pixels + 64 mask bytes) per iteration
4960.w8_loop:
4961    add               tmp1q, 128
4962    add               tmp2q, 128
4963    add               maskq, 64
4964    lea                dstq, [dstq+strideq*4]
4965.w8_h8:
4966    W_MASK                0, 4, 0, 1, 1
4967    vpermb               m4, m8, m4
4968    mova            [maskq], m4
4969    vextracti32x4       xm1, ym0, 1
4970    vextracti32x4       xm2, m0, 2
4971    vextracti32x4       xm3, m0, 3
4972    movq   [dstq+strideq*0], xm0
4973    movq   [dstq+strideq*1], xm1
4974    movq   [dstq+strideq*2], xm2
4975    movq   [dstq+stride3q ], xm3
4976    lea                dstq, [dstq+strideq*4]
4977    movhps [dstq+strideq*0], xm0
4978    movhps [dstq+strideq*1], xm1
4979    movhps [dstq+strideq*2], xm2
4980    movhps [dstq+stride3q ], xm3
4981    sub                  hd, 8
4982    jg .w8_loop
4983    RET
; w16: 4 rows per iteration
4984.w16_loop:
4985    add               tmp1q, 128
4986    add               tmp2q, 128
4987    add               maskq, 64
4988    lea                dstq, [dstq+strideq*4]
4989.w16:
4990    W_MASK                0, 4, 0, 1, 1
4991    vpermb               m4, m8, m4
4992    vpermq               m0, m0, q3120       ; put rows into store order
4993    mova            [maskq], m4
4994    mova          [dstq+strideq*0], xm0
4995    vextracti32x4 [dstq+strideq*1], m0, 2
4996    vextracti32x4 [dstq+strideq*2], ym0, 1
4997    vextracti32x4 [dstq+stride3q ], m0, 3
4998    sub                  hd, 4
4999    jg .w16_loop
5000    RET
; w32: 2 rows per iteration
5001.w32:
5002    pmovzxbq             m9, [pb_02461357]   ; qword permute: de-interleave rows
5003.w32_loop:
5004    W_MASK                0, 4, 0, 1, 1
5005    vpermb               m4, m8, m4
5006    add               tmp1q, 128
5007    add               tmp2q, 128
5008    vpermq               m0, m9, m0
5009    mova            [maskq], m4
5010    add               maskq, 64
5011    mova          [dstq+strideq*0], ym0
5012    vextracti32x8 [dstq+strideq*1], m0, 1
5013    lea                dstq, [dstq+strideq*2]
5014    sub                  hd, 2
5015    jg .w32_loop
5016    RET
; w64: 1 row (one full zmm) per iteration
5017.w64:
5018    pmovzxbq             m9, [pb_02461357]
5019.w64_loop:
5020    W_MASK                0, 4, 0, 1, 1
5021    vpermb               m4, m8, m4
5022    add               tmp1q, 128
5023    add               tmp2q, 128
5024    vpermq               m0, m9, m0
5025    mova            [maskq], m4
5026    add               maskq, 64
5027    mova             [dstq], m0
5028    add                dstq, strideq
5029    dec                  hd
5030    jg .w64_loop
5031    RET
; w128: 1 row = two zmm halves per iteration; 128 mask bytes per row
5032.w128:
5033    pmovzxbq            m11, [pb_02461357]
5034.w128_loop:
5035    W_MASK                0, 4, 0, 1, 1
5036    W_MASK               10, 9, 2, 3, 1
5037    vpermb               m4, m8, m4
5038    vpermb               m9, m8, m9
5039    add               tmp1q, 256
5040    add               tmp2q, 256
5041    vpermq               m0, m11, m0
5042    vpermq              m10, m11, m10
5043    mova       [maskq+64*0], m4
5044    mova       [maskq+64*1], m9
5045    add               maskq, 128
5046    mova        [dstq+64*0], m0
5047    mova        [dstq+64*1], m10
5048    add                dstq, strideq
5049    dec                  hd
5050    jg .w128_loop
5051    RET
5052
;-----------------------------------------------------------------------
; void blend_8bpc(pixel *dst, ptrdiff_t ds, const pixel *tmp,
;                 int w, int h, const uint8_t *mask)
; Masked blend of tmp into dst with a per-pixel 0..64 mask:
;   dst = (dst*(64-m) + tmp*m + 32) >> 6
; The weighted sum is formed with pmaddubsw on interleaved
; (dst,tmp)/(64-m,m) byte pairs; pmulhrsw by 512 computes
; (x*512 + 0x4000) >> 15 == (x + 32) >> 6, i.e. the rounded shift.
; tmpq is rebased to (tmp - mask) so a single maskq pointer indexes both
; buffers ([maskq] and [maskq+tmpq]) as they advance in lockstep.
;-----------------------------------------------------------------------
5053cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
5054%define base r6-blend_avx512icl_table
5055    lea                  r6, [blend_avx512icl_table]
5056    tzcnt                wd, wm
5057    movifnidn         maskq, maskmp
5058    movifnidn            hd, hm
5059    movsxd               wq, [r6+wq*4]
5060    vpbroadcastd         m6, [base+pb_64]
5061    vpbroadcastd         m7, [base+pw_512]
5062    sub                tmpq, maskq            ; tmpq = tmp - mask (see header)
5063    add                  wq, r6
5064    lea                  r6, [dsq*3]          ; r6 = 3*stride for 4-row addressing
5065    jmp                  wq                   ; dispatch on log2(w) via jump table
; w4: 4 rows (16 pixels) per iteration, xmm only
5066.w4:
5067    movd               xmm0, [dstq+dsq*0]
5068    pinsrd             xmm0, [dstq+dsq*1], 1
5069    vpbroadcastd       xmm1, [dstq+dsq*2]
5070    pinsrd             xmm1, [dstq+r6   ], 3
5071    mova               xmm4, [maskq]
5072    mova               xmm5, [maskq+tmpq]    ; = original tmp pointer
5073    add               maskq, 4*4
5074    psubb              xmm3, xm6, xmm4       ; 64 - m
5075    punpcklbw          xmm0, xmm5            ; interleave dst,tmp bytes
5076    punpcklbw          xmm2, xmm3, xmm4      ; interleave (64-m),m weights
5077    punpckhbw          xmm1, xmm5
5078    punpckhbw          xmm3, xmm4
5079    pmaddubsw          xmm0, xmm2            ; dst*(64-m) + tmp*m
5080    pmaddubsw          xmm1, xmm3
5081    pmulhrsw           xmm0, xm7             ; (x+32)>>6
5082    pmulhrsw           xmm1, xm7
5083    packuswb           xmm0, xmm1
5084    movd       [dstq+dsq*0], xmm0
5085    pextrd     [dstq+dsq*1], xmm0, 1
5086    pextrd     [dstq+dsq*2], xmm0, 2
5087    pextrd     [dstq+r6   ], xmm0, 3
5088    lea                dstq, [dstq+dsq*4]
5089    sub                  hd, 4
5090    jg .w4
5091    RET
; w8: 4 rows (32 pixels) per iteration, ymm (legacy-width regs, hence vzeroupper)
5092.w8:
5093    movq               xmm0, [dstq+dsq*0]
5094    vpbroadcastq       xmm1, [dstq+dsq*1]
5095    vpbroadcastq       ymm2, [dstq+dsq*2]
5096    vpbroadcastq       ymm3, [dstq+r6   ]
5097    mova               ymm4, [maskq]
5098    mova               ymm5, [maskq+tmpq]
5099    add               maskq, 8*4
5100    vpblendd           ymm0, ymm2, 0x30      ; rows 0+2 in one ymm
5101    vpblendd           ymm1, ymm3, 0xc0      ; rows 1+3 in the other
5102    psubb              ymm3, ym6, ymm4       ; 64 - m
5103    punpcklbw          ymm0, ymm5
5104    punpcklbw          ymm2, ymm3, ymm4
5105    punpckhbw          ymm1, ymm5
5106    punpckhbw          ymm3, ymm4
5107    pmaddubsw          ymm0, ymm2
5108    pmaddubsw          ymm1, ymm3
5109    pmulhrsw           ymm0, ym7
5110    pmulhrsw           ymm1, ym7
5111    packuswb           ymm0, ymm1
5112    vextracti128       xmm1, ymm0, 1
5113    movq       [dstq+dsq*0], xmm0
5114    movhps     [dstq+dsq*1], xmm0
5115    movq       [dstq+dsq*2], xmm1
5116    movhps     [dstq+r6   ], xmm1
5117    lea                dstq, [dstq+dsq*4]
5118    sub                  hd, 4
5119    jg .w8
5120    vzeroupper                               ; leaving legacy-ymm code
5121    RET
; w16: 4 rows (64 pixels = one zmm) per iteration
5122.w16:
5123    mova                xm1, [dstq+dsq*0]
5124    vinserti32x4        ym1, [dstq+dsq*1], 1
5125    vinserti32x4         m1, [dstq+dsq*2], 2
5126    mova                 m4, [maskq]
5127    vinserti32x4         m1, [dstq+r6   ], 3
5128    mova                 m5, [maskq+tmpq]
5129    add               maskq, 16*4
5130    psubb                m3, m6, m4          ; 64 - m
5131    punpcklbw            m0, m1, m5
5132    punpcklbw            m2, m3, m4
5133    punpckhbw            m1, m5
5134    punpckhbw            m3, m4
5135    pmaddubsw            m0, m2
5136    pmaddubsw            m1, m3
5137    pmulhrsw             m0, m7
5138    pmulhrsw             m1, m7
5139    packuswb             m0, m1
5140    mova          [dstq+dsq*0], xm0
5141    vextracti32x4 [dstq+dsq*1], ym0, 1
5142    vextracti32x4 [dstq+dsq*2], m0, 2
5143    vextracti32x4 [dstq+r6   ], m0, 3
5144    lea                dstq, [dstq+dsq*4]
5145    sub                  hd, 4
5146    jg .w16
5147    RET
; w32: 2 rows (64 pixels = one zmm) per iteration
5148.w32:
5149    mova                ym1, [dstq+dsq*0]
5150    vinserti32x8         m1, [dstq+dsq*1], 1
5151    mova                 m4, [maskq]
5152    mova                 m5, [maskq+tmpq]
5153    add               maskq, 32*2
5154    psubb                m3, m6, m4          ; 64 - m
5155    punpcklbw            m0, m1, m5
5156    punpcklbw            m2, m3, m4
5157    punpckhbw            m1, m5
5158    punpckhbw            m3, m4
5159    pmaddubsw            m0, m2
5160    pmaddubsw            m1, m3
5161    pmulhrsw             m0, m7
5162    pmulhrsw             m1, m7
5163    packuswb             m0, m1
5164    mova          [dstq+dsq*0], ym0
5165    vextracti32x8 [dstq+dsq*1], m0, 1
5166    lea                dstq, [dstq+dsq*2]
5167    sub                  hd, 2
5168    jg .w32
5169    RET
5170
;-----------------------------------------------------------------------
; void blend_v_8bpc(pixel *dst, ptrdiff_t ds, const pixel *tmp,
;                   int w, int h)
; OBMC vertical-edge blend: the mask comes from the fixed obmc_masks
; table (selected by width) and is constant per column, identical for
; every row. The weights are preloaded as interleaved (64-m,m) byte
; pairs straight from the table, so each row only needs
; punpck + pmaddubsw + pmulhrsw(512) (= rounded >>6, see blend_8bpc).
;-----------------------------------------------------------------------
5171cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
5172%define base r5-blend_v_avx512icl_table
5173    lea                  r5, [blend_v_avx512icl_table]
5174    tzcnt                wd, wm
5175    movifnidn            hd, hm
5176    movsxd               wq, [r5+wq*4]
5177    vpbroadcastd         m5, [base+pw_512]
5178    add                  wq, r5
5179    add               maskq, obmc_masks-blend_v_avx512icl_table ; maskq -> obmc_masks
5180    jmp                  wq                   ; dispatch on log2(w)
; w2: 2 rows per iteration; one broadcast weight dword serves all rows
5181.w2:
5182    vpbroadcastd       xmm2, [maskq+2*2]
5183.w2_s0_loop:
5184    movd               xmm0, [dstq+dsq*0]
5185    pinsrw             xmm0, [dstq+dsq*1], 1
5186    movd               xmm1, [tmpq]
5187    add                tmpq, 2*2
5188    punpcklbw          xmm0, xmm1             ; interleave dst,tmp bytes
5189    pmaddubsw          xmm0, xmm2             ; dst*(64-m) + tmp*m
5190    pmulhrsw           xmm0, xm5              ; (x+32)>>6
5191    packuswb           xmm0, xmm0
5192    pextrw     [dstq+dsq*0], xmm0, 0
5193    pextrw     [dstq+dsq*1], xmm0, 1
5194    lea                dstq, [dstq+dsq*2]
5195    sub                  hd, 2
5196    jg .w2_s0_loop
5197    RET
; w4: 2 rows per iteration
5198.w4:
5199    vpbroadcastq       xmm2, [maskq+4*2]
5200.w4_loop:
5201    movd               xmm0, [dstq+dsq*0]
5202    pinsrd             xmm0, [dstq+dsq*1], 1
5203    movq               xmm1, [tmpq]
5204    add                tmpq, 4*2
5205    punpcklbw          xmm0, xmm1
5206    pmaddubsw          xmm0, xmm2
5207    pmulhrsw           xmm0, xm5
5208    packuswb           xmm0, xmm0
5209    movd       [dstq+dsq*0], xmm0
5210    pextrd     [dstq+dsq*1], xmm0, 1
5211    lea                dstq, [dstq+dsq*2]
5212    sub                  hd, 2
5213    jg .w4_loop
5214    RET
; w8: 2 rows per iteration
5215.w8:
5216    mova               xmm3, [maskq+8*2]
5217.w8_loop:
5218    movq               xmm0, [dstq+dsq*0]
5219    vpbroadcastq       xmm1, [dstq+dsq*1]
5220    mova               xmm2, [tmpq]
5221    add                tmpq, 8*2
5222    punpcklbw          xmm0, xmm2
5223    punpckhbw          xmm1, xmm2
5224    pmaddubsw          xmm0, xmm3
5225    pmaddubsw          xmm1, xmm3
5226    pmulhrsw           xmm0, xm5
5227    pmulhrsw           xmm1, xm5
5228    packuswb           xmm0, xmm1
5229    movq       [dstq+dsq*0], xmm0
5230    movhps     [dstq+dsq*1], xmm0
5231    lea                dstq, [dstq+dsq*2]
5232    sub                  hd, 2
5233    jg .w8_loop
5234    RET
; w16: 2 rows per iteration; weights split into low/high unpack halves
5235.w16:
5236    vbroadcasti32x4     ym3, [maskq+16*2]
5237    vbroadcasti32x4     ym4, [maskq+16*3]
5238.w16_loop:
5239    mova                xm1, [dstq+dsq*0]
5240    vinserti32x4        ym1, [dstq+dsq*1], 1
5241    mova                ym2, [tmpq]
5242    add                tmpq, 16*2
5243    punpcklbw           ym0, ym1, ym2
5244    punpckhbw           ym1, ym2
5245    pmaddubsw           ym0, ym3
5246    pmaddubsw           ym1, ym4
5247    pmulhrsw            ym0, ym5
5248    pmulhrsw            ym1, ym5
5249    packuswb            ym0, ym1
5250    mova          [dstq+dsq*0], xm0
5251    vextracti32x4 [dstq+dsq*1], m0, 1
5252    lea                dstq, [dstq+dsq*2]
5253    sub                  hd, 2
5254    jg .w16_loop
5255    RET
; w32: 2 rows per iteration; shuffle weights to match punpck lane order
5256.w32:
5257    mova                 m4, [maskq+32*2]
5258    vshufi32x4           m3, m4, m4, q2020    ; low-half weights
5259    vshufi32x4           m4, m4, q3131        ; high-half weights
5260.w32_loop:
5261    mova                ym1, [dstq+dsq*0]
5262    vinserti32x8         m1, [dstq+dsq*1], 1
5263    mova                 m2, [tmpq]
5264    add                tmpq, 32*2
5265    punpcklbw            m0, m1, m2
5266    punpckhbw            m1, m2
5267    pmaddubsw            m0, m3
5268    pmaddubsw            m1, m4
5269    pmulhrsw             m0, m5
5270    pmulhrsw             m1, m5
5271    packuswb             m0, m1
5272    mova          [dstq+dsq*0], ym0
5273    vextracti32x8 [dstq+dsq*1], m0, 1
5274    lea                dstq, [dstq+dsq*2]
5275    sub                  hd, 2
5276    jg .w32_loop
5277    RET
5278
;-----------------------------------------------------------------------
; void blend_h_8bpc(pixel *dst, ptrdiff_t ds, const pixel *tmp,
;                   int w, int h)
; OBMC horizontal-edge blend: the mask weight comes from obmc_masks and
; varies per ROW (constant across a row). Only the top h*3/4 rows are
; blended; h is negated and counted up to 0 so the loop tests are plain
; adds. Each weight word is an interleaved (64-m,m) byte pair consumed
; by pmaddubsw; pmulhrsw(·, 512) is the rounded >>6 (see blend_8bpc).
;-----------------------------------------------------------------------
5279cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
5280%define base r6-blend_h_avx512icl_table
5281    lea                  r6, [blend_h_avx512icl_table]
5282    tzcnt                wd, wm
5283    mov                  hd, hm
5284    movsxd               wq, [r6+wq*4]
5285    lea               maskq, [base+obmc_masks+hq*2] ; mask row for this height
5286    vpbroadcastd         m5, [base+pw_512]
5287    lea                  hd, [hq*3]
5288    add                  wq, r6
5289    shr                  hd, 2 ; h * 3/4
5290    lea               maskq, [maskq+hq*2]
5291    neg                  hq                   ; count h from -n up to 0
5292    jmp                  wq                   ; dispatch on log2(w)
; w2: 2 rows per iteration; per-row weight word duplicated across the pair
5293.w2:
5294    movd               xmm0, [dstq+dsq*0]
5295    pinsrw             xmm0, [dstq+dsq*1], 1
5296    movd               xmm2, [maskq+hq*2]    ; weights for these two rows
5297    movd               xmm1, [tmpq]
5298    add                tmpq, 2*2
5299    punpcklwd          xmm2, xmm2            ; duplicate each row weight per pixel
5300    punpcklbw          xmm0, xmm1
5301    pmaddubsw          xmm0, xmm2            ; dst*(64-m) + tmp*m
5302    pmulhrsw           xmm0, xm5             ; (x+32)>>6
5303    packuswb           xmm0, xmm0
5304    pextrw     [dstq+dsq*0], xmm0, 0
5305    pextrw     [dstq+dsq*1], xmm0, 1
5306    lea                dstq, [dstq+dsq*2]
5307    add                  hq, 2
5308    jl .w2
5309    RET
; w4: 2 rows per iteration; blend_shuf spreads the 2 row weights across lanes
5310.w4:
5311    mova               xmm3, [blend_shuf]
5312.w4_loop:
5313    movd               xmm0, [dstq+dsq*0]
5314    pinsrd             xmm0, [dstq+dsq*1], 1
5315    movd               xmm2, [maskq+hq*2]
5316    movq               xmm1, [tmpq]
5317    add                tmpq, 4*2
5318    pshufb             xmm2, xmm3            ; broadcast per-row weights
5319    punpcklbw          xmm0, xmm1
5320    pmaddubsw          xmm0, xmm2
5321    pmulhrsw           xmm0, xm5
5322    packuswb           xmm0, xmm0
5323    movd       [dstq+dsq*0], xmm0
5324    pextrd     [dstq+dsq*1], xmm0, 1
5325    lea                dstq, [dstq+dsq*2]
5326    add                  hq, 2
5327    jl .w4_loop
5328    RET
; w8: 2 rows per iteration in a ymm (legacy regs, hence vzeroupper)
5329.w8:
5330    vbroadcasti128     ymm4, [blend_shuf]
5331    shufpd             ymm4, ymm4, 0x03      ; select row-0/row-1 weight per lane
5332.w8_loop:
5333    vpbroadcastq       ymm1, [dstq+dsq*0]
5334    movq               xmm0, [dstq+dsq*1]
5335    vpblendd           ymm0, ymm1, 0x30
5336    vpbroadcastd       ymm3, [maskq+hq*2]
5337    movq               xmm1, [tmpq+8*1]
5338    vinserti128        ymm1, [tmpq+8*0], 1
5339    add                tmpq, 8*2
5340    pshufb             ymm3, ymm4
5341    punpcklbw          ymm0, ymm1
5342    pmaddubsw          ymm0, ymm3
5343    pmulhrsw           ymm0, ym5
5344    vextracti128       xmm1, ymm0, 1
5345    packuswb           xmm0, xmm1
5346    movhps     [dstq+dsq*0], xmm0
5347    movq       [dstq+dsq*1], xmm0
5348    lea                dstq, [dstq+dsq*2]
5349    add                  hq, 2
5350    jl .w8_loop
5351    vzeroupper                               ; leaving legacy-ymm code
5352    RET
; w16: 2 rows per iteration
5353.w16:
5354    vbroadcasti32x4     ym4, [blend_shuf]
5355    shufpd              ym4, ym4, 0x0c
5356.w16_loop:
5357    mova                xm1, [dstq+dsq*0]
5358    vinserti32x4        ym1, [dstq+dsq*1], 1
5359    vpbroadcastd        ym3, [maskq+hq*2]
5360    mova                ym2, [tmpq]
5361    add                tmpq, 16*2
5362    pshufb              ym3, ym4
5363    punpcklbw           ym0, ym1, ym2
5364    punpckhbw           ym1, ym2
5365    pmaddubsw           ym0, ym3
5366    pmaddubsw           ym1, ym3
5367    pmulhrsw            ym0, ym5
5368    pmulhrsw            ym1, ym5
5369    packuswb            ym0, ym1
5370    mova          [dstq+dsq*0], xm0
5371    vextracti32x4 [dstq+dsq*1], m0, 1
5372    lea                dstq, [dstq+dsq*2]
5373    add                  hq, 2
5374    jl .w16_loop
5375    RET
; w32: 2 rows (one zmm) per iteration
5376.w32:
5377    vbroadcasti32x4      m4, [blend_shuf]
5378    shufpd               m4, m4, 0xf0
5379.w32_loop:
5380    mova                ym1, [dstq+dsq*0]
5381    vinserti32x8         m1, [dstq+dsq*1], 1
5382    vpbroadcastd         m3, [maskq+hq*2]
5383    mova                 m2, [tmpq]
5384    add                tmpq, 32*2
5385    pshufb               m3, m4
5386    punpcklbw            m0, m1, m2
5387    punpckhbw            m1, m2
5388    pmaddubsw            m0, m3
5389    pmaddubsw            m1, m3
5390    pmulhrsw             m0, m5
5391    pmulhrsw             m1, m5
5392    packuswb             m0, m1
5393    mova          [dstq+dsq*0], ym0
5394    vextracti32x8 [dstq+dsq*1], m0, 1
5395    lea                dstq, [dstq+dsq*2]
5396    add                  hq, 2
5397    jl .w32_loop
5398    RET
; w64: 1 row (one zmm) per iteration; single broadcast weight per row
5399.w64:
5400    vpbroadcastw         m3, [maskq+hq*2]
5401    mova                 m1, [dstq]
5402    mova                 m2, [tmpq]
5403    add                tmpq, 32*2
5404    punpcklbw            m0, m1, m2
5405    punpckhbw            m1, m2
5406    pmaddubsw            m0, m3
5407    pmaddubsw            m1, m3
5408    pmulhrsw             m0, m5
5409    pmulhrsw             m1, m5
5410    packuswb             m0, m1
5411    mova             [dstq], m0
5412    add                dstq, dsq
5413    inc                  hq
5414    jl .w64
5415    RET
; w128: 1 row = two zmms per iteration
5416.w128:
5417    vpbroadcastw         m6, [maskq+hq*2]
5418    mova                 m2, [dstq+64*0]
5419    mova                 m1, [tmpq+64*0]
5420    mova                 m3, [dstq+64*1]
5421    mova                 m4, [tmpq+64*1]
5422    add                tmpq, 64*2
5423    punpcklbw            m0, m2, m1
5424    punpckhbw            m2, m1
5425    pmaddubsw            m0, m6
5426    pmaddubsw            m2, m6
5427    punpcklbw            m1, m3, m4
5428    punpckhbw            m3, m4
5429    pmaddubsw            m1, m6
5430    pmaddubsw            m3, m6
5431    REPX   {pmulhrsw x, m5}, m0, m2, m1, m3
5432    packuswb             m0, m2
5433    packuswb             m1, m3
5434    mova        [dstq+64*0], m0
5435    mova        [dstq+64*1], m1
5436    add                dstq, dsq
5437    inc                  hq
5438    jl .w128
5439    RET
5440
;-----------------------------------------------------------------------
; void resize_8bpc(pixel *dst, ptrdiff_t dst_stride,
;                  const pixel *src, ptrdiff_t src_stride,
;                  int dst_w, int h, int src_w, int dx, int mx0)
; Horizontal resize with an 8-tap filter, 16 output pixels per inner
; iteration. mx is a 14.x fixed-point source position advanced by dx
; per output pixel; bits 8..13 of mx select one of 64 phase filters
; from resize_filter. Source positions are clamped to [0, src_w-8];
; when any lane clips, a slow path gathers qwords and uses pshufb with
; edge-replication offsets (resize_shuf/pb_8x0_8x8) to emulate the
; out-of-bounds pixels. The 8-tap dot product is accumulated with two
; vpdpbusd steps; pmulhrsw with pw_m256 scales/rounds the result back
; to pixel range (sign-flipped because the filter words are negated —
; NOTE(review): confirm against resize_filter's table definition).
;-----------------------------------------------------------------------
5441cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
5442                                dst_w, h, src_w, dx, mx0
5443    sub          dword mx0m, 4<<14           ; pre-bias mx so the 8-tap window is centered
5444    sub        dword src_wm, 8               ; clamp limit = src_w - 8 (full window must fit)
5445    mov                  r6, ~0
5446    vpbroadcastd         m5, dxm
5447    vpbroadcastd         m8, mx0m
5448    vpbroadcastd         m6, src_wm
5449    kmovq                k3, r6               ; k3 = all-ones gather/scatter mask template
5450 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
5451    LEA                  r7, $$
5452%define base r7-$$
5453    vpbroadcastd         m3, [base+pw_m256]
5454    vpbroadcastd         m7, [base+pd_63]     ; 6-bit filter-phase mask
5455    vbroadcasti32x4     m15, [base+pb_8x0_8x8]
5456    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
5457    pslld                m5, 4                      ; dx*16
5458    pslld                m6, 14                     ; (src_w-8) in 14-bit fixed point
5459    pxor                 m2, m2                     ; zero (clamp floor + vpdpbusd init)
5460    mova                m16, [base+resize_permA]
5461    mova                m17, [base+resize_permB]
5462    mova               xm18, [base+resize_permC]
5463.loop_y:
5464    xor                  xd, xd
5465    mova                 m4, m8     ; per-line working version of mx
5466.loop_x:
5467    pmaxsd               m0, m4, m2
5468    psrad                m9, m4, 8  ; filter offset (unmasked)
5469    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
5470    psubd                m1, m4, m0 ; pshufb offset
5471    psrad                m0, 14     ; clipped src_x offset
5472    psrad                m1, 14     ; pshufb edge_emu offset
5473    vptestmd             k4, m1, m1 ; any lane needing edge emulation?
5474    pand                 m9, m7     ; filter offset (masked)
5475    ktestw               k4, k4
5476    jz .load                        ; fast path: all lanes fully in-bounds
; slow path: qword gathers + pshufb-based edge replication
5477    vextracti32x8      ym12, m0, 1
5478    vextracti32x8      ym13, m1, 1
5479    kmovq                k1, k3
5480    kmovq                k2, k3
5481    vpgatherdq      m10{k1}, [srcq+ym0]
5482    vpgatherdq      m11{k2}, [srcq+ym12]
5483    kmovq                k1, k3
5484    kmovq                k2, k3
5485    vpgatherdq      m14{k1}, [base+resize_shuf+4+ym1]
5486    vpgatherdq       m0{k2}, [base+resize_shuf+4+ym13]
5487    mova                m12, m16
5488    mova                m13, m17
5489    paddb               m14, m15
5490    paddb                m0, m15
5491    pshufb              m10, m14   ; replicate edge pixels into clipped lanes
5492    pshufb              m11, m0
5493    vpermi2d            m12, m10, m11 ; reorder into the .load fast-path layout
5494    vpermi2d            m13, m10, m11
5495    jmp .filter
.load:
; fast path: dword gathers of the low/high 4 taps per pixel
5497    kmovq                k1, k3
5498    kmovq                k2, k3
5499    vpgatherdd      m12{k1}, [srcq+m0+0]
5500    vpgatherdd      m13{k2}, [srcq+m0+4]
.filter:
; gather per-pixel 8-tap filters (two dwords each) and accumulate
5502    kmovq                k1, k3
5503    kmovq                k2, k3
5504    vpgatherdd      m10{k1}, [base+resize_filter+m9*8+0]
5505    vpgatherdd      m11{k2}, [base+resize_filter+m9*8+4]
5506    mova                m14, m2    ; accumulator = 0
5507    vpdpbusd            m14, m12, m10 ; taps 0-3
5508    vpdpbusd            m14, m13, m11 ; taps 4-7
5509    packssdw            m14, m14
5510    pmulhrsw            m14, m3    ; scale + round (pw_m256)
5511    packuswb            m14, m14
5512    vpermd              m14, m18, m14 ; compact 16 result bytes into xm14
5513    mova          [dstq+xq], xm14
5514    paddd                m4, m5    ; mx += dx*16
5515    add                  xd, 16
5516    cmp                  xd, dst_wd
5517    jl .loop_x
5518    add                dstq, dst_strideq
5519    add                srcq, src_strideq
5520    dec                  hd
5521    jg .loop_y
5522    RET
5523
5524%endif ; ARCH_X86_64
5525