/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>  // SSE4.1

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"

#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"
#include "aom_dsp/x86/blend_mask_sse4.h"

#include "config/aom_dsp_rtcd.h"

//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////

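// A64 masked blend computes, per pixel,
//   dst = ROUND_POWER_OF_TWO(m * src0 +
//                            (AOM_BLEND_A64_MAX_ALPHA - m) * src1,
//                            AOM_BLEND_A64_ROUND_BITS)
// with AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6.
// The _r constant used throughout, 1 << (15 - AOM_BLEND_A64_ROUND_BITS),
// is presumably the multiplier the blend_*_u8 helpers (blend_mask_sse4.h)
// feed to a _mm_mulhrs_epi16-style rounding shift, which for the 16-bit
// intermediate products is equivalent to (x + 32) >> 6.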
static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                     const uint8_t *src0, uint32_t src0_stride,
                                     const uint8_t *src1, uint32_t src1_stride,
                                     const uint8_t *mask, uint32_t mask_stride,
                                     int w, int h) {
  (void)w;
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                     const uint8_t *src0, uint32_t src0_stride,
                                     const uint8_t *src1, uint32_t src1_stride,
                                     const uint8_t *mask, uint32_t mask_stride,
                                     int w, int h) {
  (void)w;
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_m0_b = xx_loadl_64(mask);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_m0_b = xx_loadu_128(mask + c);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////

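// With horizontal sub-sampling the mask is twice as wide as the destination,
// so each output pixel is covered by two adjacent mask bytes. The shuffle
// (g_blend_a64_mask_shuffle, assumed here to gather even-indexed bytes into
// the low half and odd-indexed bytes into the high half) lines the pairs up
// so that one _mm_avg_epu8 yields the rounded average (a + b + 1) >> 1 for
// every output position.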
static void blend_a64_mask_sx_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_r_b = xx_loadl_64(mask);
    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_r_b = xx_loadu_128(mask);
    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
      const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
      const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
      const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
      const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
      const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

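// With vertical sub-sampling the mask has twice as many rows as the
// destination: each output row is the rounded average, (a + b + 1) >> 1 via
// _mm_avg_epu8, of two consecutive mask rows, and the mask pointer advances
// by 2 * mask_stride per output row.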
static void blend_a64_mask_sy_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));

  do {
    const __m128i v_ra_b = xx_loadl_32(mask);
    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sy_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_ra_b = xx_loadu_128(mask + c);
      const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
      const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

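// With sub-sampling in both directions each output pixel covers a 2x2 block
// of mask bytes, combined as (a + b + c + d + 2) >> 2. The two rows are
// first summed in 8 bits (valid mask values are at most 64, so the sum is
// at most 128 and cannot wrap), the shuffle separates even and odd columns,
// _mm_cvtepu8_epi16 widens them, and xx_roundn_epu16(sum, 2) applies the
// final rounded shift.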
static void blend_a64_mask_sx_sy_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  (void)w;

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_sy_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  (void)w;

  do {
    const __m128i v_ra_b = xx_loadu_128(mask);
    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);

    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
      const __m128i v_rvsbl_w =
          _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
      const __m128i v_rvsbh_w =
          _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);

      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
      const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////

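// Dispatch note: w and h are asserted to be powers of two, so
// (h | w) & 3 != 0 exactly when w <= 2 or h <= 2, which falls back to the C
// implementation. Otherwise (w >> 2) & 3 maps w == 4 to index 1, w == 8 to
// index 2, and any multiple of 16 to index 0, matching the layout of the
// blend[] table below.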
void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                               const uint8_t *src0, uint32_t src0_stride,
                               const uint8_t *src1, uint32_t src1_stride,
                               const uint8_t *mask, uint32_t mask_stride, int w,
                               int h, int subw, int subh) {
  typedef void (*blend_fn)(
      uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int w, int h);

  // Dimensions are: width_index X subx X suby
  static const blend_fn blend[3][2][2] = {
    { // w % 16 == 0
      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
    { // w == 4
      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
    { // w == 8
      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
  };

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
                         mask, mask_stride, w, h, subw, subh);
  } else {
    blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
                                              src0_stride, src1, src1_stride,
                                              mask, mask_stride, w, h);
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////

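// The high-bit-depth kernels operate on uint16_t pixels, so the 8-bit mask
// is widened to 16 bits with _mm_cvtepu8_epi16 and the bit-depth-specific
// arithmetic is delegated to a blend_unit_fn (blend_4_b10, blend_4_b12,
// etc. from blend_sse4.h), keeping one shared loop body per width class.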
static inline void blend_a64_mask_bn_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b10);
}

static void blend_a64_mask_b12_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b12);
}

static inline void blend_a64_mask_bn_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_m0_b = xx_loadl_64(mask + c);
      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, w, h,
                               blend_8_b10);
}

static void blend_a64_mask_b12_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, w, h,
                               blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////

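// Horizontal sub-sampling, high bit depth: averaging the mask row with
// itself shifted right by one byte leaves the rounded pair average
// (a + b + 1) >> 1 in the even byte lanes; masking with v_zmask_b (0x00ff
// in every 16-bit lane) then zero-extends those lanes to words.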
static inline void blend_a64_mask_bn_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_r_b = xx_loadl_64(mask);
    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b10);
}

static void blend_a64_mask_b12_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b12);
}

static inline void blend_a64_mask_bn_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b10);
}

static void blend_a64_mask_b12_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

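// Vertical sub-sampling, high bit depth: the same row-averaging scheme as
// the 8-bit kernels above (rounded _mm_avg_epu8 of two mask rows, mask
// pointer advancing by 2 * mask_stride), followed by widening to 16 bits.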
static inline void blend_a64_mask_bn_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_ra_b = xx_loadl_32(mask);
    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b10);
}

static void blend_a64_mask_b12_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b12);
}

static inline void blend_a64_mask_bn_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_ra_b = xx_loadl_64(mask + c);
      const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b10);
}

static void blend_a64_mask_b12_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

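// Combined sub-sampling, high bit depth: as in the 8-bit sx_sy kernels, two
// mask rows are summed in 8 bits, even and odd columns are split into 16-bit
// lanes via v_zmask_b, and (a + b + c + d + 2) >> 2 is applied with
// xx_roundn_epu16.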
static inline void blend_a64_mask_bn_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
    const __m128i v_rvsb_w =
        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, h,
                                    blend_4_b10);
}

static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, h,
                                    blend_4_b12);
}

static inline void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
      const __m128i v_rvsb_w =
          _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, w, h,
                                     blend_8_b10);
}

static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, w, h,
                                     blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
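// Note that bd == 8 and bd == 10 share the b10 kernels; the first table
// index is (bd == 12). The split is presumably because a 10-bit pixel times
// a 6-bit mask still fits in 16 bits (64 * 1023 < 65536), while 12-bit input
// needs wider intermediates. CONVERT_TO_SHORTPTR recovers the real uint16_t
// buffers from the uint8_t pointers of the common signature.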
void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
                                      const uint8_t *src0_8,
                                      uint32_t src0_stride,
                                      const uint8_t *src1_8,
                                      uint32_t src1_stride, const uint8_t *mask,
                                      uint32_t mask_stride, int w, int h,
                                      int subw, int subh, int bd) {
  typedef void (*blend_fn)(
      uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
      uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int w, int h);

  // Dimensions are: bd_index X width_index X subw X subh
  static const blend_fn blend[2][2][2][2] = {
    {   // bd == 8 or 10
      { // w % 8 == 0
        { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
        { blend_a64_mask_b10_sx_w8n_sse4_1,
          blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
      { // w == 4
        { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
        { blend_a64_mask_b10_sx_w4_sse4_1,
          blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
    {   // bd == 12
      { // w % 8 == 0
        { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
        { blend_a64_mask_b12_sx_w8n_sse4_1,
          blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
      { // w == 4
        { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
        { blend_a64_mask_b12_sx_w4_sse4_1,
          blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
  };

  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);
  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
                                src1_stride, mask, mask_stride, w, h, subw,
                                subh, bd);
  } else {
    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);

    blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
        dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
        mask_stride, w, h);
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

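// The d16 path blends CONV_BUF_TYPE intermediates (pre-rounding compound
// convolution output) rather than finished pixels. _mm_madd_epi16 on
// interleaved (s0, s1) and (m, 64 - m) pairs produces m * s0 + (64 - m) * s1
// directly in 32 bits; the compound round_offset is then subtracted, the
// result arithmetically shifted, signed-saturated to 16 bits and packed to
// bytes.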
static inline void blend_a64_d16_mask_w16_sse41(
    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
    const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
    const __m128i *v_maxval, int shift) {
  const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
  const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
  const __m128i s0_0 = xx_loadu_128(src0);
  const __m128i s0_1 = xx_loadu_128(src0 + 8);
  const __m128i s1_0 = xx_loadu_128(src1);
  const __m128i s1_1 = xx_loadu_128(src1 + 8);
  __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
                                   _mm_unpacklo_epi16(*m0, max_minus_m0));
  __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
                                   _mm_unpackhi_epi16(*m0, max_minus_m0));
  __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
                                   _mm_unpacklo_epi16(*m1, max_minus_m1));
  __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
                                   _mm_unpackhi_epi16(*m1, max_minus_m1));
  res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
  res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
  res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
  res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
  const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
  const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
  const __m128i res = _mm_packus_epi16(res0, res1);

  _mm_storeu_si128((__m128i *)(dst), res);
}

static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m = xx_loadu_128(mask + j);
      const __m128i m0 = _mm_cvtepu8_epi16(m);
      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

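// subw == 1 && subh == 1: each output mask value is (a + b + c + d + 2) >> 2
// over a 2x2 block. The vertical sum uses a saturating 8-bit add (which
// cannot saturate for valid masks: 64 + 64 == 128), _mm_maddubs_epi16
// against a vector of ones then sums horizontal pairs into 16-bit lanes,
// and the add-2/shift-2 completes the rounded average.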
static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i one_b = _mm_set1_epi8(1);
  const __m128i two_w = _mm_set1_epi16(2);
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
      const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);

      const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
      const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
      const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
      const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
      const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
      const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride << 1;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

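// subw == 1 && subh == 0: _mm_maddubs_epi16 against ones sums each
// horizontal pair of mask bytes, and _mm_avg_epu16 against zero computes
// (sum + 1) >> 1, the rounded average of the two samples.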
static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i one_b = _mm_set1_epi8(1);
  const __m128i zeros = _mm_setzero_si128();
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
      const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
      const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
      const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
      const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

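// subw == 0 && subh == 1: the two mask rows are summed with a saturating
// 8-bit add, _mm_avg_epu8 against zero halves the sum with rounding, and the
// result is widened to 16 bits.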
static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i zeros = _mm_setzero_si128();
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m_i00 = xx_loadu_128(mask + j);
      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);

      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
      const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride << 1;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

void aom_lowbd_blend_a64_d16_mask_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    ConvolveParams *conv_params) {
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

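  // Because the blend weights sum to 64 (1 << AOM_BLEND_A64_ROUND_BITS), the
  // per-sample compound offset (1 << (round_bits + bd)) +
  // (1 << (round_bits + bd - 1)) appears scaled by 64 in the weighted sum;
  // round_offset removes it, and the subtracted (1 << (round_bits - 1)) term
  // folds in the rounding constant for the final shift by
  // round_bits + AOM_BLEND_A64_ROUND_BITS.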
  const int round_offset =
      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
       (1 << (round_bits - 1)))
      << AOM_BLEND_A64_ROUND_BITS;

  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));

  assert(h >= 4);
  assert(w >= 4);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  const __m128i v_round_offset = _mm_set1_epi32(round_offset);

  if (subw == 0 && subh == 0) {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }

  } else if (subw == 1 && subh == 1) {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }
  } else if (subw == 1 && subh == 0) {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }
  } else {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }
  }
}

//////////////////////////////////////////////////////////////////////////////
// aom_highbd_blend_a64_d16_mask_sse4_1()
//////////////////////////////////////////////////////////////////////////////
#if CONFIG_AV1_HIGHBITDEPTH
1116 static inline void highbd_blend_a64_d16_mask_w4_sse4_1(
1117     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
1118     const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
1119     const __m128i *mask0b, const __m128i *round_offset, int shift,
1120     const __m128i *clip_low, const __m128i *clip_high,
1121     const __m128i *mask_max) {
1122   // Load 4 pixels from each of 4 rows from each source
1123   const __m128i s0a = xx_loadu_2x64(src0, src0 + src0_stride);
1124   const __m128i s0b =
1125       xx_loadu_2x64(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
1126   const __m128i s1a = xx_loadu_2x64(src1, src1 + src1_stride);
1127   const __m128i s1b =
1128       xx_loadu_2x64(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
1129 
1130   // Generate the inverse masks
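       // (mask1 = 64 - mask0, so the two weights for each pixel always sum
       // to AOM_BLEND_A64_MAX_ALPHA)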
1131   const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
1132   const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b);
1133 
1134   // Multiply each mask by the respective source
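       // mulhi/mullo return the high and low 16 bits of each 16x16-bit
       // product; unpacking low/high halves interleaves them back into full
       // 32-bit products.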
1135   const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
1136   const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
1137   const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
1138   const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
1139   const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
1140   const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
1141   const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
1142   const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
1143 
1144   const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
1145   const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
1146   const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
1147   const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
1148   const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
1149   const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
1150   const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
1151   const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
1152 
1153   const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
1154   const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
1155   const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
1156   const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
1157 
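       // Remove the compound offset and shift down to pixel range (see the
       // round_offset note in aom_lowbd_blend_a64_d16_mask_sse4_1 above).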
1158   const __m128i roundah =
1159       _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
1160   const __m128i roundbh =
1161       _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
1162   const __m128i roundal =
1163       _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
1164   const __m128i roundbl =
1165       _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
1166 
1167   const __m128i packa = _mm_packs_epi32(roundal, roundah);
1168   const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
1169 
1170   const __m128i clipa =
1171       _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
1172   const __m128i clipb =
1173       _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
1174 
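       // The earlier row of each pair sits in the upper 64 bits of the
       // register, so store the high half first.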
1175   xx_storel_64(dst, _mm_srli_si128(clipa, 8));
1176   xx_storel_64(dst + dst_stride, clipa);
1177   xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8));
1178   xx_storel_64(dst + 3 * dst_stride, clipb);
1179 }
1180 
1181 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
1182     uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
1183     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
1184     const uint8_t *mask, uint32_t mask_stride, int h,
1185     const __m128i *round_offset, int shift, const __m128i *clip_low,
1186     const __m128i *clip_high, const __m128i *mask_max) {
1187   do {
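         // Gather 4 mask bytes from each of 4 rows (two rows per register,
         // the earlier row in the upper 32 bits) and widen them to 16 bits.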
1188     const __m128i mask0a8 =
1189         _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride));
1190     const __m128i mask0b8 =
1191         _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride),
1192                       *(int32_t *)(mask + 3 * mask_stride));
1193     const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8);
1194     const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8);
1195 
1196     highbd_blend_a64_d16_mask_w4_sse4_1(
1197         dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
1198         round_offset, shift, clip_low, clip_high, mask_max);
1199 
1200     dst += dst_stride * 4;
1201     src0 += src0_stride * 4;
1202     src1 += src1_stride * 4;
1203     mask += mask_stride * 4;
1204   } while (h -= 4);
1205 }
1206 
1207 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
1208     uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
1209     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
1210     const uint8_t *mask, uint32_t mask_stride, int h,
1211     const __m128i *round_offset, int shift, const __m128i *clip_low,
1212     const __m128i *clip_high, const __m128i *mask_max) {
1213   const __m128i one_b = _mm_set1_epi8(1);
1214   const __m128i two_w = _mm_set1_epi16(2);
1215   do {
1216     // Load 8 mask pixels from each of 8 rows, saturating-add vertically
1217     // paired rows, then use maddubs against 1 to sum adjacent pixels.
1218     // Finally, divide each value by 4 (with rounding).
1219     const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask),
1220                                        *(int64_t *)(mask + 2 * mask_stride));
1221     const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride),
1222                                        *(int64_t *)(mask + 3 * mask_stride));
1223     const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
1224     const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
1225     const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride),
1226                                        *(int64_t *)(mask + 6 * mask_stride));
1227     const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride),
1228                                        *(int64_t *)(mask + 7 * mask_stride));
1229     const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
1230     const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
1231 
1232     highbd_blend_a64_d16_mask_w4_sse4_1(
1233         dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
1234         &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
1235 
1236     dst += dst_stride * 4;
1237     src0 += src0_stride * 4;
1238     src1 += src1_stride * 4;
1239     mask += mask_stride * 8;
1240   } while (h -= 4);
1241 }
1242 
1243 static inline void highbd_blend_a64_d16_mask_w8_sse4_1(
1244     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
1245     const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
1246     const __m128i *mask0b, const __m128i *round_offset, int shift,
1247     const __m128i *clip_low, const __m128i *clip_high,
1248     const __m128i *max_mask) {
1249   // Load 8x u16 pixels from each of 2 rows from each source
1250   const __m128i s0a = xx_loadu_128(src0);
1251   const __m128i s0b = xx_loadu_128(src0 + src0_stride);
1252   const __m128i s1a = xx_loadu_128(src1);
1253   const __m128i s1b = xx_loadu_128(src1 + src1_stride);
1254 
1255   // Generate inverse masks
1256   const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a);
1257   const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b);
1258 
1259   // Multiply sources by respective masks
1260   const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
1261   const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
1262   const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
1263   const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
1264 
1265   const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
1266   const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
1267   const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
1268   const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
1269 
1270   const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
1271   const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
1272 
1273   const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
1274   const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
1275   const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
1276   const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
1277 
1278   const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
1279   const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
1280   const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
1281   const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
1282 
1283   const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
1284   const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
1285 
1286   const __m128i roundah =
1287       _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
1288   const __m128i roundal =
1289       _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
1290   const __m128i roundbh =
1291       _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
1292   const __m128i roundbl =
1293       _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
1294 
1295   const __m128i packa = _mm_packs_epi32(roundal, roundah);
1296   const __m128i clipa =
1297       _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
1298   const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
1299   const __m128i clipb =
1300       _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
1301 
1302   xx_storeu_128(dst, clipa);
1303   xx_storeu_128(dst + dst_stride, clipb);
1304 }
1305 
1306 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
1307     uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
1308     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
1309     const uint8_t *mask, uint32_t mask_stride, int h,
1310     const __m128i *round_offset, int shift, const __m128i *clip_low,
1311     const __m128i *clip_high, const __m128i *max_mask) {
1312   do {
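         // No sub-sampling: just widen 8 mask bytes per row to 16-bit lanes.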
1313     const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask));
1314     const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride));
1315     highbd_blend_a64_d16_mask_w8_sse4_1(
1316         dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
1317         round_offset, shift, clip_low, clip_high, max_mask);
1318 
1319     dst += dst_stride * 2;
1320     src0 += src0_stride * 2;
1321     src1 += src1_stride * 2;
1322     mask += mask_stride * 2;
1323   } while (h -= 2);
1324 }
1325 
1326 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
1327     uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
1328     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
1329     const uint8_t *mask, uint32_t mask_stride, int h,
1330     const __m128i *round_offset, int shift, const __m128i *clip_low,
1331     const __m128i *clip_high, const __m128i *max_mask) {
1332   const __m128i one_b = _mm_set1_epi8(1);
1333   const __m128i two_w = _mm_set1_epi16(2);
1334   do {
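         // 2x2-average the mask: saturating-add the two rows, sum adjacent
         // bytes with maddubs against 1, then add 2 and shift right by 2 to
         // divide by 4 with rounding.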
1335     const __m128i mask_thisrowa = xx_loadu_128(mask);
1336     const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride);
1337     const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride);
1338     const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride);
1339     const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa);
1340     const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb);
1341     const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b);
1342     const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b);
1343     const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2);
1344     const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2);
1345 
1346     highbd_blend_a64_d16_mask_w8_sse4_1(
1347         dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa,
1348         &mask_sb, round_offset, shift, clip_low, clip_high, max_mask);
1349 
1350     dst += dst_stride * 2;
1351     src0 += src0_stride * 2;
1352     src1 += src1_stride * 2;
1353     mask += mask_stride * 4;
1354   } while (h -= 2);
1355 }
1356 
1357 static inline void highbd_blend_a64_d16_mask_w16_sse4_1(
1358     uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
1359     const __m128i *round_offset, int shift, const __m128i *mask0l,
1360     const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high,
1361     const __m128i *mask_max) {
1362   // Load 16x u16 pixels for this row from each src
1363   const __m128i s0l = xx_loadu_128(src0);
1364   const __m128i s0h = xx_loadu_128(src0 + 8);
1365   const __m128i s1l = xx_loadu_128(src1);
1366   const __m128i s1h = xx_loadu_128(src1 + 8);
1367 
1368   // Calculate inverse masks
1369   const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h);
1370   const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l);
1371 
1372   const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h);
1373   const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h);
1374   const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs);
1375   const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs);
1376 
1377   const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h);
1378   const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h);
1379   const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs);
1380   const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs);
1381 
1382   const __m128i mulhh = _mm_add_epi32(mul0h, mul1h);
1383   const __m128i mulhl = _mm_add_epi32(mul0l, mul1l);
1384 
1385   const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l);
1386   const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l);
1387   const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs);
1388   const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs);
1389 
1390   const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l);
1391   const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l);
1392   const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs);
1393   const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs);
1394 
1395   const __m128i mullh = _mm_add_epi32(mul2h, mul3h);
1396   const __m128i mulll = _mm_add_epi32(mul2l, mul3l);
1397 
1398   const __m128i reshh =
1399       _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift);
1400   const __m128i reshl =
1401       _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift);
1402   const __m128i reslh =
1403       _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift);
1404   const __m128i resll =
1405       _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift);
1406 
1407   // Signed saturating pack from i32 to i16:
1408   const __m128i packh = _mm_packs_epi32(reshl, reshh);
1409   const __m128i packl = _mm_packs_epi32(resll, reslh);
1410 
1411   // Clip the values to the valid range
1412   const __m128i cliph =
1413       _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high);
1414   const __m128i clipl =
1415       _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high);
1416 
1417   // Store 16 pixels
1418   xx_storeu_128(dst, clipl);
1419   xx_storeu_128(dst + 8, cliph);
1420 }
1421 
1422 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
1423     uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
1424     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
1425     const uint8_t *mask, uint32_t mask_stride, int h, int w,
1426     const __m128i *round_offset, int shift, const __m128i *clip_low,
1427     const __m128i *clip_high, const __m128i *mask_max) {
1428   for (int i = 0; i < h; i++) {
1429     for (int j = 0; j < w; j += 16) {
1430       // Load 16x u8 alpha-mask values and pad to u16
1431       const __m128i masks_u8 = xx_loadu_128(mask + j);
1432       const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8);
1433       const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8));
1434 
1435       highbd_blend_a64_d16_mask_w16_sse4_1(
1436           dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h,
1437           clip_low, clip_high, mask_max);
1438     }
1439     dst += dst_stride;
1440     src0 += src0_stride;
1441     src1 += src1_stride;
1442     mask += mask_stride;
1443   }
1444 }
1445 
1446 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
1447     uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
1448     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
1449     const uint8_t *mask, uint32_t mask_stride, int h, int w,
1450     const __m128i *round_offset, int shift, const __m128i *clip_low,
1451     const __m128i *clip_high, const __m128i *mask_max) {
1452   const __m128i one_b = _mm_set1_epi8(1);
1453   const __m128i two_w = _mm_set1_epi16(2);
1454   for (int i = 0; i < h; i++) {
1455     for (int j = 0; j < w; j += 16) {
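           // Each 16-wide output row consumes a 32x2 block of mask values.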
1456       const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
1457       const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
1458       const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
1459       const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
1460 
1461       const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
1462       const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
1463       const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
1464       const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
1465       const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
1466       const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
1467 
1468       highbd_blend_a64_d16_mask_w16_sse4_1(
1469           dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h,
1470           clip_low, clip_high, mask_max);
1471     }
1472     dst += dst_stride;
1473     src0 += src0_stride;
1474     src1 += src1_stride;
1475     mask += mask_stride * 2;
1476   }
1477 }
1478 
1479 void aom_highbd_blend_a64_d16_mask_sse4_1(
1480     uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
1481     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
1482     const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
1483     ConvolveParams *conv_params, const int bd) {
1484   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1485   const int round_bits =
1486       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
1487   const int32_t round_offset =
1488       ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
1489        (1 << (round_bits - 1)))
1490       << AOM_BLEND_A64_ROUND_BITS;
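       // Same offset-removal and rounding scheme as the lowbd wrapper above.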
1491   const __m128i v_round_offset = _mm_set1_epi32(round_offset);
1492   const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
1493 
1494   const __m128i clip_low = _mm_setzero_si128();
1495   const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1);
1496   const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
1497 
1498   assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
1499   assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
1500 
1501   assert(h >= 4);
1502   assert(w >= 4);
1503   assert(IS_POWER_OF_TWO(h));
1504   assert(IS_POWER_OF_TWO(w));
1505 
1506   if (subw == 0 && subh == 0) {
1507     switch (w) {
1508       case 4:
1509         highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
1510             dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1511             mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
1512             &mask_max);
1513         break;
1514       case 8:
1515         highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
1516             dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1517             mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
1518             &mask_max);
1519         break;
1520       default:  // >=16
1521         highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
1522             dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1523             mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
1524             &mask_max);
1525         break;
1526     }
1527 
1528   } else if (subw == 1 && subh == 1) {
1529     switch (w) {
1530       case 4:
1531         highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
1532             dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1533             mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
1534             &mask_max);
1535         break;
1536       case 8:
1537         highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
1538             dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1539             mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
1540             &mask_max);
1541         break;
1542       default:  // >=16
1543         highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
1544             dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1545             mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
1546             &mask_max);
1547         break;
1548     }
1549   } else {
1550     // Sub-sampling in only one axis doesn't seem to happen very often, so
1551     // fall back to the vanilla C implementation instead of carrying all the
1552     // optimised code for these cases.
1553     aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
1554                                     src1_stride, mask, mask_stride, w, h, subw,
1555                                     subh, conv_params, bd);
1556   }
1557 }
1558 #endif  // CONFIG_AV1_HIGHBITDEPTH
1559