/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>  // SSE4.1

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"

#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"
#include "aom_dsp/x86/blend_mask_sse4.h"

#include "config/aom_dsp_rtcd.h"

//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////
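// Each kernel below handles one mask width. Per output row it loads the mask
// m0, forms the inverse mask m1 = AOM_BLEND_A64_MAX_ALPHA - m0, and passes
// both (plus the rounding constant _r) to the blend_*_u8() helpers from
// blend_sse4.h, which produce the A64 blend
// (src0 * m0 + src1 * m1 + rounding) >> AOM_BLEND_A64_ROUND_BITS.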
static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                     const uint8_t *src0, uint32_t src0_stride,
                                     const uint8_t *src1, uint32_t src1_stride,
                                     const uint8_t *mask, uint32_t mask_stride,
                                     int w, int h) {
  (void)w;
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                     const uint8_t *src0, uint32_t src0_stride,
                                     const uint8_t *src1, uint32_t src1_stride,
                                     const uint8_t *mask, uint32_t mask_stride,
                                     int w, int h) {
  (void)w;
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_m0_b = xx_loadl_64(mask);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_m0_b = xx_loadu_128(mask + c);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////
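// With horizontal sub-sampling the mask row is twice as wide as the output
// row, so each output pixel uses the rounding average of two adjacent mask
// bytes: g_blend_a64_mask_shuffle groups even- and odd-indexed bytes so that
// a single _mm_avg_epu8 combines each pair.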
static void blend_a64_mask_sx_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_r_b = xx_loadl_64(mask);
    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_r_b = xx_loadu_128(mask);
    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
      const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
      const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
      const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
      const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
      const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
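// With vertical sub-sampling the mask has twice as many rows as the output:
// each output row is blended with the rounding average (_mm_avg_epu8) of two
// consecutive mask rows, and the mask pointer advances by 2 * mask_stride.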
static void blend_a64_mask_sy_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));

  do {
    const __m128i v_ra_b = xx_loadl_32(mask);
    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sy_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_ra_b = xx_loadu_128(mask + c);
      const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
      const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
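// With sub-sampling in both directions each output pixel covers a 2x2 block
// of mask values: the two mask rows are added byte-wise (values <= 64, so no
// overflow), horizontally adjacent pairs are then summed as 16-bit values,
// and the total is rounded down by 2 bits, i.e. (a + b + c + d + 2) >> 2.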
static void blend_a64_mask_sx_sy_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  (void)w;

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_sy_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  (void)w;

  do {
    const __m128i v_ra_b = xx_loadu_128(mask);
    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);

    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
      const __m128i v_rvsbl_w =
          _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
      const __m128i v_rvsbh_w =
          _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);

      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
      const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
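// In the dispatcher below, w is a power of two >= 4, so (w >> 2) & 3 is 1 for
// w == 4, 2 for w == 8 and 0 for every multiple of 16, matching the table
// layout. Blocks with w or h below 4 fall back to the C implementation.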
void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                               const uint8_t *src0, uint32_t src0_stride,
                               const uint8_t *src1, uint32_t src1_stride,
                               const uint8_t *mask, uint32_t mask_stride, int w,
                               int h, int subw, int subh) {
  typedef void (*blend_fn)(
      uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int w, int h);

  // Dimensions are: width_index X subx X suby
  static const blend_fn blend[3][2][2] = {
    { // w % 16 == 0
      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
    { // w == 4
      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
    { // w == 8
      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
  };

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
                         mask, mask_stride, w, h, subw, subh);
  } else {
    blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
                                              src0_stride, src1, src1_stride,
                                              mask, mask_stride, w, h);
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////
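// The high bit-depth kernels below are built from shared bn_* workers that
// take a blend_unit_fn (blend_*_b10 or blend_*_b12 from blend_sse4.h); the
// 10-bit and 12-bit wrappers differ only in which unit function they pass.
// Masks are widened to 16 bits and the inverse mask is 64 - m0, as in the
// 8-bit path.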
static inline void blend_a64_mask_bn_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b10);
}

static void blend_a64_mask_b12_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b12);
}

static inline void blend_a64_mask_bn_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_m0_b = xx_loadl_64(mask + c);
      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, w, h,
                               blend_8_b10);
}

static void blend_a64_mask_b12_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, w, h,
                               blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////

static inline void blend_a64_mask_bn_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_r_b = xx_loadl_64(mask);
    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b10);
}

static void blend_a64_mask_b12_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b12);
}

static inline void blend_a64_mask_bn_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b10);
}

static void blend_a64_mask_b12_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

static inline void blend_a64_mask_bn_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_ra_b = xx_loadl_32(mask);
    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b10);
}

static void blend_a64_mask_b12_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b12);
}

static inline void blend_a64_mask_bn_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_ra_b = xx_loadl_64(mask + c);
      const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b10);
}

static void blend_a64_mask_b12_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

static inline void blend_a64_mask_bn_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
    const __m128i v_rvsb_w =
        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, h,
                                    blend_4_b10);
}

static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, h,
                                    blend_4_b12);
}

static inline void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
      const __m128i v_rvsb_w =
          _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, w, h,
                                     blend_8_b10);
}

static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, w, h,
                                     blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
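// Table index below: bd == 12 selects the second set of kernels (8-bit and
// 10-bit input share the b10 kernels), and since w is a power of two >= 4,
// (w >> 2) & 1 is 1 only for w == 4; every multiple of 8 maps to 0.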
void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
                                      const uint8_t *src0_8,
                                      uint32_t src0_stride,
                                      const uint8_t *src1_8,
                                      uint32_t src1_stride, const uint8_t *mask,
                                      uint32_t mask_stride, int w, int h,
                                      int subw, int subh, int bd) {
  typedef void (*blend_fn)(
      uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
      uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int w, int h);

  // Dimensions are: bd_index X width_index X subw X subh
  static const blend_fn blend[2][2][2][2] = {
    { // bd == 8 or 10
      { // w % 8 == 0
        { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
        { blend_a64_mask_b10_sx_w8n_sse4_1,
          blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
      { // w == 4
        { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
        { blend_a64_mask_b10_sx_w4_sse4_1,
          blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
    { // bd == 12
      { // w % 8 == 0
        { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
        { blend_a64_mask_b12_sx_w8n_sse4_1,
          blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
      { // w == 4
        { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
        { blend_a64_mask_b12_sx_w4_sse4_1,
          blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
  };

  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);
  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
                                src1_stride, mask, mask_stride, w, h, subw,
                                subh, bd);
  } else {
    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);

    blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
        dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
        mask_stride, w, h);
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH
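// d16 path: src0/src1 hold 16-bit intermediates (CONV_BUF_TYPE) that still
// carry the compound-prediction rounding offset. Interleaving (s0, s1) with
// (m, 64 - m) and using _mm_madd_epi16 yields 32-bit s0 * m + s1 * (64 - m);
// the round_offset is then subtracted, the sum arithmetically shifted by
// `shift`, and the result packed with unsigned saturation to 8-bit pixels.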
static inline void blend_a64_d16_mask_w16_sse41(
    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
    const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
    const __m128i *v_maxval, int shift) {
  const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
  const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
  const __m128i s0_0 = xx_loadu_128(src0);
  const __m128i s0_1 = xx_loadu_128(src0 + 8);
  const __m128i s1_0 = xx_loadu_128(src1);
  const __m128i s1_1 = xx_loadu_128(src1 + 8);
  __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
                                   _mm_unpacklo_epi16(*m0, max_minus_m0));
  __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
                                   _mm_unpackhi_epi16(*m0, max_minus_m0));
  __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
                                   _mm_unpacklo_epi16(*m1, max_minus_m1));
  __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
                                   _mm_unpackhi_epi16(*m1, max_minus_m1));
  res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
  res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
  res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
  res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
  const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
  const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
  const __m128i res = _mm_packus_epi16(res0, res1);

  _mm_storeu_si128((__m128i *)(dst), res);
}
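// The subw/subh variants below differ only in how the 8-bit mask is reduced
// to one 16-bit value per output pixel (direct widening, 2x2 rounded average,
// horizontal pair average, or vertical pair average) before calling
// blend_a64_d16_mask_w16_sse41().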
static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m = xx_loadu_128(mask + j);
      const __m128i m0 = _mm_cvtepu8_epi16(m);
      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i one_b = _mm_set1_epi8(1);
  const __m128i two_w = _mm_set1_epi16(2);
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
      const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);

      const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
      const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
      const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
      const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
      const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
      const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride << 1;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i one_b = _mm_set1_epi8(1);
  const __m128i zeros = _mm_setzero_si128();
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
      const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
      const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
      const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
      const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i zeros = _mm_setzero_si128();
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m_i00 = xx_loadu_128(mask + j);
      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);

      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
      const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride << 1;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

void aom_lowbd_blend_a64_d16_mask_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    ConvolveParams *conv_params) {
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  const int round_offset =
      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
       (1 << (round_bits - 1)))
      << AOM_BLEND_A64_ROUND_BITS;

  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));

  assert(h >= 4);
  assert(w >= 4);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  const __m128i v_round_offset = _mm_set1_epi32(round_offset);

  if (subw == 0 && subh == 0) {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }

  } else if (subw == 1 && subh == 1) {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }
  } else if (subw == 1 && subh == 0) {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }
  } else {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }
  }
}

//////////////////////////////////////////////////////////////////////////////
// aom_highbd_blend_a64_d16_mask_sse4_1()
//////////////////////////////////////////////////////////////////////////////
#if CONFIG_AV1_HIGHBITDEPTH
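// High bit-depth d16 path: mask * source products can exceed 16 bits, so full
// 32-bit products are assembled from _mm_mulhi_epu16 / _mm_mullo_epi16 pairs.
// After subtracting round_offset and shifting, results are clamped to the
// [clip_low, clip_high] range supplied for the target bit depth.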
highbd_blend_a64_d16_mask_w4_sse4_1(uint16_t * dst,int dst_stride,const CONV_BUF_TYPE * src0,int src0_stride,const CONV_BUF_TYPE * src1,int src1_stride,const __m128i * mask0a,const __m128i * mask0b,const __m128i * round_offset,int shift,const __m128i * clip_low,const __m128i * clip_high,const __m128i * mask_max)1116 static inline void highbd_blend_a64_d16_mask_w4_sse4_1(
1117 uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
1118 const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
1119 const __m128i *mask0b, const __m128i *round_offset, int shift,
1120 const __m128i *clip_low, const __m128i *clip_high,
1121 const __m128i *mask_max) {
1122 // Load 4 pixels from each of 4 rows from each source
1123 const __m128i s0a = xx_loadu_2x64(src0, src0 + src0_stride);
1124 const __m128i s0b =
1125 xx_loadu_2x64(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
1126 const __m128i s1a = xx_loadu_2x64(src1, src1 + src1_stride);
1127 const __m128i s1b =
1128 xx_loadu_2x64(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
1129
1130 // Generate the inverse masks
1131 const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
1132 const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b);
1133
1134 // Multiply each mask by the respective source
1135 const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
1136 const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
1137 const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
1138 const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
1139 const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
1140 const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
1141 const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
1142 const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
1143
1144 const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
1145 const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
1146 const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
1147 const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
1148 const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
1149 const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
1150 const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
1151 const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
1152
1153 const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
1154 const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
1155 const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
1156 const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
1157
1158 const __m128i roundah =
1159 _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
1160 const __m128i roundbh =
1161 _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
1162 const __m128i roundal =
1163 _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
1164 const __m128i roundbl =
1165 _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
1166
1167 const __m128i packa = _mm_packs_epi32(roundal, roundah);
1168 const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
1169
1170 const __m128i clipa =
1171 _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
1172 const __m128i clipb =
1173 _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
1174
1175 xx_storel_64(dst, _mm_srli_si128(clipa, 8));
1176 xx_storel_64(dst + dst_stride, clipa);
1177 xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8));
1178 xx_storel_64(dst + 3 * dst_stride, clipb);
1179 }
1180
highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(uint16_t * dst,uint32_t dst_stride,const CONV_BUF_TYPE * src0,uint32_t src0_stride,const CONV_BUF_TYPE * src1,uint32_t src1_stride,const uint8_t * mask,uint32_t mask_stride,int h,const __m128i * round_offset,int shift,const __m128i * clip_low,const __m128i * clip_high,const __m128i * mask_max)1181 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h,
    const __m128i *round_offset, int shift, const __m128i *clip_low,
    const __m128i *clip_high, const __m128i *mask_max) {
  do {
    const __m128i mask0a8 =
        _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride));
    const __m128i mask0b8 =
        _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride),
                      *(int32_t *)(mask + 3 * mask_stride));
    const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8);
    const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8);

    highbd_blend_a64_d16_mask_w4_sse4_1(
        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
        round_offset, shift, clip_low, clip_high, mask_max);

    dst += dst_stride * 4;
    src0 += src0_stride * 4;
    src1 += src1_stride * 4;
    mask += mask_stride * 4;
  } while (h -= 4);
}

static inline void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h,
    const __m128i *round_offset, int shift, const __m128i *clip_low,
    const __m128i *clip_high, const __m128i *mask_max) {
  const __m128i one_b = _mm_set1_epi8(1);
  const __m128i two_w = _mm_set1_epi16(2);
  do {
    // Load 8 mask pixels from each of 8 rows, saturating-add vertically
    // adjacent rows, then use madd to sum horizontally adjacent pixels.
    // Finally, divide each value by 4 (with rounding).
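    // For each 2x2 block with mask values m00, m01 (row r) and m10, m11
    // (row r + 1) this yields (m00 + m01 + m10 + m11 + 2) >> 2; the values
    // are at most 64, so the saturating u8 adds cannot overflow.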
    const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask),
                                       *(int64_t *)(mask + 2 * mask_stride));
    const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride),
                                       *(int64_t *)(mask + 3 * mask_stride));
    const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
    const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
    const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride),
                                       *(int64_t *)(mask + 6 * mask_stride));
    const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride),
                                       *(int64_t *)(mask + 7 * mask_stride));
    const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
    const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);

    highbd_blend_a64_d16_mask_w4_sse4_1(
        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
        &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);

    dst += dst_stride * 4;
    src0 += src0_stride * 4;
    src1 += src1_stride * 4;
    mask += mask_stride * 8;
  } while (h -= 4);
}

static inline void highbd_blend_a64_d16_mask_w8_sse4_1(
    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
    const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
    const __m128i *mask0b, const __m128i *round_offset, int shift,
    const __m128i *clip_low, const __m128i *clip_high,
    const __m128i *max_mask) {
  // Load 8x pixels from each of 2 rows from each source
  const __m128i s0a = xx_loadu_128(src0);
  const __m128i s0b = xx_loadu_128(src0 + src0_stride);
  const __m128i s1a = xx_loadu_128(src1);
  const __m128i s1b = xx_loadu_128(src1 + src1_stride);

  // Generate inverse masks
  const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a);
  const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b);

  // Multiply sources by respective masks
  const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
  const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
  const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
  const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);

  const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
  const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
  const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
  const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);

  const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
  const __m128i sumal = _mm_add_epi32(mul0al, mul1al);

  const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
  const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
  const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
  const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);

  const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
  const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
  const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
  const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);

  const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
  const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);

  const __m128i roundah =
      _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
  const __m128i roundal =
      _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
  const __m128i roundbh =
      _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
  const __m128i roundbl =
      _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);

  const __m128i packa = _mm_packs_epi32(roundal, roundah);
  const __m128i clipa =
      _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
  const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
  const __m128i clipb =
      _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);

  xx_storeu_128(dst, clipa);
  xx_storeu_128(dst + dst_stride, clipb);
}

static inline void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h,
    const __m128i *round_offset, int shift, const __m128i *clip_low,
    const __m128i *clip_high, const __m128i *max_mask) {
  do {
    const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask));
    const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride));
    highbd_blend_a64_d16_mask_w8_sse4_1(
        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
        round_offset, shift, clip_low, clip_high, max_mask);

    dst += dst_stride * 2;
    src0 += src0_stride * 2;
    src1 += src1_stride * 2;
    mask += mask_stride * 2;
  } while (h -= 2);
}

static inline void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h,
    const __m128i *round_offset, int shift, const __m128i *clip_low,
    const __m128i *clip_high, const __m128i *max_mask) {
  const __m128i one_b = _mm_set1_epi8(1);
  const __m128i two_w = _mm_set1_epi16(2);
  do {
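    // Average each 2x2 block of the sub-sampled mask with rounding: load two
    // 16-byte mask rows per output row, saturating-add them vertically, sum
    // horizontally adjacent pairs with maddubs, then compute (sum + 2) >> 2.
    // Mask values are at most 64, so the u8 adds cannot saturate.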
    const __m128i mask_thisrowa = xx_loadu_128(mask);
    const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride);
    const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride);
    const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride);
    const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa);
    const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb);
    const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b);
    const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b);
    const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2);
    const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2);

    highbd_blend_a64_d16_mask_w8_sse4_1(
        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa,
        &mask_sb, round_offset, shift, clip_low, clip_high, max_mask);

    dst += dst_stride * 2;
    src0 += src0_stride * 2;
    src1 += src1_stride * 2;
    mask += mask_stride * 4;
  } while (h -= 2);
}

static inline void highbd_blend_a64_d16_mask_w16_sse4_1(
    uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
    const __m128i *round_offset, int shift, const __m128i *mask0l,
    const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high,
    const __m128i *mask_max) {
  // Load 16x u16 pixels for this row from each src
  const __m128i s0l = xx_loadu_128(src0);
  const __m128i s0h = xx_loadu_128(src0 + 8);
  const __m128i s1l = xx_loadu_128(src1);
  const __m128i s1h = xx_loadu_128(src1 + 8);

  // Calculate inverse masks
  const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h);
  const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l);

  const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h);
  const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h);
  const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs);
  const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs);

  const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h);
  const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h);
  const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs);
  const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs);

  const __m128i mulhh = _mm_add_epi32(mul0h, mul1h);
  const __m128i mulhl = _mm_add_epi32(mul0l, mul1l);

  const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l);
  const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l);
  const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs);
  const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs);

  const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l);
  const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l);
  const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs);
  const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs);

  const __m128i mullh = _mm_add_epi32(mul2h, mul3h);
  const __m128i mulll = _mm_add_epi32(mul2l, mul3l);

  const __m128i reshh =
      _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift);
  const __m128i reshl =
      _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift);
  const __m128i reslh =
      _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift);
  const __m128i resll =
      _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift);

  // Signed saturating pack from i32 to i16:
  const __m128i packh = _mm_packs_epi32(reshl, reshh);
  const __m128i packl = _mm_packs_epi32(resll, reslh);

  // Clip the values to the valid range
  const __m128i cliph =
      _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high);
  const __m128i clipl =
      _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high);

  // Store 16 pixels
  xx_storeu_128(dst, clipl);
  xx_storeu_128(dst + 8, cliph);
}

static inline void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift, const __m128i *clip_low,
    const __m128i *clip_high, const __m128i *mask_max) {
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j += 16) {
      // Load 16x u8 alpha-mask values and pad to u16
      const __m128i masks_u8 = xx_loadu_128(mask + j);
      const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8);
      const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8));

      highbd_blend_a64_d16_mask_w16_sse4_1(
          dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h,
          clip_low, clip_high, mask_max);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  }
}

static inline void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift, const __m128i *clip_low,
    const __m128i *clip_high, const __m128i *mask_max) {
  const __m128i one_b = _mm_set1_epi8(1);
  const __m128i two_w = _mm_set1_epi16(2);
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j += 16) {
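      // As in the narrower kernels, average each 2x2 block of the
      // sub-sampled mask with rounding: add vertically adjacent rows, sum
      // horizontally adjacent pairs with maddubs, then compute (sum + 2) >> 2.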
      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
      const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);

      const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
      const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
      const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
      const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
      const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
      const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);

      highbd_blend_a64_d16_mask_w16_sse4_1(
          dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h,
          clip_low, clip_high, mask_max);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride * 2;
  }
}

void aom_highbd_blend_a64_d16_mask_sse4_1(
    uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    ConvolveParams *conv_params, const int bd) {
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
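  // The CONV_BUF_TYPE intermediates carry a compound-prediction offset of
  // (1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)). The 64-weight
  // blend scales that offset by 1 << AOM_BLEND_A64_ROUND_BITS, so
  // round_offset below removes it and folds in the rounding bias
  // 1 << (shift - 1) ahead of the final arithmetic shift right by 'shift'.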
  const int32_t round_offset =
      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
       (1 << (round_bits - 1)))
      << AOM_BLEND_A64_ROUND_BITS;
  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;

  const __m128i clip_low = _mm_setzero_si128();
  const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1);
  const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));

  assert(h >= 4);
  assert(w >= 4);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

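  // w and h are asserted to be powers of two no smaller than 4, so the
  // default cases below only ever see widths that are multiples of 16.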
  if (subw == 0 && subh == 0) {
    switch (w) {
      case 4:
        highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
            &mask_max);
        break;
      case 8:
        highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
            &mask_max);
        break;
      default:  // >=16
        highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
            &mask_max);
        break;
    }

  } else if (subw == 1 && subh == 1) {
    switch (w) {
      case 4:
        highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
            &mask_max);
        break;
      case 8:
        highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
            &mask_max);
        break;
      default:  // >=16
        highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
            &mask_max);
        break;
    }
  } else {
    // Sub-sampling in only one axis is relatively rare, so fall back to the
    // plain C implementation rather than carrying optimised code for those
    // cases as well.
    aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, w, h, subw,
                                    subh, conv_params, bd);
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH