/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"

typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   uint32_t *sse, int *sum);

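// Bilinear sub-pixel variance for high-bitdepth (CONVERT_TO_SHORTPTR) blocks:
// the source is filtered with a two-tap bilinear filter selected by xoffset
// (horizontal) and yoffset (vertical), then the sum and SSE of the difference
// against dst are accumulated. Offsets 0 and 4 are special-cased below, since
// 0 needs no filtering in that direction and 4 reduces to _mm256_avg_epu16().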
static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
    const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8,
    int dst_stride, uint32_t *sse) {
  const __m256i filter1 =
      _mm256_set1_epi32((int)(bilinear_filters_2t[xoffset][1] << 16) |
                        bilinear_filters_2t[xoffset][0]);
  const __m256i filter2 =
      _mm256_set1_epi32((int)(bilinear_filters_2t[yoffset][1] << 16) |
                        bilinear_filters_2t[yoffset][0]);
  const __m256i one = _mm256_set1_epi16(1);
  const int bitshift = 0x40;
  (void)pixel_step;
  unsigned int i, j, prev = 0, curr = 2;
  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
  uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8);
  uint16_t *src_ptr_ref = src_ptr;
  uint16_t *dst_ptr_ref = dst_ptr;
  int64_t sum_long = 0;
  uint64_t sse_long = 0;
  unsigned int rshift = 0, inc = 1;
  __m256i rbias = _mm256_set1_epi32(bitshift);
  __m256i opointer[8];
  unsigned int range;
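  // The block is processed in 16-pixel-wide column strips (range). Heights of
  // 8 and 4 fold several strips into one pass of the inner loop via 'inc', and
  // opointer[] keeps the previously filtered row so each vertical filter step
  // can reuse it instead of recomputing it.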
  if (xoffset == 0) {
    if (yoffset == 0) {  // xoffset==0 && yoffset==0
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;
        }
        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();
        for (i = 0; i < 16 / inc; ++i) {
          __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;
          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else if (yoffset == 4) {  // xoffset==0 && yoffset==4
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;

          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;
          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else {  // xoffset==0 && yoffset==1,2,3,5,6,7
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;

          __m256i V_S_M1 =
              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
          __m256i V_S_M2 =
              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

          __m256i V_S_S1 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
          __m256i V_S_S2 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);
    }
  } else if (xoffset == 4) {
    if (yoffset == 0) {  // xoffset==4 && yoffset==0
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;
          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;

          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);

          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;

          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);

          __m256i V_S_M1 =
              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
          __m256i V_S_M2 =
              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

          __m256i V_S_S1 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
          __m256i V_S_S2 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else if (yoffset == 4) {  // xoffset==4 && yoffset==4
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;
          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else {  // xoffset==4 && yoffset==1,2,3,5,6,7
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);

          __m256i V_S_M1 =
              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
          __m256i V_S_M2 =
              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

          __m256i V_S_S1 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
          __m256i V_S_S2 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);
    }
  } else if (yoffset == 0) {  // xoffset==1,2,3,5,6,7 && yoffset==0
    range = output_width / 16;
    if (output_height == 8) inc = 2;
    if (output_height == 4) inc = 4;
    for (j = 0; j < range * output_height * inc / 16; j++) {
      if (j % (output_height * inc / 16) == 0) {
        src_ptr = src_ptr_ref;
        src_ptr_ref += 16;
        dst_ptr = dst_ptr_ref;
        dst_ptr_ref += 16;

        curr = 0;
      }

      __m256i sum1 = _mm256_setzero_si256();
      __m256i sse1 = _mm256_setzero_si256();

      for (i = 0; i < 16 / inc; ++i) {
        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;
        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
        __m256i V_V_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
        __m256i V_V_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);

        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
        dst_ptr += dst_stride;
        __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST);
        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
      }

      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
      sum_long += _mm_extract_epi32(v_d, 0);
      sse_long += _mm_extract_epi32(v_d, 1);
    }

    rshift = get_msb(output_height) + get_msb(output_width);

  } else if (yoffset == 4) {  // xoffset==1,2,3,5,6,7 && yoffset==4

    range = output_width / 16;
    if (output_height == 8) inc = 2;
    if (output_height == 4) inc = 4;
    for (j = 0; j < range * output_height * inc / 16; j++) {
      if (j % (output_height * inc / 16) == 0) {
        src_ptr = src_ptr_ref;
        src_ptr_ref += 16;
        dst_ptr = dst_ptr_ref;
        dst_ptr_ref += 16;

        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;

        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);

        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);

        __m256i V_H_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
        __m256i V_H_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);

        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);

        curr = 0;
      }

      __m256i sum1 = _mm256_setzero_si256();
      __m256i sse1 = _mm256_setzero_si256();

      for (i = 0; i < 16 / inc; ++i) {
        prev = curr;
        curr = (curr == 0) ? 1 : 0;
        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;
        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
        __m256i V_V_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
        __m256i V_V_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);

        __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]);

        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
        dst_ptr += dst_stride;

        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
      }

      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
      sum_long += _mm_extract_epi32(v_d, 0);
      sse_long += _mm_extract_epi32(v_d, 1);
    }

    rshift = get_msb(output_height) + get_msb(output_width);

  } else {  // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
    range = output_width / 16;
    if (output_height == 8) inc = 2;
    if (output_height == 4) inc = 4;
    unsigned int nloop = 16 / inc;
    for (j = 0; j < range * output_height * inc / 16; j++) {
      if (j % (output_height * inc / 16) == 0) {
        src_ptr = src_ptr_ref;
        src_ptr_ref += 16;
        dst_ptr = dst_ptr_ref;
        dst_ptr_ref += 16;

        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;

        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);

        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);

        __m256i V_H_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
        __m256i V_H_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);

        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);

        curr = 0;
      }

      __m256i sum1 = _mm256_setzero_si256();
      __m256i sse1 = _mm256_setzero_si256();

      for (i = 0; i < nloop; ++i) {
        prev = curr;
        curr = !curr;
        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;
        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
        __m256i V_V_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
        __m256i V_V_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);

        __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
        __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

        __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
        __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

        __m256i V_S_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
        __m256i V_S_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

        __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
        dst_ptr += dst_stride;

        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
      }

      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
      sum_long += _mm_extract_epi32(v_d, 0);
      sse_long += _mm_extract_epi32(v_d, 1);
    }

    rshift = get_msb(output_height) + get_msb(output_width);
  }

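  // The accumulated statistics are for 10-bit samples; scale the SSE down by
  // 4 bits and the sum by 2 bits, matching the normalization done in
  // highbd_10_variance_avx2() below, so the variance is reported on the
  // 8-bit scale expected by the callers.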
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);

  int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);

  return (var > 0) ? var : 0;
}

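// Accumulates the sum and SSE of an 8x8 block of 16-bit pixels, packing two
// rows into each 256-bit register.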
static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   uint32_t *sse, int *sum) {
  __m256i v_sum_d = _mm256_setzero_si256();
  __m256i v_sse_d = _mm256_setzero_si256();
  for (int i = 0; i < 8; i += 2) {
    const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
    const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
    const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
    const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
    __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
    __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
    v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
    v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
    src += src_stride * 2;
    ref += ref_stride * 2;
  }
  __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
  __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
  __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
  *sum = _mm_extract_epi32(v_d, 0);
  *sse = _mm_extract_epi32(v_d, 1);
}

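// Accumulates the sum and SSE of a 16x16 block of 16-bit pixels, one full row
// per 256-bit load.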
static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
                                     const uint16_t *ref, int ref_stride,
                                     uint32_t *sse, int *sum) {
  __m256i v_sum_d = _mm256_setzero_si256();
  __m256i v_sse_d = _mm256_setzero_si256();
  const __m256i one = _mm256_set1_epi16(1);
  for (int i = 0; i < 16; ++i) {
    const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
    const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
    src += src_stride;
    ref += ref_stride;
  }
  __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
  *sum = _mm_extract_epi32(v_d, 0);
  *sse = _mm_extract_epi32(v_d, 1);
}

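// Tiles the w x h area with block_size x block_size blocks, sums the per-block
// results from var_fn, and rounds the totals down to the 8-bit scale
// (sum >> 2, sse >> 4) for 10-bit input.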
static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride, int w,
                                    int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int32_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}

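// Generates aom_highbd_10_variance<W>x<H>_avx2(). 'shift' is log2(w * h), so
// ((int64_t)sum * sum) >> shift is the squared-mean term of the variance.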
#define VAR_FN(w, h, block_size, shift)                                        \
  uint32_t aom_highbd_10_variance##w##x##h##_avx2(                             \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,                \
      int ref_stride, uint32_t *sse) {                                         \
    int sum;                                                                   \
    int64_t var;                                                               \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                            highbd_calc##block_size##x##block_size##var_avx2,  \
                            block_size);                                       \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);                   \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

VAR_FN(128, 128, 16, 14)
VAR_FN(128, 64, 16, 13)
VAR_FN(64, 128, 16, 13)
VAR_FN(64, 64, 16, 12)
VAR_FN(64, 32, 16, 11)
VAR_FN(32, 64, 16, 11)
VAR_FN(32, 32, 16, 10)
VAR_FN(32, 16, 16, 9)
VAR_FN(16, 32, 16, 9)
VAR_FN(16, 16, 16, 8)
VAR_FN(16, 8, 8, 7)
VAR_FN(8, 16, 8, 7)
VAR_FN(8, 8, 8, 6)

#if !CONFIG_REALTIME_ONLY
VAR_FN(16, 64, 16, 10)
VAR_FN(32, 8, 8, 8)
VAR_FN(64, 16, 16, 10)
VAR_FN(8, 32, 8, 8)
#endif  // !CONFIG_REALTIME_ONLY

#undef VAR_FN

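// 10-bit 16x16 MSE: same accumulation as the variance functions above, but
// only the bit-depth-normalized SSE is returned.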
unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                          highbd_calc16x16var_avx2, 16);
  return *sse;
}

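// Forward declarations of the SSE2 8-wide sub-pixel variance kernels. The
// AVX2 filter above works on 16-pixel-wide strips, so the wrapper below
// delegates 8x8 and 8x16 blocks to these SSE2 versions.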
#define SSE2_HEIGHT(H)                                                   \
  uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2(                 \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,   \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr);

SSE2_HEIGHT(8)
SSE2_HEIGHT(16)

#undef SSE2_HEIGHT

#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    if (W == 8 && H == 16)                                                   \
      return aom_highbd_10_sub_pixel_variance8x16_sse2(                      \
          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
    else if (W == 8 && H == 8)                                               \
      return aom_highbd_10_sub_pixel_variance8x8_sse2(                       \
          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
    else                                                                     \
      return aom_highbd_var_filter_block2d_bil_avx2(                         \
          src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
  }

HIGHBD_SUBPIX_VAR(128, 128)
HIGHBD_SUBPIX_VAR(128, 64)
HIGHBD_SUBPIX_VAR(64, 128)
HIGHBD_SUBPIX_VAR(64, 64)
HIGHBD_SUBPIX_VAR(64, 32)
HIGHBD_SUBPIX_VAR(32, 64)
HIGHBD_SUBPIX_VAR(32, 32)
HIGHBD_SUBPIX_VAR(32, 16)
HIGHBD_SUBPIX_VAR(16, 32)
HIGHBD_SUBPIX_VAR(16, 16)
HIGHBD_SUBPIX_VAR(16, 8)
HIGHBD_SUBPIX_VAR(8, 16)
HIGHBD_SUBPIX_VAR(8, 8)

#undef HIGHBD_SUBPIX_VAR

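// Sum of squared differences for a 4-wide block of 16-bit samples: four rows
// are gathered into one 256-bit register per iteration, and the squared
// differences are widened to 64 bits before being accumulated.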
static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                          uint16_t *src, int sstride, int h) {
  uint64_t sum = 0;
  __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
  __m256i src0_8x16, src1_8x16, src_16x16;
  __m256i dst0_8x16, dst1_8x16, dst_16x16;
  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
  __m256i sub_result;
  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  for (int i = 0; i < h; i += 4) {
    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride]));
    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride]));
    dst0_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
    dst1_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);

    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
    src0_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
    src1_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);

    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));

    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);

    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);

    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);

    square_result = _mm256_add_epi64(
        square_result,
        _mm256_add_epi64(
            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
            res3_4x64));
  }
  const __m128i sum_2x64 =
      _mm_add_epi64(_mm256_castsi256_si128(square_result),
                    _mm256_extracti128_si256(square_result, 1));
  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
  xx_storel_64(&sum, sum_1x64);
  return sum;
}

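// Same scheme as mse_4xh_16bit_highbd_avx2() for 8-wide blocks, processing two
// rows per iteration.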
static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                          uint16_t *src, int sstride, int h) {
  uint64_t sum = 0;
  __m256i src0_8x16, src1_8x16, src_16x16;
  __m256i dst0_8x16, dst1_8x16, dst_16x16;
  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
  __m256i sub_result;
  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());

  for (int i = 0; i < h; i += 2) {
    dst0_8x16 =
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride]));
    dst1_8x16 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride]));
    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);

    src0_8x16 =
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
    src1_8x16 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);

    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));

    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);

    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);

    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);

    square_result = _mm256_add_epi64(
        square_result,
        _mm256_add_epi64(
            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
            res3_4x64));
  }

  const __m128i sum_2x64 =
      _mm_add_epi64(_mm256_castsi256_si128(square_result),
                    _mm256_extracti128_si256(square_result, 1));
  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
  xx_storel_64(&sum, sum_1x64);
  return sum;
}

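// Dispatches to the 4-wide or 8-wide kernel. Both buffers hold 16-bit
// (high-bitdepth) samples and the return value is the raw sum of squared
// differences over the w x h block.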
uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                       uint16_t *src, int sstride, int w,
                                       int h) {
  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
         "w must be 4 or 8 and h must be 4 or 8");
  switch (w) {
    case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
    case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
    default: assert(0 && "unsupported width"); return -1;
  }
}