/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker #include <emmintrin.h>
13*77c1e3ccSAndroid Build Coastguard Worker
14*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/cfl.h"
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
16*77c1e3ccSAndroid Build Coastguard Worker
/* Horizontally reduces the four 32-bit lanes of l0.
 *
 * Returns a vector in which every 32-bit lane holds
 * l0[0] + l0[1] + l0[2] + l0[3], using the wrapping 32-bit addition of
 * _mm_add_epi32.
 */
static inline __m128i fill_sum_epi32(__m128i l0) {
  // Swap the two 64-bit halves and add: lanes become {0+2, 1+3, 0+2, 1+3}.
  l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
  // Swap adjacent 32-bit lanes and add: every lane now holds the full sum.
  return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
}
21*77c1e3ccSAndroid Build Coastguard Worker
subtract_average_sse2(const uint16_t * src_ptr,int16_t * dst_ptr,int width,int height,int round_offset,int num_pel_log2)22*77c1e3ccSAndroid Build Coastguard Worker static inline void subtract_average_sse2(const uint16_t *src_ptr,
23*77c1e3ccSAndroid Build Coastguard Worker int16_t *dst_ptr, int width,
24*77c1e3ccSAndroid Build Coastguard Worker int height, int round_offset,
25*77c1e3ccSAndroid Build Coastguard Worker int num_pel_log2) {
26*77c1e3ccSAndroid Build Coastguard Worker const __m128i zeros = _mm_setzero_si128();
27*77c1e3ccSAndroid Build Coastguard Worker const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
28*77c1e3ccSAndroid Build Coastguard Worker const __m128i *src = (__m128i *)src_ptr;
29*77c1e3ccSAndroid Build Coastguard Worker const __m128i *const end = src + height * CFL_BUF_LINE_I128;
30*77c1e3ccSAndroid Build Coastguard Worker const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
31*77c1e3ccSAndroid Build Coastguard Worker
32*77c1e3ccSAndroid Build Coastguard Worker __m128i sum = zeros;
33*77c1e3ccSAndroid Build Coastguard Worker do {
34*77c1e3ccSAndroid Build Coastguard Worker __m128i l0;
35*77c1e3ccSAndroid Build Coastguard Worker if (width == 4) {
36*77c1e3ccSAndroid Build Coastguard Worker l0 = _mm_add_epi16(_mm_loadl_epi64(src),
37*77c1e3ccSAndroid Build Coastguard Worker _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
38*77c1e3ccSAndroid Build Coastguard Worker __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
39*77c1e3ccSAndroid Build Coastguard Worker _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
40*77c1e3ccSAndroid Build Coastguard Worker sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
41*77c1e3ccSAndroid Build Coastguard Worker _mm_unpacklo_epi16(l1, zeros)));
42*77c1e3ccSAndroid Build Coastguard Worker } else {
43*77c1e3ccSAndroid Build Coastguard Worker if (width == 8) {
44*77c1e3ccSAndroid Build Coastguard Worker l0 = _mm_add_epi16(_mm_loadu_si128(src),
45*77c1e3ccSAndroid Build Coastguard Worker _mm_loadu_si128(src + CFL_BUF_LINE_I128));
46*77c1e3ccSAndroid Build Coastguard Worker } else {
47*77c1e3ccSAndroid Build Coastguard Worker l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
48*77c1e3ccSAndroid Build Coastguard Worker }
49*77c1e3ccSAndroid Build Coastguard Worker sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
50*77c1e3ccSAndroid Build Coastguard Worker _mm_unpackhi_epi16(l0, zeros)));
51*77c1e3ccSAndroid Build Coastguard Worker if (width == 32) {
52*77c1e3ccSAndroid Build Coastguard Worker l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
53*77c1e3ccSAndroid Build Coastguard Worker sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
54*77c1e3ccSAndroid Build Coastguard Worker _mm_unpackhi_epi16(l0, zeros)));
55*77c1e3ccSAndroid Build Coastguard Worker }
56*77c1e3ccSAndroid Build Coastguard Worker }
57*77c1e3ccSAndroid Build Coastguard Worker src += step;
58*77c1e3ccSAndroid Build Coastguard Worker } while (src < end);
59*77c1e3ccSAndroid Build Coastguard Worker
60*77c1e3ccSAndroid Build Coastguard Worker sum = fill_sum_epi32(sum);
61*77c1e3ccSAndroid Build Coastguard Worker
62*77c1e3ccSAndroid Build Coastguard Worker __m128i avg_epi16 =
63*77c1e3ccSAndroid Build Coastguard Worker _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
64*77c1e3ccSAndroid Build Coastguard Worker avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
65*77c1e3ccSAndroid Build Coastguard Worker
66*77c1e3ccSAndroid Build Coastguard Worker src = (__m128i *)src_ptr;
67*77c1e3ccSAndroid Build Coastguard Worker __m128i *dst = (__m128i *)dst_ptr;
68*77c1e3ccSAndroid Build Coastguard Worker do {
69*77c1e3ccSAndroid Build Coastguard Worker if (width == 4) {
70*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
71*77c1e3ccSAndroid Build Coastguard Worker } else {
72*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
73*77c1e3ccSAndroid Build Coastguard Worker if (width > 8) {
74*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128(dst + 1,
75*77c1e3ccSAndroid Build Coastguard Worker _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
76*77c1e3ccSAndroid Build Coastguard Worker if (width == 32) {
77*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128(dst + 2,
78*77c1e3ccSAndroid Build Coastguard Worker _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
79*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128(dst + 3,
80*77c1e3ccSAndroid Build Coastguard Worker _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
81*77c1e3ccSAndroid Build Coastguard Worker }
82*77c1e3ccSAndroid Build Coastguard Worker }
83*77c1e3ccSAndroid Build Coastguard Worker }
84*77c1e3ccSAndroid Build Coastguard Worker src += CFL_BUF_LINE_I128;
85*77c1e3ccSAndroid Build Coastguard Worker dst += CFL_BUF_LINE_I128;
86*77c1e3ccSAndroid Build Coastguard Worker } while (src < end);
87*77c1e3ccSAndroid Build Coastguard Worker }
88*77c1e3ccSAndroid Build Coastguard Worker
89*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_FN(sse2)
90