/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/convolve.h"
#include "av1/common/resize.h"
#include "aom_dsp/x86/synonyms.h"

// Note: If the crop width is not a multiple of 4, then, unlike the C version,
// this function will overwrite some of the padding on the right hand side of
// the frame. This padding appears to be trashed anyway, so this should not
// affect the running of the decoder.
void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const int16_t *x_filters, int x0_qn,
                                  int x_step_qn) {
  assert(UPSCALE_NORMATIVE_TAPS == 8);

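  // Back the source pointer up by (UPSCALE_NORMATIVE_TAPS / 2 - 1) = 3 pixels
  // so that src points at the left-most tap of the 8-tap window for each
  // output pixel.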
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;

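  // round_add is the offset that makes the final right-shift by FILTER_BITS
  // round to nearest rather than truncate.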
  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  const __m128i zero = _mm_setzero_si128();

  const uint8_t *src_y;
  uint8_t *dst_y;
  int x_qn = x0_qn;
  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
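    // x_qn tracks the source position in RS_SCALE_SUBPEL_BITS fixed-point
    // precision. The fractional part of each of the four positions selects
    // which normative upscale filter to apply to that output pixel.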
    const int x_filter_idx0 =
        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx1 =
        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx2 =
        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx3 =
        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;

    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
    assert(x_filter_idx3 <= RS_SUBPEL_MASK);

    const int16_t *const x_filter0 =
        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter1 =
        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter2 =
        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter3 =
        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];

    const __m128i fil0_16 = xx_loadu_128(x_filter0);
    const __m128i fil1_16 = xx_loadu_128(x_filter1);
    const __m128i fil2_16 = xx_loadu_128(x_filter2);
    const __m128i fil3_16 = xx_loadu_128(x_filter3);

    src_y = src;
    dst_y = dst;
    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
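      // The integer part of each position gives the left-most source pixel of
      // the corresponding 8-tap window.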
      const uint8_t *const src_x0 =
          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint8_t *const src_x1 =
          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint8_t *const src_x2 =
          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint8_t *const src_x3 =
          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];

      // Load up the source data. This is 8-bit input data, so each load
      // gets 8 pixels.
      const __m128i src0_8 = xx_loadl_64(src_x0);
      const __m128i src1_8 = xx_loadl_64(src_x1);
      const __m128i src2_8 = xx_loadl_64(src_x2);
      const __m128i src3_8 = xx_loadl_64(src_x3);

      // Now zero-extend up to 16-bit precision, i.e.
      // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
      const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
      const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
      const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
      const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);

      // Multiply by filter coefficients (results in a 32-bit value),
      // and add adjacent pairs, i.e.
      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);

      // Reduce horizontally and add, i.e.
      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);

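      // A second round of horizontal adds leaves one 32-bit filtered sum per
      // output pixel: [ sum3 sum2 sum1 sum0 ]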
      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);

      // Divide down by (1 << FILTER_BITS), rounding to nearest.
      const __m128i shifted_32 =
          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);

      // Pack 32-bit values into 16-bit values, i.e.
      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);

      // Pack 16-bit values into 8-bit values, i.e.
      // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
      // -> [ 0 0 0 0 0 0 DC BA ]
      const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);

      // Write to the output
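      // (Four pixels are always stored, which is why the right-hand padding
      // can be overwritten when the crop width is not a multiple of 4; see the
      // note above the function.)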
      xx_storel_32(&dst_y[x], shifted_8);
    }
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
// Note: If the crop width is not a multiple of 4, then, unlike the C version,
// this function will overwrite some of the padding on the right hand side of
// the frame. This padding appears to be trashed anyway, so this should not
// affect the running of the decoder.
void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
                                         uint16_t *dst, int dst_stride, int w,
                                         int h, const int16_t *x_filters,
                                         int x0_qn, int x_step_qn, int bd) {
  assert(UPSCALE_NORMATIVE_TAPS == 8);
  assert(bd == 8 || bd == 10 || bd == 12);

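  // As in the low bit-depth version, back the source pointer up by
  // (UPSCALE_NORMATIVE_TAPS / 2 - 1) = 3 pixels so that src points at the
  // left-most tap of each 8-tap window.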
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;

  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  const __m128i zero = _mm_setzero_si128();
  const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);

  const uint16_t *src_y;
  uint16_t *dst_y;
  int x_qn = x0_qn;
  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
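    // As above, the fractional part of each of the four source positions
    // selects which normative upscale filter to apply to that output pixel.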
    const int x_filter_idx0 =
        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx1 =
        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx2 =
        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx3 =
        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;

    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
    assert(x_filter_idx3 <= RS_SUBPEL_MASK);

    const int16_t *const x_filter0 =
        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter1 =
        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter2 =
        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter3 =
        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];

    const __m128i fil0_16 = xx_loadu_128(x_filter0);
    const __m128i fil1_16 = xx_loadu_128(x_filter1);
    const __m128i fil2_16 = xx_loadu_128(x_filter2);
    const __m128i fil3_16 = xx_loadu_128(x_filter3);

    src_y = src;
    dst_y = dst;
    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
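      // The integer part of each position gives the left-most source pixel of
      // the corresponding 8-tap window.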
      const uint16_t *const src_x0 =
          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint16_t *const src_x1 =
          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint16_t *const src_x2 =
          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint16_t *const src_x3 =
          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];

      // Load up the source data. This is 16-bit input data, so each load
      // gets 8 pixels.
      const __m128i src0_16 = xx_loadu_128(src_x0);
      const __m128i src1_16 = xx_loadu_128(src_x1);
      const __m128i src2_16 = xx_loadu_128(src_x2);
      const __m128i src3_16 = xx_loadu_128(src_x3);

      // Multiply by filter coefficients (results in a 32-bit value),
      // and add adjacent pairs, i.e.
      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);

      // Reduce horizontally and add, i.e.
      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);

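      // A second round of horizontal adds leaves one 32-bit filtered sum per
      // output pixel: [ sum3 sum2 sum1 sum0 ]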
      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);

      // Divide down by (1 << FILTER_BITS), rounding to nearest.
      const __m128i shifted_32 =
          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);

      // Pack 32-bit values into 16-bit values, i.e.
      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);

      // Clip the values at (1 << bd) - 1
      const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);

      // Write to the output
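      // (xx_storel_64 stores four 16-bit pixels, so the right-hand padding can
      // be overwritten when the crop width is not a multiple of 4; see the
      // note above the function.)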
      xx_storel_64(&dst_y[x], clipped_16);
    }
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH