xref: /aosp_15_r20/external/libaom/av1/encoder/x86/reconinter_enc_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <emmintrin.h>  // SSE2
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 #include "config/aom_scale_rtcd.h"
18 
19 #include "aom/aom_integer.h"
20 #include "aom_dsp/blend.h"
21 #include "aom_dsp/x86/mem_sse2.h"
22 #include "aom_dsp/x86/synonyms.h"
23 
24 #include "av1/common/av1_common_int.h"
25 #include "av1/common/blockd.h"
26 #include "av1/common/mvref_common.h"
27 #include "av1/common/obmc.h"
28 #include "av1/common/reconinter.h"
29 #include "av1/common/reconintra.h"
30 #include "av1/encoder/reconinter_enc.h"
31 
// Builds the plane-0 predictor for one width x height block into comp_pred.
// comp_pred is written packed, i.e. with a row stride equal to width.
//
// If xd is non-NULL and the reference scale factors report scaling, the work
// is delegated to av1_enc_build_one_inter_predictor() with the
// EIGHTTAP_REGULAR filter and the function returns; subpel_x_q3, subpel_y_q3,
// ref and ref_stride are not used on that path.
//
// Otherwise the prediction is read from ref (row stride ref_stride):
//   - both subpel offsets zero: straight copy of the block,
//   - exactly one offset non-zero: a single horizontal or vertical convolve,
//   - both non-zero: horizontal convolve into a temporary buffer followed by
//     a vertical convolve into comp_pred.
// subpel_search selects the filter via av1_get_filter(); the kernel is looked
// up with the subpel offset shifted left by one (presumably converting 1/8-pel
// units to the 1/16-pel kernel table index -- confirm against the filter
// tables).
void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
                             int mi_row, int mi_col, const MV *const mv,
                             uint8_t *comp_pred, int width, int height,
                             int subpel_x_q3, int subpel_y_q3,
                             const uint8_t *ref, int ref_stride,
                             int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      // Intra block copy predicts from the frame being reconstructed.
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);
  // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
  // 2-tap yet.
  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;

  if (!subpel_x_q3 && !subpel_y_q3) {
    // Full-pel position: copy the block, packing rows into comp_pred.
    if (width >= 16) {
      int i;
      assert(!(width & 15));
      /*Read 16 pixels one row at a time.*/
      for (i = 0; i < height; i++) {
        int j;
        for (j = 0; j < width; j += 16) {
          xx_storeu_128(comp_pred, xx_loadu_128(ref));
          comp_pred += 16;
          ref += 16;
        }
        ref += ref_stride - width;
      }
    } else if (width >= 8) {
      int i;
      assert(!(width & 7));
      assert(!(height & 1));
      /*Read 8 pixels two rows at a time.*/
      for (i = 0; i < height; i += 2) {
        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
        comp_pred += 16;
        ref += 2 * ref_stride;
      }
    } else {
      int i;
      assert(!(width & 3));
      assert(!(height & 3));
      /*Read 4 pixels four rows at a time.*/
      // Each iteration consumes four source rows and emits 16 packed bytes,
      // so the row counter must advance by four. Stepping by one would run
      // four times too many iterations and read/write past the block.
      for (i = 0; i < height; i += 4) {
        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
        // Only the low 32 bits of each loaded row are kept.
        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
                                               _mm_unpacklo_epi32(row2, row3));
        xx_storeu_128(comp_pred, reg);
        comp_pred += 16;
        ref += 4 * ref_stride;
      }
    }
  } else if (!subpel_y_q3) {
    // Horizontal-only sub-pixel offset.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
                        width, height);
  } else if (!subpel_x_q3) {
    // Vertical-only sub-pixel offset.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
                       width, height);
  } else {
    // Both offsets non-zero: horizontal pass into temp (row stride
    // MAX_SB_SIZE), then vertical pass from temp into comp_pred.
    DECLARE_ALIGNED(16, uint8_t,
                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    // Start the horizontal pass (filter_taps / 2 - 1) rows above the block so
    // the vertical pass has the rows it needs above the first output row.
    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
                                    ? temp + (filter_taps >> 1) * MAX_SB_SIZE
                                    : temp;
    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
    int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
                        kernel_x, 16, NULL, -1, width, intermediate_height);
    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
                       kernel_y, 16, width, height);
  }
}
149 
150 #if CONFIG_AV1_HIGHBITDEPTH
highbd_compute_dist_wtd_comp_avg(__m128i * p0,__m128i * p1,const __m128i * w0,const __m128i * w1,const __m128i * r,void * const result)151 static inline void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
152                                                     const __m128i *w0,
153                                                     const __m128i *w1,
154                                                     const __m128i *r,
155                                                     void *const result) {
156   assert(DIST_PRECISION_BITS <= 4);
157   __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
158   __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
159   __m128i sum = _mm_adds_epu16(mult0, mult1);
160   __m128i round = _mm_adds_epu16(sum, *r);
161   __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
162 
163   xx_storeu_128(result, shift);
164 }
165 
// High-bitdepth counterpart of aom_upsampled_pred_sse2(): builds the plane-0
// predictor for one width x height block into comp_pred8, written packed
// (row stride equal to width).
//
// ref8 and comp_pred8 are byte-pointer handles that are converted with
// CONVERT_TO_SHORTPTR() before 16-bit samples are accessed. bd is forwarded to
// the high-bitdepth convolve functions.
//
// If xd is non-NULL and the reference scale factors report scaling, the work
// is delegated to av1_enc_build_one_inter_predictor() with the
// EIGHTTAP_REGULAR filter and the function returns. Otherwise:
//   - both subpel offsets zero: straight copy of the block,
//   - exactly one offset non-zero: a single horizontal or vertical convolve,
//   - both non-zero: horizontal convolve into a temporary buffer followed by
//     a vertical convolve into comp_pred8.
void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
                                    const struct AV1Common *const cm,
                                    int mi_row, int mi_col, const MV *const mv,
                                    uint8_t *comp_pred8, int width, int height,
                                    int subpel_x_q3, int subpel_y_q3,
                                    const uint8_t *ref8, int ref_stride, int bd,
                                    int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      // Intra block copy predicts from the frame being reconstructed.
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);
  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
  if (!subpel_x_q3 && !subpel_y_q3) {
    // Full-pel position: copy the block, packing rows into comp_pred.
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
    if (width >= 8) {
      int i;
      assert(!(width & 7));
      /*Read 8 pixels one row at a time.*/
      for (i = 0; i < height; i++) {
        int j;
        for (j = 0; j < width; j += 8) {
          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
          _mm_storeu_si128((__m128i *)comp_pred, s0);
          comp_pred += 8;
          ref += 8;
        }
        ref += ref_stride - width;
      }
    } else {
      int i;
      assert(!(width & 3));
      /*Read 4 pixels two rows at a time.*/
      for (i = 0; i < height; i += 2) {
        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
        _mm_storeu_si128((__m128i *)comp_pred, t0);
        comp_pred += 8;
        ref += 2 * ref_stride;
      }
    }
  } else if (!subpel_y_q3) {
    // Horizontal-only sub-pixel offset.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
                               NULL, -1, width, height, bd);
  } else if (!subpel_x_q3) {
    // Vertical-only sub-pixel offset.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
                              kernel, 16, width, height, bd);
  } else {
    // Both offsets non-zero: horizontal pass into temp (row stride
    // MAX_SB_SIZE), then vertical pass from temp into comp_pred8.
    DECLARE_ALIGNED(16, uint16_t,
                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    // Start the horizontal pass (filter_taps / 2 - 1) rows above the block so
    // the vertical pass has the rows it needs above the first output row.
    const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
    uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
                                     ? temp + (filter_taps >> 1) * MAX_SB_SIZE
                                     : temp;
    uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
    // temp holds (MAX_SB_SIZE + 16) + 16 rows; the horizontal pass writes
    // intermediate_height rows beginning at temp_start_horiz, so bound the
    // last written row against the buffer that is actually declared here.
    assert((int)(temp_start_horiz - temp) / MAX_SB_SIZE + intermediate_height <=
           (MAX_SB_SIZE + 16) + 16);
    aom_highbd_convolve8_horiz(
        ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
        MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
    aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
                              comp_pred8, width, NULL, -1, kernel_y, 16, width,
                              height, bd);
  }
}
270 
// Generates the high-bitdepth upsampled prediction into comp_pred8 via
// aom_highbd_upsampled_pred(), then replaces it in place with the rounded
// average (_mm_avg_epu16) of that prediction and pred8. Both buffers are
// walked as packed runs of width * height 16-bit samples.
void aom_highbd_comp_avg_upsampled_pred_sse2(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, int subpel_search) {
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);

  uint16_t *const dst = CONVERT_TO_SHORTPTR(comp_pred8);
  const uint16_t *const src = CONVERT_TO_SHORTPTR(pred8);

  /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
  assert(!(width * height & 7));
  const int num_vectors = width * height >> 3;

  // Eight 16-bit samples per 128-bit vector.
  for (int v = 0; v < num_vectors; ++v) {
    __m128i *const out = (__m128i *)(dst + 8 * v);
    const __m128i upsampled = _mm_loadu_si128((const __m128i *)out);
    const __m128i other = _mm_loadu_si128((const __m128i *)(src + 8 * v));
    _mm_storeu_si128(out, _mm_avg_epu16(upsampled, other));
  }
}
292 
// Generates the high-bitdepth upsampled prediction into comp_pred8 via
// aom_highbd_upsampled_pred(), then blends it in place with pred8 using the
// weights from jcp_param: the upsampled prediction is weighted by fwd_offset
// and pred8 by bck_offset, with a rounding term of half of
// (1 << DIST_PRECISION_BITS). Both buffers are walked as packed runs of
// width * height 16-bit samples.
void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
    int subpel_search) {
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);

  assert(!(width * height & 7));
  const int num_vectors = width * height >> 3;

  // Broadcast the two weights and the rounding constant to all eight lanes.
  const __m128i fwd_weight = _mm_set1_epi16((int16_t)jcp_param->fwd_offset);
  const __m128i bck_weight = _mm_set1_epi16((int16_t)jcp_param->bck_offset);
  const __m128i rounding =
      _mm_set1_epi16((int16_t)((1 << DIST_PRECISION_BITS) >> 1));

  uint16_t *dst = CONVERT_TO_SHORTPTR(comp_pred8);
  uint16_t *src = CONVERT_TO_SHORTPTR(pred8);

  // Eight 16-bit samples per 128-bit vector; the result overwrites dst.
  for (int v = 0; v < num_vectors; ++v, dst += 8, src += 8) {
    __m128i upsampled = xx_loadu_128(dst);
    __m128i other = xx_loadu_128(src);

    highbd_compute_dist_wtd_comp_avg(&upsampled, &other, &fwd_weight,
                                     &bck_weight, &rounding, dst);
  }
}
326 #endif  // CONFIG_AV1_HIGHBITDEPTH
327 
// Generates the upsampled prediction into comp_pred via aom_upsampled_pred(),
// then replaces it in place with the rounded average (_mm_avg_epu8) of that
// prediction and pred. Both buffers are walked as packed runs of
// width * height bytes.
void aom_comp_avg_upsampled_pred_sse2(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
    int ref_stride, int subpel_search) {
  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);

  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
  assert(!(width * height & 15));
  const int num_vectors = width * height >> 4;

  // Sixteen 8-bit samples per 128-bit vector.
  for (int v = 0; v < num_vectors; ++v) {
    uint8_t *const out = comp_pred + 16 * v;
    const __m128i upsampled = xx_loadu_128(out);
    const __m128i other = xx_loadu_128(pred + 16 * v);
    xx_storeu_128(out, _mm_avg_epu8(upsampled, other));
  }
}
348