xref: /aosp_15_r20/external/libaom/aom_dsp/intrapred.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <math.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/intrapred_common.h"
20 #include "aom_mem/aom_mem.h"
21 #include "aom_ports/bitops.h"
22 
// Vertical prediction: each of the 'bh' output rows is a copy of the first
// 'bw' pixels of 'above'. 'left' is not used.
static inline void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  uint8_t *row = dst;
  for (int rows_left = bh; rows_left > 0; --rows_left) {
    memcpy(row, above, bw);
    row += stride;
  }
}
33 
// Horizontal prediction: output row r is filled with 'bw' copies of left[r].
// 'above' is not used.
static inline void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  for (int row = 0; row < bh; ++row, dst += stride) {
    memset(dst, left[row], bw);
  }
}
44 
// Returns the absolute difference |a - b|.
static inline int abs_diff(int a, int b) {
  if (a > b) return a - b;
  return b - a;
}
46 
// Paeth selection for one pixel: forms the estimate top + left - top_left and
// returns whichever of left, top and top_left is nearest to it. Ties are
// resolved in the order left, then top, then top_left.
static inline uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
                                              uint16_t top_left) {
  const int estimate = top + left - top_left;
  const int dist_left = abs_diff(estimate, left);
  const int dist_top = abs_diff(estimate, top);
  const int dist_top_left = abs_diff(estimate, top_left);

  if (dist_left <= dist_top && dist_left <= dist_top_left) return left;
  if (dist_top <= dist_top_left) return top;
  return top_left;
}
59 
// Paeth prediction for a bw x bh block of 8-bit pixels. Every output pixel is
// chosen by paeth_predictor_single() from its left neighbour (left[row]), its
// top neighbour (above[col]) and the shared top-left corner above[-1].
static inline void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                   int bh, const uint8_t *above,
                                   const uint8_t *left) {
  const uint8_t corner = above[-1];

  for (int row = 0; row < bh; ++row, dst += stride) {
    for (int col = 0; col < bw; ++col) {
      dst[col] = (uint8_t)paeth_predictor_single(left[row], above[col], corner);
    }
  }
}
72 
// Some basic checks on weights for smooth predictor.
//
// weights_w / weights_h: weight arrays indexed along the width / height.
// weights_scale:         value the weights are scaled against.
// pred_scale:            must be below 31 so that the predictor sum cannot
//                        overflow.
//
// NOTE: in addition to its arguments, this macro reads the variables 'bw' and
// 'bh' from the scope in which it is expanded.
//
// The checks are wrapped in do { } while (0) so that the macro behaves as a
// single statement (e.g. under an unbraced 'if'), and every argument is
// parenthesized so that arbitrary expressions can be passed safely.
#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
                                 pred_scale)                          \
  do {                                                                \
    assert((weights_w)[0] < (weights_scale));                         \
    assert((weights_h)[0] < (weights_scale));                         \
    assert((weights_scale) - (weights_w)[bw - 1] < (weights_scale));  \
    assert((weights_scale) - (weights_h)[bh - 1] < (weights_scale));  \
    /* Ensures no overflow when calculating predictor. */             \
    assert((pred_scale) < 31);                                        \
  } while (0)
81 
// Rounding right shift: adds half of 2^bits to 'value' and then shifts the
// result right by 'bits'. 'bits' must be at least 1, since the rounding term
// is formed by shifting by (bits - 1).
#define divide_round(value, bits) \
  (((value) + (1 << ((bits) - 1))) >> (bits))
83 
smooth_predictor(uint8_t * dst,ptrdiff_t stride,int bw,int bh,const uint8_t * above,const uint8_t * left)84 static inline void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
85                                     int bh, const uint8_t *above,
86                                     const uint8_t *left) {
87   const uint8_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
88   const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
89   const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
90   const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
91   // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
92   const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
93   const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
94   sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
95                            log2_scale + sizeof(*dst));
96   int r;
97   for (r = 0; r < bh; ++r) {
98     int c;
99     for (c = 0; c < bw; ++c) {
100       const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
101       const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
102                                   sm_weights_w[c], scale - sm_weights_w[c] };
103       uint32_t this_pred = 0;
104       int i;
105       assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
106       for (i = 0; i < 4; ++i) {
107         this_pred += weights[i] * pixels[i];
108       }
109       dst[c] = divide_round(this_pred, log2_scale);
110     }
111     dst += stride;
112   }
113 }
114 
smooth_v_predictor(uint8_t * dst,ptrdiff_t stride,int bw,int bh,const uint8_t * above,const uint8_t * left)115 static inline void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
116                                       int bh, const uint8_t *above,
117                                       const uint8_t *left) {
118   const uint8_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
119   const uint8_t *const sm_weights = smooth_weights + bh - 4;
120   // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
121   const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
122   const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
123   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
124                            log2_scale + sizeof(*dst));
125 
126   int r;
127   for (r = 0; r < bh; r++) {
128     int c;
129     for (c = 0; c < bw; ++c) {
130       const uint8_t pixels[] = { above[c], below_pred };
131       const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
132       uint32_t this_pred = 0;
133       assert(scale >= sm_weights[r]);
134       int i;
135       for (i = 0; i < 2; ++i) {
136         this_pred += weights[i] * pixels[i];
137       }
138       dst[c] = divide_round(this_pred, log2_scale);
139     }
140     dst += stride;
141   }
142 }
143 
smooth_h_predictor(uint8_t * dst,ptrdiff_t stride,int bw,int bh,const uint8_t * above,const uint8_t * left)144 static inline void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
145                                       int bh, const uint8_t *above,
146                                       const uint8_t *left) {
147   const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
148   const uint8_t *const sm_weights = smooth_weights + bw - 4;
149   // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
150   const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
151   const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
152   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
153                            log2_scale + sizeof(*dst));
154 
155   int r;
156   for (r = 0; r < bh; r++) {
157     int c;
158     for (c = 0; c < bw; ++c) {
159       const uint8_t pixels[] = { left[r], right_pred };
160       const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
161       uint32_t this_pred = 0;
162       assert(scale >= sm_weights[c]);
163       int i;
164       for (i = 0; i < 2; ++i) {
165         this_pred += weights[i] * pixels[i];
166       }
167       dst[c] = divide_round(this_pred, log2_scale);
168     }
169     dst += stride;
170   }
171 }
172 
// Fills the bw x bh block with the constant value 128. Neither 'above' nor
// 'left' is used.
static inline void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                    int bh, const uint8_t *above,
                                    const uint8_t *left) {
  (void)above;
  (void)left;

  uint8_t *row = dst;
  for (int rows_left = bh; rows_left > 0; --rows_left, row += stride) {
    memset(row, 128, bw);
  }
}
185 
// Fills the bw x bh block with the rounded mean of the 'bh' pixels in 'left'.
// 'above' is not used.
static inline void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                     int bh, const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  int total = 0;
  for (int i = 0; i < bh; ++i) {
    total += left[i];
  }
  // Adding half of the divisor rounds the mean to nearest.
  const int dc = (total + (bh >> 1)) / bh;

  for (int row = 0; row < bh; ++row, dst += stride) {
    memset(dst, dc, bw);
  }
}
200 
// Fills the bw x bh block with the rounded mean of the 'bw' pixels in
// 'above'. 'left' is not used.
static inline void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                    int bh, const uint8_t *above,
                                    const uint8_t *left) {
  (void)left;

  int total = 0;
  for (int i = 0; i < bw; ++i) {
    total += above[i];
  }
  // Adding half of the divisor rounds the mean to nearest.
  const int dc = (total + (bw >> 1)) / bw;

  for (int row = 0; row < bh; ++row, dst += stride) {
    memset(dst, dc, bw);
  }
}
215 
// Fills the bw x bh block with the rounded mean of the 'bw' pixels in 'above'
// together with the 'bh' pixels in 'left'.
static inline void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                const uint8_t *above, const uint8_t *left) {
  const int count = bw + bh;
  int total = 0;

  for (int i = 0; i < bw; ++i) total += above[i];
  for (int i = 0; i < bh; ++i) total += left[i];

  // Adding half of the divisor rounds the mean to nearest.
  const int dc = (total + (count >> 1)) / count;

  for (int row = 0; row < bh; ++row, dst += stride) {
    memset(dst, dc, bw);
  }
}
235 
// Computes ((num >> shift1) * multiplier) >> shift2, i.e. a division carried
// out as a shift, a multiplication and a second shift.
static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier, int shift2) {
  const int prescaled = num >> shift1;
  const int product = prescaled * multiplier;
  return product >> shift2;
}
241 
242 // The constants (multiplier and shifts) for a given block size are obtained
243 // as follows:
244 // - Let sum_w_h =  block width + block height.
245 // - Shift 'sum_w_h' right until we reach an odd number. Let the number of
246 // shifts for that block size be called 'shift1' (see the parameter in
247 // dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2
248 // possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
249 // block].
250 // - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
251 // using the "Algorithm 1" in:
252 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
253 // by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
254 // shift will be 16, regardless of the block size.
255 
256 // Note: For low bitdepth, assembly code may be optimized by using smaller
257 // constants for smaller block sizes, where the range of the 'sum' is
258 // restricted to fewer bits.
259 
// Reciprocal of 3 scaled by 2^16 and rounded up: ceil(65536 / 3).
#define DC_MULTIPLIER_1X2 0x5556
// Reciprocal of 5 scaled by 2^16 and rounded up: ceil(65536 / 5).
#define DC_MULTIPLIER_1X4 0x3334

// Number of bits removed by the second shift, after the multiplication.
#define DC_SHIFT2 16
264 
dc_predictor_rect(uint8_t * dst,ptrdiff_t stride,int bw,int bh,const uint8_t * above,const uint8_t * left,int shift1,int multiplier)265 static inline void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
266                                      int bh, const uint8_t *above,
267                                      const uint8_t *left, int shift1,
268                                      int multiplier) {
269   int sum = 0;
270 
271   for (int i = 0; i < bw; i++) {
272     sum += above[i];
273   }
274   for (int i = 0; i < bh; i++) {
275     sum += left[i];
276   }
277 
278   const int expected_dc = divide_using_multiply_shift(
279       sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
280   assert(expected_dc < (1 << 8));
281 
282   for (int r = 0; r < bh; r++) {
283     memset(dst, expected_dc, bw);
284     dst += stride;
285   }
286 }
287 
288 #undef DC_SHIFT2
289 
aom_dc_predictor_4x8_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)290 void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
291                             const uint8_t *above, const uint8_t *left) {
292   dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
293 }
294 
aom_dc_predictor_8x4_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)295 void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
296                             const uint8_t *above, const uint8_t *left) {
297   dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
298 }
299 
300 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_4x16_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)301 void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
302                              const uint8_t *above, const uint8_t *left) {
303   dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
304 }
305 
aom_dc_predictor_16x4_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)306 void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
307                              const uint8_t *above, const uint8_t *left) {
308   dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
309 }
310 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
311 
aom_dc_predictor_8x16_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)312 void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
313                              const uint8_t *above, const uint8_t *left) {
314   dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
315 }
316 
aom_dc_predictor_16x8_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)317 void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
318                              const uint8_t *above, const uint8_t *left) {
319   dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
320 }
321 
322 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_8x32_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)323 void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
324                              const uint8_t *above, const uint8_t *left) {
325   dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
326 }
327 
aom_dc_predictor_32x8_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)328 void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
329                              const uint8_t *above, const uint8_t *left) {
330   dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
331 }
332 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
333 
aom_dc_predictor_16x32_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)334 void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
335                               const uint8_t *above, const uint8_t *left) {
336   dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
337 }
338 
aom_dc_predictor_32x16_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)339 void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
340                               const uint8_t *above, const uint8_t *left) {
341   dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
342 }
343 
344 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_16x64_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)345 void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
346                               const uint8_t *above, const uint8_t *left) {
347   dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
348 }
349 
aom_dc_predictor_64x16_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)350 void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
351                               const uint8_t *above, const uint8_t *left) {
352   dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
353 }
354 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
355 
aom_dc_predictor_32x64_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)356 void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
357                               const uint8_t *above, const uint8_t *left) {
358   dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
359 }
360 
aom_dc_predictor_64x32_c(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)361 void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
362                               const uint8_t *above, const uint8_t *left) {
363   dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
364 }
365 
366 #undef DC_MULTIPLIER_1X2
367 #undef DC_MULTIPLIER_1X4
368 
369 #if CONFIG_AV1_HIGHBITDEPTH
370 
// High-bitdepth vertical prediction: each of the 'bh' output rows is a copy
// of the first 'bw' 16-bit samples of 'above'. 'left' and 'bd' are not used.
static inline void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)left;
  (void)bd;

  const size_t row_bytes = bw * sizeof(uint16_t);
  for (int row = 0; row < bh; ++row, dst += stride) {
    memcpy(dst, above, row_bytes);
  }
}
382 
// High-bitdepth horizontal prediction: output row r is filled with 'bw'
// copies of left[r] via aom_memset16(). 'above' and 'bd' are not used.
static inline void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)above;
  (void)bd;

  for (int row = 0; row < bh; ++row, dst += stride) {
    aom_memset16(dst, left[row], bw);
  }
}
394 
// High-bitdepth Paeth prediction for a bw x bh block. Every output sample is
// chosen by paeth_predictor_single() from its left neighbour (left[row]), its
// top neighbour (above[col]) and the shared top-left corner above[-1].
// 'bd' is not used.
static inline void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bw, int bh, const uint16_t *above,
                                          const uint16_t *left, int bd) {
  (void)bd;
  const uint16_t corner = above[-1];

  for (int row = 0; row < bh; ++row, dst += stride) {
    for (int col = 0; col < bw; ++col) {
      dst[col] = paeth_predictor_single(left[row], above[col], corner);
    }
  }
}
408 
highbd_smooth_predictor(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,const uint16_t * left,int bd)409 static inline void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
410                                            int bw, int bh,
411                                            const uint16_t *above,
412                                            const uint16_t *left, int bd) {
413   (void)bd;
414   const uint16_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
415   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
416   const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
417   const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
418   // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
419   const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
420   const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
421   sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
422                            log2_scale + sizeof(*dst));
423   int r;
424   for (r = 0; r < bh; ++r) {
425     int c;
426     for (c = 0; c < bw; ++c) {
427       const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
428       const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
429                                   sm_weights_w[c], scale - sm_weights_w[c] };
430       uint32_t this_pred = 0;
431       int i;
432       assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
433       for (i = 0; i < 4; ++i) {
434         this_pred += weights[i] * pixels[i];
435       }
436       dst[c] = divide_round(this_pred, log2_scale);
437     }
438     dst += stride;
439   }
440 }
441 
highbd_smooth_v_predictor(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,const uint16_t * left,int bd)442 static inline void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
443                                              int bw, int bh,
444                                              const uint16_t *above,
445                                              const uint16_t *left, int bd) {
446   (void)bd;
447   const uint16_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
448   const uint8_t *const sm_weights = smooth_weights + bh - 4;
449   // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
450   const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
451   const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
452   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
453                            log2_scale + sizeof(*dst));
454 
455   int r;
456   for (r = 0; r < bh; r++) {
457     int c;
458     for (c = 0; c < bw; ++c) {
459       const uint16_t pixels[] = { above[c], below_pred };
460       const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
461       uint32_t this_pred = 0;
462       assert(scale >= sm_weights[r]);
463       int i;
464       for (i = 0; i < 2; ++i) {
465         this_pred += weights[i] * pixels[i];
466       }
467       dst[c] = divide_round(this_pred, log2_scale);
468     }
469     dst += stride;
470   }
471 }
472 
highbd_smooth_h_predictor(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,const uint16_t * left,int bd)473 static inline void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
474                                              int bw, int bh,
475                                              const uint16_t *above,
476                                              const uint16_t *left, int bd) {
477   (void)bd;
478   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
479   const uint8_t *const sm_weights = smooth_weights + bw - 4;
480   // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
481   const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
482   const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
483   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
484                            log2_scale + sizeof(*dst));
485 
486   int r;
487   for (r = 0; r < bh; r++) {
488     int c;
489     for (c = 0; c < bw; ++c) {
490       const uint16_t pixels[] = { left[r], right_pred };
491       const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
492       uint32_t this_pred = 0;
493       assert(scale >= sm_weights[c]);
494       int i;
495       for (i = 0; i < 2; ++i) {
496         this_pred += weights[i] * pixels[i];
497       }
498       dst[c] = divide_round(this_pred, log2_scale);
499     }
500     dst += stride;
501   }
502 }
503 
// Fills the bw x bh block with the constant 128 << (bd - 8), i.e. the 8-bit
// value 128 scaled up to bit depth 'bd'. 'above' and 'left' are not used.
static inline void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bw, int bh,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)above;
  (void)left;

  uint16_t *row = dst;
  for (int rows_left = bh; rows_left > 0; --rows_left, row += stride) {
    aom_memset16(row, 128 << (bd - 8), bw);
  }
}
517 
// Fills the bw x bh block with the rounded mean of the 'bh' samples in
// 'left'. 'above' and 'bd' are not used.
static inline void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
                                            int bw, int bh,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  (void)above;
  (void)bd;

  int total = 0;
  for (int i = 0; i < bh; ++i) {
    total += left[i];
  }
  // Adding half of the divisor rounds the mean to nearest.
  const int dc = (total + (bh >> 1)) / bh;

  for (int row = 0; row < bh; ++row, dst += stride) {
    aom_memset16(dst, dc, bw);
  }
}
534 
// Fills the bw x bh block with the rounded mean of the 'bw' samples in
// 'above'. 'left' and 'bd' are not used.
static inline void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bw, int bh,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)left;
  (void)bd;

  int total = 0;
  for (int i = 0; i < bw; ++i) {
    total += above[i];
  }
  // Adding half of the divisor rounds the mean to nearest.
  const int dc = (total + (bw >> 1)) / bw;

  for (int row = 0; row < bh; ++row, dst += stride) {
    aom_memset16(dst, dc, bw);
  }
}
551 
// Fills the bw x bh block with the rounded mean of the 'bw' samples in
// 'above' together with the 'bh' samples in 'left'. 'bd' is not used.
static inline void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)bd;
  const int count = bw + bh;
  int total = 0;

  for (int i = 0; i < bw; ++i) total += above[i];
  for (int i = 0; i < bh; ++i) total += left[i];

  // Adding half of the divisor rounds the mean to nearest.
  const int dc = (total + (count >> 1)) / count;

  for (int row = 0; row < bh; ++row, dst += stride) {
    aom_memset16(dst, dc, bw);
  }
}
573 
574 // Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
575 // assume 2nd shift of 17 bits instead of 16.
576 // Note: Strictly speaking, 2nd shift needs to be 17 only when:
577 // - bit depth == 12, and
578 // - bw + bh is divisible by 5 (as opposed to divisible by 3).
579 // All other cases can use half the multipliers with a shift of 16 instead.
580 // This special optimization can be used when writing assembly code.
// Reciprocal of 3 scaled by 2^17 and rounded up: ceil(131072 / 3).
#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
// Reciprocal of 5 scaled by 2^17 and rounded up: ceil(131072 / 5).
// Note: This constant is odd, but a smaller even constant (0x199a) with the
// appropriate shift should work for neon in 8/10-bit.
#define HIGHBD_DC_MULTIPLIER_1X4 0x6667

// Number of bits removed by the second shift, after the multiplication.
#define HIGHBD_DC_SHIFT2 17
587 
highbd_dc_predictor_rect(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,const uint16_t * left,int bd,int shift1,uint32_t multiplier)588 static inline void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
589                                             int bw, int bh,
590                                             const uint16_t *above,
591                                             const uint16_t *left, int bd,
592                                             int shift1, uint32_t multiplier) {
593   int sum = 0;
594   (void)bd;
595 
596   for (int i = 0; i < bw; i++) {
597     sum += above[i];
598   }
599   for (int i = 0; i < bh; i++) {
600     sum += left[i];
601   }
602 
603   const int expected_dc = divide_using_multiply_shift(
604       sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
605   assert(expected_dc < (1 << bd));
606 
607   for (int r = 0; r < bh; r++) {
608     aom_memset16(dst, expected_dc, bw);
609     dst += stride;
610   }
611 }
612 
613 #undef HIGHBD_DC_SHIFT2
614 
aom_highbd_dc_predictor_4x8_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)615 void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
616                                    const uint16_t *above, const uint16_t *left,
617                                    int bd) {
618   highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
619                            HIGHBD_DC_MULTIPLIER_1X2);
620 }
621 
aom_highbd_dc_predictor_8x4_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)622 void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
623                                    const uint16_t *above, const uint16_t *left,
624                                    int bd) {
625   highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
626                            HIGHBD_DC_MULTIPLIER_1X2);
627 }
628 
629 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_highbd_dc_predictor_4x16_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)630 void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
631                                     const uint16_t *above, const uint16_t *left,
632                                     int bd) {
633   highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
634                            HIGHBD_DC_MULTIPLIER_1X4);
635 }
636 
aom_highbd_dc_predictor_16x4_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)637 void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
638                                     const uint16_t *above, const uint16_t *left,
639                                     int bd) {
640   highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
641                            HIGHBD_DC_MULTIPLIER_1X4);
642 }
643 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
644 
aom_highbd_dc_predictor_8x16_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)645 void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
646                                     const uint16_t *above, const uint16_t *left,
647                                     int bd) {
648   highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
649                            HIGHBD_DC_MULTIPLIER_1X2);
650 }
651 
aom_highbd_dc_predictor_16x8_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)652 void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
653                                     const uint16_t *above, const uint16_t *left,
654                                     int bd) {
655   highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
656                            HIGHBD_DC_MULTIPLIER_1X2);
657 }
658 
659 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_highbd_dc_predictor_8x32_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)660 void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
661                                     const uint16_t *above, const uint16_t *left,
662                                     int bd) {
663   highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
664                            HIGHBD_DC_MULTIPLIER_1X4);
665 }
666 
aom_highbd_dc_predictor_32x8_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)667 void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
668                                     const uint16_t *above, const uint16_t *left,
669                                     int bd) {
670   highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
671                            HIGHBD_DC_MULTIPLIER_1X4);
672 }
673 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
674 
aom_highbd_dc_predictor_16x32_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)675 void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
676                                      const uint16_t *above,
677                                      const uint16_t *left, int bd) {
678   highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
679                            HIGHBD_DC_MULTIPLIER_1X2);
680 }
681 
aom_highbd_dc_predictor_32x16_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)682 void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
683                                      const uint16_t *above,
684                                      const uint16_t *left, int bd) {
685   highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
686                            HIGHBD_DC_MULTIPLIER_1X2);
687 }
688 
689 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_highbd_dc_predictor_16x64_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)690 void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
691                                      const uint16_t *above,
692                                      const uint16_t *left, int bd) {
693   highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
694                            HIGHBD_DC_MULTIPLIER_1X4);
695 }
696 
aom_highbd_dc_predictor_64x16_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)697 void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
698                                      const uint16_t *above,
699                                      const uint16_t *left, int bd) {
700   highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
701                            HIGHBD_DC_MULTIPLIER_1X4);
702 }
703 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
704 
aom_highbd_dc_predictor_32x64_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)705 void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
706                                      const uint16_t *above,
707                                      const uint16_t *left, int bd) {
708   highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
709                            HIGHBD_DC_MULTIPLIER_1X2);
710 }
711 
aom_highbd_dc_predictor_64x32_c(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)712 void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
713                                      const uint16_t *above,
714                                      const uint16_t *left, int bd) {
715   highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
716                            HIGHBD_DC_MULTIPLIER_1X2);
717 }
718 
719 #undef HIGHBD_DC_MULTIPLIER_1X2
720 #undef HIGHBD_DC_MULTIPLIER_1X4
721 #endif  // CONFIG_AV1_HIGHBITDEPTH
722 
// Generates the exported low-bitdepth entry point
// aom_<mode>_predictor_<bw>x<bh>_c(): a wrapper that calls
// <mode>_predictor() with the block width and height filled in. Because every
// generated function shares the (dst, stride, above, left) signature, the
// predictors can all be reached through one array of function pointers. A
// given predictor does not necessarily read both the above and left edges.
#define intra_pred_sized(mode, bw, bh)                                 \
  void aom_##mode##_predictor_##bw##x##bh##_c(                         \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *above,            \
      const uint8_t *left) {                                           \
    mode##_predictor(dst, stride, bw, bh, above, left);                \
  }
732 
// High-bitdepth twin of the fixed-size wrapper generator. With
// CONFIG_AV1_HIGHBITDEPTH enabled it emits
// aom_highbd_<mode>_predictor_<bw>x<bh>_c(), which operates on uint16_t
// samples, takes an extra bit-depth argument `bd`, and calls
// highbd_<mode>_predictor() with the block width and height filled in.
// With high bitdepth disabled the macro expands to nothing, so size lists
// that mention it need no #if of their own.
#if CONFIG_AV1_HIGHBITDEPTH
#define intra_pred_highbd_sized(mode, bw, bh)                              \
  void aom_highbd_##mode##_predictor_##bw##x##bh##_c(                      \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
      const uint16_t *left, int bd) {                                      \
    highbd_##mode##_predictor(dst, stride, bw, bh, above, left, bd);       \
  }
#else  // !CONFIG_AV1_HIGHBITDEPTH
#define intra_pred_highbd_sized(mode, bw, bh)
#endif  // CONFIG_AV1_HIGHBITDEPTH
743 
744 /* clang-format off */
// intra_pred_rectangular(type) emits the wrappers for every non-square block
// size of predictor `type`: all low-bitdepth sizes first, then all
// high-bitdepth sizes. The size lists are split by aspect ratio so that each
// list is written only once.

// Low-bitdepth sizes whose sides are in a 1:2 ratio; generated in every
// configuration.
#define intra_pred_rect_1to2(type) \
  intra_pred_sized(type, 4, 8) \
  intra_pred_sized(type, 8, 4) \
  intra_pred_sized(type, 8, 16) \
  intra_pred_sized(type, 16, 8) \
  intra_pred_sized(type, 16, 32) \
  intra_pred_sized(type, 32, 16) \
  intra_pred_sized(type, 32, 64) \
  intra_pred_sized(type, 64, 32)

// High-bitdepth sizes whose sides are in a 1:2 ratio; generated in every
// configuration.
#define intra_pred_highbd_rect_1to2(type) \
  intra_pred_highbd_sized(type, 4, 8) \
  intra_pred_highbd_sized(type, 8, 4) \
  intra_pred_highbd_sized(type, 8, 16) \
  intra_pred_highbd_sized(type, 16, 8) \
  intra_pred_highbd_sized(type, 16, 32) \
  intra_pred_highbd_sized(type, 32, 16) \
  intra_pred_highbd_sized(type, 32, 64) \
  intra_pred_highbd_sized(type, 64, 32)

#if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
// Realtime-only build without the decoder: only the 1:2 sizes are generated.
#define intra_pred_rectangular(type) \
  intra_pred_rect_1to2(type) \
  intra_pred_highbd_rect_1to2(type)
#else
// Low-bitdepth sizes whose sides are in a 1:4 ratio.
#define intra_pred_rect_1to4(type) \
  intra_pred_sized(type, 4, 16) \
  intra_pred_sized(type, 16, 4) \
  intra_pred_sized(type, 8, 32) \
  intra_pred_sized(type, 32, 8) \
  intra_pred_sized(type, 16, 64) \
  intra_pred_sized(type, 64, 16)

// High-bitdepth sizes whose sides are in a 1:4 ratio.
#define intra_pred_highbd_rect_1to4(type) \
  intra_pred_highbd_sized(type, 4, 16) \
  intra_pred_highbd_sized(type, 16, 4) \
  intra_pred_highbd_sized(type, 8, 32) \
  intra_pred_highbd_sized(type, 32, 8) \
  intra_pred_highbd_sized(type, 16, 64) \
  intra_pred_highbd_sized(type, 64, 16)

#define intra_pred_rectangular(type) \
  intra_pred_rect_1to2(type) \
  intra_pred_rect_1to4(type) \
  intra_pred_highbd_rect_1to2(type) \
  intra_pred_highbd_rect_1to4(type)
#endif // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
794 
// Square sizes only, 4x4 through 64x64: the low-bitdepth wrappers first,
// then the high-bitdepth ones.
#define intra_pred_square(type) \
  intra_pred_sized(type, 4, 4) \
  intra_pred_sized(type, 8, 8) \
  intra_pred_sized(type, 16, 16) \
  intra_pred_sized(type, 32, 32) \
  intra_pred_sized(type, 64, 64) \
  intra_pred_highbd_sized(type, 4, 4) \
  intra_pred_highbd_sized(type, 8, 8) \
  intra_pred_highbd_sized(type, 16, 16) \
  intra_pred_highbd_sized(type, 32, 32) \
  intra_pred_highbd_sized(type, 64, 64)

// Every size except the low-bitdepth 4x4 wrapper. The high-bitdepth 4x4
// wrapper is still part of this list.
#define intra_pred_above_4x4(type) \
  intra_pred_sized(type, 8, 8) \
  intra_pred_sized(type, 16, 16) \
  intra_pred_sized(type, 32, 32) \
  intra_pred_sized(type, 64, 64) \
  intra_pred_highbd_sized(type, 4, 4) \
  intra_pred_highbd_sized(type, 8, 8) \
  intra_pred_highbd_sized(type, 16, 16) \
  intra_pred_highbd_sized(type, 32, 32) \
  intra_pred_highbd_sized(type, 64, 64) \
  intra_pred_rectangular(type)

// Every square and rectangular size. The square list followed by the
// rectangular list yields the same sequence of wrappers as the low-bitdepth
// 4x4 wrapper followed by intra_pred_above_4x4(type).
#define intra_pred_allsizes(type) \
  intra_pred_square(type) \
  intra_pred_rectangular(type)
820 
821 intra_pred_allsizes(v)
822 intra_pred_allsizes(h)
823 intra_pred_allsizes(smooth)
824 intra_pred_allsizes(smooth_v)
825 intra_pred_allsizes(smooth_h)
826 intra_pred_allsizes(paeth)
827 intra_pred_allsizes(dc_128)
828 intra_pred_allsizes(dc_left)
829 intra_pred_allsizes(dc_top)
830 intra_pred_square(dc)
831 /* clang-format on */
832 #undef intra_pred_allsizes
833