/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/intrapred_common.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/bitops.h"

static inline void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left) {
  int r;
  (void)left;

  for (r = 0; r < bh; r++) {
    memcpy(dst, above, bw);
    dst += stride;
  }
}

static inline void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left) {
  int r;
  (void)above;

  for (r = 0; r < bh; r++) {
    memset(dst, left[r], bw);
    dst += stride;
  }
}

static inline int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }

static inline uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
                                              uint16_t top_left) {
  const int base = top + left - top_left;
  const int p_left = abs_diff(base, left);
  const int p_top = abs_diff(base, top);
  const int p_top_left = abs_diff(base, top_left);

  // Return nearest to base of left, top and top_left.
  return (p_left <= p_top && p_left <= p_top_left) ? left
         : (p_top <= p_top_left)                   ? top
                                                   : top_left;
}

static inline void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                   int bh, const uint8_t *above,
                                   const uint8_t *left) {
  int r, c;
  const uint8_t ytop_left = above[-1];

  for (r = 0; r < bh; r++) {
    for (c = 0; c < bw; c++)
      dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
    dst += stride;
  }
}

// Some basic checks on weights for smooth predictor.
#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
                                 pred_scale)                          \
  assert(weights_w[0] < weights_scale);                               \
  assert(weights_h[0] < weights_scale);                               \
  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.

#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))

static inline void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                    int bh, const uint8_t *above,
                                    const uint8_t *left) {
  const uint8_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
  const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
  const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
  const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
  // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
  const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
                           log2_scale + sizeof(*dst));
  int r;
  for (r = 0; r < bh; ++r) {
    int c;
    for (c = 0; c < bw; ++c) {
      const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
      const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
                                  sm_weights_w[c], scale - sm_weights_w[c] };
      uint32_t this_pred = 0;
      int i;
      assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
      for (i = 0; i < 4; ++i) {
        this_pred += weights[i] * pixels[i];
      }
      dst[c] = divide_round(this_pred, log2_scale);
    }
    dst += stride;
  }
}

static inline void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint8_t *above,
                                      const uint8_t *left) {
  const uint8_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
  const uint8_t *const sm_weights = smooth_weights + bh - 4;
  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
                           log2_scale + sizeof(*dst));

  int r;
  for (r = 0; r < bh; r++) {
    int c;
    for (c = 0; c < bw; ++c) {
      const uint8_t pixels[] = { above[c], below_pred };
      const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
      uint32_t this_pred = 0;
      assert(scale >= sm_weights[r]);
      int i;
      for (i = 0; i < 2; ++i) {
        this_pred += weights[i] * pixels[i];
      }
      dst[c] = divide_round(this_pred, log2_scale);
    }
    dst += stride;
  }
}

static inline void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint8_t *above,
                                      const uint8_t *left) {
  const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
  const uint8_t *const sm_weights = smooth_weights + bw - 4;
  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
                           log2_scale + sizeof(*dst));

  int r;
  for (r = 0; r < bh; r++) {
    int c;
    for (c = 0; c < bw; ++c) {
      const uint8_t pixels[] = { left[r], right_pred };
      const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
      uint32_t this_pred = 0;
      assert(scale >= sm_weights[c]);
      int i;
      for (i = 0; i < 2; ++i) {
        this_pred += weights[i] * pixels[i];
      }
      dst[c] = divide_round(this_pred, log2_scale);
    }
    dst += stride;
  }
}

static inline void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                    int bh, const uint8_t *above,
                                    const uint8_t *left) {
  int r;
  (void)above;
  (void)left;

  for (r = 0; r < bh; r++) {
    memset(dst, 128, bw);
    dst += stride;
  }
}

static inline void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                     int bh, const uint8_t *above,
                                     const uint8_t *left) {
  int i, r, expected_dc, sum = 0;
  (void)above;

  for (i = 0; i < bh; i++) sum += left[i];
  expected_dc = (sum + (bh >> 1)) / bh;

  for (r = 0; r < bh; r++) {
    memset(dst, expected_dc, bw);
    dst += stride;
  }
}

static inline void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                    int bh, const uint8_t *above,
                                    const uint8_t *left) {
  int i, r, expected_dc, sum = 0;
  (void)left;

  for (i = 0; i < bw; i++) sum += above[i];
  expected_dc = (sum + (bw >> 1)) / bw;

  for (r = 0; r < bh; r++) {
    memset(dst, expected_dc, bw);
    dst += stride;
  }
}

static inline void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                const uint8_t *above, const uint8_t *left) {
  int i, r, expected_dc, sum = 0;
  const int count = bw + bh;

  for (i = 0; i < bw; i++) {
    sum += above[i];
  }
  for (i = 0; i < bh; i++) {
    sum += left[i];
  }

  expected_dc = (sum + (count >> 1)) / count;

  for (r = 0; r < bh; r++) {
    memset(dst, expected_dc, bw);
    dst += stride;
  }
}

static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier, int shift2) {
  const int interm = num >> shift1;
  return interm * multiplier >> shift2;
}

// The constants (multiplier and shifts) for a given block size are obtained
// as follows:
// - Let sum_w_h = block width + block height.
// - Shift 'sum_w_h' right until we reach an odd number. Let the number of
//   shifts for that block size be called 'shift1' (see the parameter in the
//   dc_predictor_rect() function), and let the odd number be 'd'. [d has only
//   2 possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
//   block.]
// - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
//   using "Algorithm 1" in:
//   http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
//   by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
//   shift will be 16, regardless of the block size.

// Note: For low bitdepth, assembly code may be optimized by using smaller
// constants for smaller block sizes, where the range of the 'sum' is
// restricted to fewer bits.

#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16
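
// Worked example (an illustrative sketch, not part of the library; the helper
// name below is hypothetical and the block is guarded with #if 0 so it never
// affects the build): for a 4x8 block, bw + bh = 12 = 3 << 2, so shift1 = 2,
// d = 3 and the 1:2 multiplier applies. Over the full 8-bit sum range the
// multiply-shift reproduces the rounded division by 12 that dc_predictor()
// would perform.
#if 0
static void dc_multiplier_1x2_example(void) {
  // Every sum an 8-bit 4x8 DC predictor can produce, plus the rounding term.
  for (int sum = 0; sum <= 12 * 255; ++sum) {
    const int ref = (sum + 6) / 12;  // plain rounded division by bw + bh
    const int fast =
        divide_using_multiply_shift(sum + 6, 2, DC_MULTIPLIER_1X2, DC_SHIFT2);
    assert(ref == fast);
  }
}
#endif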

static inline void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
                                     int bh, const uint8_t *above,
                                     const uint8_t *left, int shift1,
                                     int multiplier) {
  int sum = 0;

  for (int i = 0; i < bw; i++) {
    sum += above[i];
  }
  for (int i = 0; i < bh; i++) {
    sum += left[i];
  }

  const int expected_dc = divide_using_multiply_shift(
      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
  assert(expected_dc < (1 << 8));

  for (int r = 0; r < bh; r++) {
    memset(dst, expected_dc, bw);
    dst += stride;
  }
}

#undef DC_SHIFT2

void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
}

void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
}

void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
}

void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
}

void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
}

void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
}

void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
}

void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
}

#undef DC_MULTIPLIER_1X2
#undef DC_MULTIPLIER_1X4

#if CONFIG_AV1_HIGHBITDEPTH

static inline void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int bd) {
  int r;
  (void)left;
  (void)bd;
  for (r = 0; r < bh; r++) {
    memcpy(dst, above, bw * sizeof(uint16_t));
    dst += stride;
  }
}

static inline void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int bd) {
  int r;
  (void)above;
  (void)bd;
  for (r = 0; r < bh; r++) {
    aom_memset16(dst, left[r], bw);
    dst += stride;
  }
}

static inline void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bw, int bh, const uint16_t *above,
                                          const uint16_t *left, int bd) {
  int r, c;
  const uint16_t ytop_left = above[-1];
  (void)bd;

  for (r = 0; r < bh; r++) {
    for (c = 0; c < bw; c++)
      dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
    dst += stride;
  }
}

static inline void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bw, int bh,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  (void)bd;
  const uint16_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
  const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
  const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
  const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
  // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
  const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
                           log2_scale + sizeof(*dst));
  int r;
  for (r = 0; r < bh; ++r) {
    int c;
    for (c = 0; c < bw; ++c) {
      const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
      const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
                                  sm_weights_w[c], scale - sm_weights_w[c] };
      uint32_t this_pred = 0;
      int i;
      assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
      for (i = 0; i < 4; ++i) {
        this_pred += weights[i] * pixels[i];
      }
      dst[c] = divide_round(this_pred, log2_scale);
    }
    dst += stride;
  }
}

static inline void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
                                             int bw, int bh,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  (void)bd;
  const uint16_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
  const uint8_t *const sm_weights = smooth_weights + bh - 4;
  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
                           log2_scale + sizeof(*dst));

  int r;
  for (r = 0; r < bh; r++) {
    int c;
    for (c = 0; c < bw; ++c) {
      const uint16_t pixels[] = { above[c], below_pred };
      const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
      uint32_t this_pred = 0;
      assert(scale >= sm_weights[r]);
      int i;
      for (i = 0; i < 2; ++i) {
        this_pred += weights[i] * pixels[i];
      }
      dst[c] = divide_round(this_pred, log2_scale);
    }
    dst += stride;
  }
}

static inline void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
                                             int bw, int bh,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  (void)bd;
  const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
  const uint8_t *const sm_weights = smooth_weights + bw - 4;
  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
                           log2_scale + sizeof(*dst));

  int r;
  for (r = 0; r < bh; r++) {
    int c;
    for (c = 0; c < bw; ++c) {
      const uint16_t pixels[] = { left[r], right_pred };
      const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
      uint32_t this_pred = 0;
      assert(scale >= sm_weights[c]);
      int i;
      for (i = 0; i < 2; ++i) {
        this_pred += weights[i] * pixels[i];
      }
      dst[c] = divide_round(this_pred, log2_scale);
    }
    dst += stride;
  }
}

static inline void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bw, int bh,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  int r;
  (void)above;
  (void)left;

  for (r = 0; r < bh; r++) {
    aom_memset16(dst, 128 << (bd - 8), bw);
    dst += stride;
  }
}

static inline void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
                                            int bw, int bh,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  int i, r, expected_dc, sum = 0;
  (void)above;
  (void)bd;

  for (i = 0; i < bh; i++) sum += left[i];
  expected_dc = (sum + (bh >> 1)) / bh;

  for (r = 0; r < bh; r++) {
    aom_memset16(dst, expected_dc, bw);
    dst += stride;
  }
}

static inline void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bw, int bh,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  int i, r, expected_dc, sum = 0;
  (void)left;
  (void)bd;

  for (i = 0; i < bw; i++) sum += above[i];
  expected_dc = (sum + (bw >> 1)) / bw;

  for (r = 0; r < bh; r++) {
    aom_memset16(dst, expected_dc, bw);
    dst += stride;
  }
}

static inline void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int i, r, expected_dc, sum = 0;
  const int count = bw + bh;
  (void)bd;

  for (i = 0; i < bw; i++) {
    sum += above[i];
  }
  for (i = 0; i < bh; i++) {
    sum += left[i];
  }

  expected_dc = (sum + (count >> 1)) / count;

  for (r = 0; r < bh; r++) {
    aom_memset16(dst, expected_dc, bw);
    dst += stride;
  }
}

// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
// assume a 2nd shift of 17 bits instead of 16.
// Note: Strictly speaking, the 2nd shift needs to be 17 only when:
// - bit depth == 12, and
// - bw + bh is divisible by 5 (as opposed to divisible by 3).
// All other cases can use half the multipliers with a shift of 16 instead.
// This special optimization can be used when writing assembly code.
#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
// Note: This constant is odd, but a smaller even constant (0x199a) with the
// appropriate shift should work for neon in 8/10-bit.
#define HIGHBD_DC_MULTIPLIER_1X4 0x6667

#define HIGHBD_DC_SHIFT2 17
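
// Worked example (an illustrative sketch, not part of the library; the helper
// name below is hypothetical and the block is guarded with #if 0 so it never
// affects the build): a 12-bit 4x16 block has bw + bh = 20 = 5 << 2, so
// shift1 = 2, d = 5 and the 1:4 multiplier applies. With the wider 17-bit
// second shift, the multiply-shift matches the rounded division by 20 over
// the full 12-bit sum range.
#if 0
static void highbd_dc_multiplier_1x4_example(void) {
  for (int sum = 0; sum <= 20 * 4095; ++sum) {
    const int ref = (sum + 10) / 20;  // plain rounded division by bw + bh
    const int fast = divide_using_multiply_shift(
        sum + 10, 2, HIGHBD_DC_MULTIPLIER_1X4, HIGHBD_DC_SHIFT2);
    assert(ref == fast);
  }
}
#endif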

static inline void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
                                            int bw, int bh,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd,
                                            int shift1, uint32_t multiplier) {
  int sum = 0;
  (void)bd;

  for (int i = 0; i < bw; i++) {
    sum += above[i];
  }
  for (int i = 0; i < bh; i++) {
    sum += left[i];
  }

  const int expected_dc = divide_using_multiply_shift(
      sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
  assert(expected_dc < (1 << bd));

  for (int r = 0; r < bh; r++) {
    aom_memset16(dst, expected_dc, bw);
    dst += stride;
  }
}

#undef HIGHBD_DC_SHIFT2

void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
                                   const uint16_t *above, const uint16_t *left,
                                   int bd) {
  highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
                                   const uint16_t *above, const uint16_t *left,
                                   int bd) {
  highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
                           HIGHBD_DC_MULTIPLIER_1X4);
}

void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
                           HIGHBD_DC_MULTIPLIER_1X4);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
                           HIGHBD_DC_MULTIPLIER_1X4);
}

void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
                           HIGHBD_DC_MULTIPLIER_1X4);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
                           HIGHBD_DC_MULTIPLIER_1X4);
}

void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
                           HIGHBD_DC_MULTIPLIER_1X4);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

#undef HIGHBD_DC_MULTIPLIER_1X2
#undef HIGHBD_DC_MULTIPLIER_1X4
#endif  // CONFIG_AV1_HIGHBITDEPTH

// This serves as a wrapper function, so that all the prediction functions
// can be unified and accessed as a pointer array. Note that the boundary
// above and left are not necessarily used all the time.
#define intra_pred_sized(type, width, height)                  \
  void aom_##type##_predictor_##width##x##height##_c(          \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *above,    \
      const uint8_t *left) {                                   \
    type##_predictor(dst, stride, width, height, above, left); \
  }
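
// For illustration (a sketch of what the macro generates; this expansion is
// produced below by the intra_pred_allsizes(v) invocation, not hand-written):
// intra_pred_sized(v, 4, 4) expands to
//
//   void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
//                              const uint8_t *above, const uint8_t *left) {
//     v_predictor(dst, stride, 4, 4, above, left);
//   }
//
// i.e. a thin wrapper that bakes the block dimensions into the unified
// predictor signature used by the function-pointer tables.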

#if CONFIG_AV1_HIGHBITDEPTH
#define intra_pred_highbd_sized(type, width, height)                        \
  void aom_highbd_##type##_predictor_##width##x##height##_c(                \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
      const uint16_t *left, int bd) {                                       \
    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  }
#else  // !CONFIG_AV1_HIGHBITDEPTH
#define intra_pred_highbd_sized(type, width, height)
#endif  // CONFIG_AV1_HIGHBITDEPTH

/* clang-format off */
#if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
#define intra_pred_rectangular(type) \
  intra_pred_sized(type, 4, 8) \
  intra_pred_sized(type, 8, 4) \
  intra_pred_sized(type, 8, 16) \
  intra_pred_sized(type, 16, 8) \
  intra_pred_sized(type, 16, 32) \
  intra_pred_sized(type, 32, 16) \
  intra_pred_sized(type, 32, 64) \
  intra_pred_sized(type, 64, 32) \
  intra_pred_highbd_sized(type, 4, 8) \
  intra_pred_highbd_sized(type, 8, 4) \
  intra_pred_highbd_sized(type, 8, 16) \
  intra_pred_highbd_sized(type, 16, 8) \
  intra_pred_highbd_sized(type, 16, 32) \
  intra_pred_highbd_sized(type, 32, 16) \
  intra_pred_highbd_sized(type, 32, 64) \
  intra_pred_highbd_sized(type, 64, 32)
#else
#define intra_pred_rectangular(type) \
  intra_pred_sized(type, 4, 8) \
  intra_pred_sized(type, 8, 4) \
  intra_pred_sized(type, 8, 16) \
  intra_pred_sized(type, 16, 8) \
  intra_pred_sized(type, 16, 32) \
  intra_pred_sized(type, 32, 16) \
  intra_pred_sized(type, 32, 64) \
  intra_pred_sized(type, 64, 32) \
  intra_pred_sized(type, 4, 16) \
  intra_pred_sized(type, 16, 4) \
  intra_pred_sized(type, 8, 32) \
  intra_pred_sized(type, 32, 8) \
  intra_pred_sized(type, 16, 64) \
  intra_pred_sized(type, 64, 16) \
  intra_pred_highbd_sized(type, 4, 8) \
  intra_pred_highbd_sized(type, 8, 4) \
  intra_pred_highbd_sized(type, 8, 16) \
  intra_pred_highbd_sized(type, 16, 8) \
  intra_pred_highbd_sized(type, 16, 32) \
  intra_pred_highbd_sized(type, 32, 16) \
  intra_pred_highbd_sized(type, 32, 64) \
  intra_pred_highbd_sized(type, 64, 32) \
  intra_pred_highbd_sized(type, 4, 16) \
  intra_pred_highbd_sized(type, 16, 4) \
  intra_pred_highbd_sized(type, 8, 32) \
  intra_pred_highbd_sized(type, 32, 8) \
  intra_pred_highbd_sized(type, 16, 64) \
  intra_pred_highbd_sized(type, 64, 16)
#endif  // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER

#define intra_pred_above_4x4(type) \
  intra_pred_sized(type, 8, 8) \
  intra_pred_sized(type, 16, 16) \
  intra_pred_sized(type, 32, 32) \
  intra_pred_sized(type, 64, 64) \
  intra_pred_highbd_sized(type, 4, 4) \
  intra_pred_highbd_sized(type, 8, 8) \
  intra_pred_highbd_sized(type, 16, 16) \
  intra_pred_highbd_sized(type, 32, 32) \
  intra_pred_highbd_sized(type, 64, 64) \
  intra_pred_rectangular(type)
#define intra_pred_allsizes(type) \
  intra_pred_sized(type, 4, 4) \
  intra_pred_above_4x4(type)
#define intra_pred_square(type) \
  intra_pred_sized(type, 4, 4) \
  intra_pred_sized(type, 8, 8) \
  intra_pred_sized(type, 16, 16) \
  intra_pred_sized(type, 32, 32) \
  intra_pred_sized(type, 64, 64) \
  intra_pred_highbd_sized(type, 4, 4) \
  intra_pred_highbd_sized(type, 8, 8) \
  intra_pred_highbd_sized(type, 16, 16) \
  intra_pred_highbd_sized(type, 32, 32) \
  intra_pred_highbd_sized(type, 64, 64)

intra_pred_allsizes(v)
intra_pred_allsizes(h)
intra_pred_allsizes(smooth)
intra_pred_allsizes(smooth_v)
intra_pred_allsizes(smooth_h)
intra_pred_allsizes(paeth)
intra_pred_allsizes(dc_128)
intra_pred_allsizes(dc_left)
intra_pred_allsizes(dc_top)
intra_pred_square(dc)
/* clang-format on */
#undef intra_pred_allsizes