// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/intrapred_directional.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_NEON

#include <arm_neon.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Blend two values based on weights that sum to 32.
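// For example, blending a = 96 and b = 32 with a_weight = 24 and b_weight = 8
// gives (96 * 24 + 32 * 8 + 16) >> 5 = 80, the rounded point 3/4 of the way
// from b to a.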
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                               const uint8x8_t a_weight,
                               const uint8x8_t b_weight) {
  const uint16x8_t a_product = vmull_u8(a, a_weight);
  const uint16x8_t sum = vmlal_u8(a_product, b, b_weight);

  return vrshrn_n_u16(sum, 5 /*log2(32)*/);
}

// For vertical operations the weights are one constant value.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                               const uint8_t weight) {
  return WeightedBlend(a, b, vdup_n_u8(32 - weight), vdup_n_u8(weight));
}

// Fill |left| and |right| with the appropriate values for a given |base_step|.
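// For example, with the upsampled |base_step| of {0, 2, 4, ...}, |left|
// gathers the even-indexed bytes of |source| and |right| the odd-indexed
// bytes, so each lane holds the pair of pixels bracketing one output position.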
inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
                         const uint8x8_t left_step, const uint8x8_t right_step,
                         uint8x8_t* left, uint8x8_t* right) {
  const uint8x16_t mixed = vld1q_u8(source);
  *left = VQTbl1U8(mixed, left_step);
  *right = VQTbl1U8(mixed, right_step);
}

// Handle signed step arguments by ignoring the sign. Negative values are
// considered out of range and overwritten later.
inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
                         const int8x8_t left_step, const int8x8_t right_step,
                         uint8x8_t* left, uint8x8_t* right) {
  LoadStepwise(source, vreinterpret_u8_s8(left_step),
               vreinterpret_u8_s8(right_step), left, right);
}

// Process 4 or 8 |width| by any |height|.
template <int width>
inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride, const int height,
                                 const uint8_t* LIBGAV1_RESTRICT const top,
                                 const int xstep, const bool upsampled) {
  assert(width == 4 || width == 8);

  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits = 6 - upsample_shift;

  const int max_base_x = (width + height - 1) << upsample_shift;
  const int8x8_t max_base = vdup_n_s8(max_base_x);
  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);

  const int8x8_t all = vcreate_s8(0x0706050403020100);
  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
  const int8x8_t base_step = upsampled ? even : all;
  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));

  int top_x = xstep;
  int y = 0;
  do {
    const int top_base_x = top_x >> scale_bits;

    if (top_base_x >= max_base_x) {
      for (int i = y; i < height; ++i) {
        memset(dst, top[max_base_x], width);
        dst += stride;
      }
      return;
    }

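    // |top_x| carries a 6-bit fractional position; the upper 5 bits of that
    // fraction become the blend weight, in the range [0, 31].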
    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;

    // Zone2 uses negative values for xstep. Use signed values to compare
    // |top_base_x| to |max_base_x|.
    const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);

    const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);

    // 4 wide subsamples the output. 8 wide subsamples the input.
    if (width == 4) {
      const uint8x8_t left_values = vld1_u8(top + top_base_x);
      const uint8x8_t right_values = RightShiftVector<8>(left_values);
      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);

      // If |upsampled| is true then extract every other value for output.
      const uint8x8_t value_stepped =
          vtbl1_u8(value, vreinterpret_u8_s8(base_step));
      const uint8x8_t masked_value =
          vbsl_u8(max_base_mask, value_stepped, top_max_base);

      StoreLo4(dst, masked_value);
    } else /* width == 8 */ {
      uint8x8_t left_values, right_values;
      // WeightedBlend() steps up to Q registers. Downsample the input to avoid
      // doing extra calculations.
      LoadStepwise(top + top_base_x, base_step, right_step, &left_values,
                   &right_values);

      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
      const uint8x8_t masked_value =
          vbsl_u8(max_base_mask, value, top_max_base);

      vst1_u8(dst, masked_value);
    }
    dst += stride;
    top_x += xstep;
  } while (++y < height);
}

// Process a multiple of 8 |width| by any |height|. Processes horizontally
// before vertically in the hopes of being a little more cache friendly.
inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride, const int width,
                                 const int height,
                                 const uint8_t* LIBGAV1_RESTRICT const top,
                                 const int xstep, const bool upsampled) {
  assert(width % 8 == 0);
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits = 6 - upsample_shift;

  const int max_base_x = (width + height - 1) << upsample_shift;
  const int8x8_t max_base = vdup_n_s8(max_base_x);
  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);

  const int8x8_t all = vcreate_s8(0x0706050403020100);
  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
  const int8x8_t base_step = upsampled ? even : all;
  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
  const int8x8_t block_step = vdup_n_s8(8 << upsample_shift);

  int top_x = xstep;
  int y = 0;
  do {
    const int top_base_x = top_x >> scale_bits;

    if (top_base_x >= max_base_x) {
      for (int i = y; i < height; ++i) {
        memset(dst, top[max_base_x], width);
        dst += stride;
      }
      return;
    }

    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;

    // Zone2 uses negative values for xstep. Use signed values to compare
    // |top_base_x| to |max_base_x|.
    int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);

    int x = 0;
    do {
      const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);

      // Extract the input values based on |upsampled| here to avoid doing twice
      // as many calculations.
      uint8x8_t left_values, right_values;
      LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values,
                   &right_values);

      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
      const uint8x8_t masked_value =
          vbsl_u8(max_base_mask, value, top_max_base);

      vst1_u8(dst + x, masked_value);

      base_v = vadd_s8(base_v, block_step);
      x += 8;
    } while (x < width);
    top_x += xstep;
    dst += stride;
  } while (++y < height);
}

void DirectionalIntraPredictorZone1_NEON(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row, const int width,
    const int height, const int xstep, const bool upsampled_top) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  auto* dst = static_cast<uint8_t*>(dest);

  assert(xstep > 0);

  const int upsample_shift = static_cast<int>(upsampled_top);

  const uint8x8_t all = vcreate_u8(0x0706050403020100);

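  // At exactly 45 degrees (xstep == 64), each row is the row above shifted by
  // one full pixel, so the prediction reduces to offset copies of |top|.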
  if (xstep == 64) {
    assert(!upsampled_top);
    const uint8_t* top_ptr = top + 1;
    int y = 0;
    do {
      memcpy(dst, top_ptr, width);
      memcpy(dst + stride, top_ptr + 1, width);
      memcpy(dst + 2 * stride, top_ptr + 2, width);
      memcpy(dst + 3 * stride, top_ptr + 3, width);
      dst += 4 * stride;
      top_ptr += 4;
      y += 4;
    } while (y < height);
  } else if (width == 4) {
    DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top);
  } else if (xstep > 51) {
    // 7.11.2.10. Intra edge upsample selection process
    // if ( d <= 0 || d >= 40 ) useUpsample = 0
    // For |upsampled_top| the delta is from vertical so
    // |prediction_angle - 90|. In |kDirectionalIntraPredictorDerivative[]|,
    // angles less than 51 meet this criterion. The |xstep| value for angle 51
    // happens to be 51 as well. Shallower angles have greater xstep values.
    assert(!upsampled_top);
    const int max_base_x = ((width + height) - 1);
    const uint8x8_t max_base = vdup_n_u8(max_base_x);
    const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
    const uint8x8_t block_step = vdup_n_u8(8);

    int top_x = xstep;
    int y = 0;
    do {
      const int top_base_x = top_x >> 6;
      const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
      uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all);
      int x = 0;
      // Only calculate a block of 8 when at least one of the output values is
      // within range. Otherwise it can read off the end of |top|.
      const int must_calculate_width =
          std::min(width, max_base_x - top_base_x + 7) & ~7;
      for (; x < must_calculate_width; x += 8) {
        const uint8x8_t max_base_mask = vclt_u8(base_v, max_base);

        // Since these |xstep| values can not be upsampled the load is
        // simplified.
        const uint8x8_t left_values = vld1_u8(top + top_base_x + x);
        const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1);
        const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
        const uint8x8_t masked_value =
            vbsl_u8(max_base_mask, value, top_max_base);

        vst1_u8(dst + x, masked_value);
        base_v = vadd_u8(base_v, block_step);
      }
      memset(dst + x, top[max_base_x], width - x);
      dst += stride;
      top_x += xstep;
    } while (++y < height);
  } else {
    DirectionalZone1_WxH(dst, stride, width, height, top, xstep, upsampled_top);
  }
}

// Process 4 or 8 |width| by 4 or 8 |height|.
template <int width>
inline void DirectionalZone3_WxH(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int base_left_y,
    const int ystep, const int upsample_shift) {
  assert(width == 4 || width == 8);
  assert(height == 4 || height == 8);
  const int scale_bits = 6 - upsample_shift;

  // Zone3 never runs out of left_column values.
  assert((width + height - 1) << upsample_shift >  // max_base_y
         ((ystep * width) >> scale_bits) +
             (/* base_step */ 1 << upsample_shift) *
                 (height - 1));  // left_base_y

  // Limited improvement for 8x8. ~20% faster for 64x64.
  const uint8x8_t all = vcreate_u8(0x0706050403020100);
  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
  const uint8x8_t base_step = upsample_shift ? even : all;
  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));

  uint8_t* dst = dest;
  uint8x8_t left_v[8], right_v[8], value_v[8];
  const uint8_t* const left = left_column;

  const int index_0 = base_left_y;
  LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step,
               &left_v[0], &right_v[0]);
  value_v[0] = WeightedBlend(left_v[0], right_v[0],
                             ((index_0 << upsample_shift) & 0x3F) >> 1);

  const int index_1 = base_left_y + ystep;
  LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step,
               &left_v[1], &right_v[1]);
  value_v[1] = WeightedBlend(left_v[1], right_v[1],
                             ((index_1 << upsample_shift) & 0x3F) >> 1);

  const int index_2 = base_left_y + ystep * 2;
  LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step,
               &left_v[2], &right_v[2]);
  value_v[2] = WeightedBlend(left_v[2], right_v[2],
                             ((index_2 << upsample_shift) & 0x3F) >> 1);

  const int index_3 = base_left_y + ystep * 3;
  LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step,
               &left_v[3], &right_v[3]);
  value_v[3] = WeightedBlend(left_v[3], right_v[3],
                             ((index_3 << upsample_shift) & 0x3F) >> 1);

  const int index_4 = base_left_y + ystep * 4;
  LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step,
               &left_v[4], &right_v[4]);
  value_v[4] = WeightedBlend(left_v[4], right_v[4],
                             ((index_4 << upsample_shift) & 0x3F) >> 1);

  const int index_5 = base_left_y + ystep * 5;
  LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step,
               &left_v[5], &right_v[5]);
  value_v[5] = WeightedBlend(left_v[5], right_v[5],
                             ((index_5 << upsample_shift) & 0x3F) >> 1);

  const int index_6 = base_left_y + ystep * 6;
  LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step,
               &left_v[6], &right_v[6]);
  value_v[6] = WeightedBlend(left_v[6], right_v[6],
                             ((index_6 << upsample_shift) & 0x3F) >> 1);

  const int index_7 = base_left_y + ystep * 7;
  LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step,
               &left_v[7], &right_v[7]);
  value_v[7] = WeightedBlend(left_v[7], right_v[7],
                             ((index_7 << upsample_shift) & 0x3F) >> 1);

  // 8x8 transpose.
  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]),
                                   vcombine_u8(value_v[1], value_v[5]));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]),
                                   vcombine_u8(value_v[3], value_v[7]));

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  if (width == 4) {
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
    if (height == 4) return;
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
  } else {
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
    if (height == 4) return;
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
  }
}

// Because the source values "move backwards" as the row index increases, the
// indices derived from ystep are generally negative. This is accommodated by
// making sure the relative indices are within [-15, 0] when the function is
// called, and sliding them into the inclusive range [0, 15], relative to a
// lower base address.
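// For example, a relative index of -3 becomes table index -3 + 15 == 12.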
constexpr int kPositiveIndexOffset = 15;

// Process 4 or 8 |width| by any |height|.
template <int width>
inline void DirectionalZone2FromLeftCol_WxH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
    const int upsample_shift) {
  assert(width == 4 || width == 8);

  // The shift argument must be a constant.
  int16x8_t offset_y, shift_upsampled = left_y;
  if (upsample_shift) {
    offset_y = vshrq_n_s16(left_y, 5);
    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
  } else {
    offset_y = vshrq_n_s16(left_y, 6);
  }

  // Select values to the left of the starting point.
  // The 15th element (and 16th) will be all the way at the end, to the right.
  // With a negative ystep everything else will be "left" of them.
  // This supports cumulative steps up to 15. We could support up to 16 by doing
  // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
  // registers as input which would allow for cumulative offsets of 32.
  const int16x8_t sampler =
      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset));
  const uint8x8_t left_values = vqmovun_s16(sampler);
  const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1));

  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
  const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1));
  const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul);

  int y = 0;
  do {
    uint8x8_t src_left, src_right;
    LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift),
                 left_values, right_values, &src_left, &src_right);
    const uint8x8_t val =
        WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul);

    if (width == 4) {
      StoreLo4(dst, val);
    } else {
      vst1_u8(dst, val);
    }
    dst += stride;
  } while (++y < height);
}

// Process 4 or 8 |width| by any |height|.
template <int width>
inline void DirectionalZone1Blend_WxH(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
    const uint8_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
    const int xstep, const int upsample_shift) {
  assert(width == 4 || width == 8);

  const int scale_bits_x = 6 - upsample_shift;

  const uint8x8_t all = vcreate_u8(0x0706050403020100);
  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
  const uint8x8_t base_step = upsample_shift ? even : all;
  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));

  int y = 0;
  do {
    const uint8_t* const src = top_row + (top_x >> scale_bits_x);
    uint8x8_t left, right;
    LoadStepwise(src, base_step, right_step, &left, &right);

    const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1;
    const uint8x8_t val = WeightedBlend(left, right, shift);

    uint8x8_t dst_blend = vld1_u8(dest);
    // |zone_bounds| values can be negative.
    uint8x8_t blend =
        vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6)));
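    // Lanes whose x position is at or beyond |zone_bounds >> 6| take the
    // top-derived |val|; the remaining lanes keep the left-derived values
    // already present in |dest|.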
    uint8x8_t output = vbsl_u8(blend, val, dst_blend);

    if (width == 4) {
      StoreLo4(dest, output);
    } else {
      vst1_u8(dest, output);
    }
    dest += stride;
    zone_bounds += xstep;
    top_x -= xstep;
  } while (++y < height);
}

// 7.11.2.4 (8) 90 < angle < 180
// The strategy for these functions (4xH and 8+xH) is to know how many blocks
// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
// then handle only blocks that take from |left_ptr|. Additionally, a fast
// index-shuffle approach is used for pred values from |left_column| in
// sections that permit it.
inline void DirectionalZone2_4xH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
    const uint8_t* LIBGAV1_RESTRICT const top_row,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep, const bool upsampled_top,
    const bool upsampled_left) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Helper vector.
  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};

  // Loop incrementers for moving by block (4xN). Vertical still steps by 8. If
  // it's only 4, it will be finished in the first iteration.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  const int min_height = (height == 4) ? 4 : 8;

  // All columns from |min_top_only_x| to the right will only need |top_row| to
  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
  // at least 3.
  assert(xstep >= 3);
  const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. The following values need the full ystep as a relative offset.
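  // For example, ystep == 100 gives ystep_remainder == 36, so left_y becomes
  // {-36, -136, -236, -336, -436, -536, -636, -736}.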
  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
  const int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);

  // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops, covers
  // blocks that have a mixture of values computed from top or left. The final
  // stage covers blocks that are only computed from the left.
  if (min_top_only_x > 0) {
    // Round down to the nearest multiple of 8 (or 4, if height is 4).
    const int max_top_only_y =
        std::min((1 << 6) / xstep, height) & ~(min_height - 1);
    DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
                            upsampled_top);

    if (max_top_only_y == height) return;

    int y = max_top_only_y;
    dst += stride * y;
    const int xstep_y = xstep * y;

    // All rows from |min_left_only_y| down for this set of columns only need
    // |left_column| to compute.
    const int min_left_only_y = std::min((4 << 6) / xstep, height);
    int xstep_bounds = xstep_bounds_base + xstep_y;
    int top_x = -xstep - xstep_y;

    // +8 increment is OK because if height is 4 this only goes once.
    for (; y < min_left_only_y;
         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
      DirectionalZone2FromLeftCol_WxH<4>(
          dst, stride, min_height,
          left_column + ((y - left_base_increment) << upsample_left_shift),
          left_y, upsample_left_shift);

      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
                                   xstep_bounds, top_x, xstep,
                                   upsample_top_shift);
    }

    // Loop over y for left_only rows.
    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
    for (; y < height; y += 8, dst += stride8) {
      DirectionalZone3_WxH<4>(
          dst, stride, min_height,
          left_column + ((y - left_base_increment) << upsample_left_shift),
          base_left_y, -ystep, upsample_left_shift);
    }
  } else {
    DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep,
                            upsampled_top);
  }
}

template <bool shuffle_left_column>
inline void DirectionalZone2_8xH(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint8_t* LIBGAV1_RESTRICT const top_row,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep, const int x, const int left_offset,
    const int xstep_bounds_base, const int16x8_t left_y,
    const bool upsampled_top, const bool upsampled_left) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Loop incrementers for moving by block (8x8). This function handles blocks
  // with height 4 as well. They are calculated in one pass so these variables
  // do not get used.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  // Cover 8x4 case.
  const int min_height = (height == 4) ? 4 : 8;

  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops, covers
  // blocks that have a mixture of values computed from top or left. The final
  // stage covers blocks that are only computed from the left.
  uint8_t* dst_x = dst + x;
  // Round down to the nearest multiple of 8 (or 4, if height is 4).
  const int max_top_only_y =
      std::min((1 << 6) / xstep, height) & ~(min_height - 1);
  DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
                          top_row + (x << upsample_top_shift), -xstep,
                          upsampled_top);

  if (max_top_only_y == height) return;

  int y = max_top_only_y;
  dst_x += stride * y;
  const int xstep_y = xstep * y;

  // All rows from |min_left_only_y| down for this set of columns only need
  // |left_column| to compute. Round up to the nearest 8.
  const int min_left_only_y =
      Align(std::min(((x + 8) << 6) / xstep, height), 8);
  int xstep_bounds = xstep_bounds_base + xstep_y;
  int top_x = -xstep - xstep_y;

  const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
  for (; y < min_left_only_y;
       y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
    if (shuffle_left_column) {
      DirectionalZone2FromLeftCol_WxH<8>(
          dst_x, stride, min_height,
          left_column + ((left_offset + y) << upsample_left_shift), left_y,
          upsample_left_shift);
    } else {
      DirectionalZone3_WxH<8>(
          dst_x, stride, min_height,
          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
          -ystep, upsample_left_shift);
    }

    DirectionalZone1Blend_WxH<8>(
        dst_x, stride, min_height, top_row + (x << upsample_top_shift),
        xstep_bounds, top_x, xstep, upsample_top_shift);
  }

  // Loop over y for left_only rows.
  for (; y < height; y += 8, dst_x += stride8) {
    DirectionalZone3_WxH<8>(
        dst_x, stride, min_height,
        left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
        -ystep, upsample_left_shift);
  }
}

// Process a multiple of 8 |width|.
inline void DirectionalZone2_WxH(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint8_t* LIBGAV1_RESTRICT const top_row,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int xstep, const int ystep,
    const bool upsampled_top, const bool upsampled_left) {
  const int ystep8 = ystep << 3;

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  const int left_base_increment8 = ystep8 >> 6;
  const int ystep_remainder8 = ystep8 & 0x3F;
  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. Following values need the full ystep as a relative offset.
  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
  int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);

  // For ystep > 90, at least two sets of 8 columns can be fully computed from
  // top_row only.
  const int min_top_only_x = std::min((height * xstep) >> 6, width);
  // Analysis finds that, for most angles (ystep < 132), all segments that use
  // both top_row and left_column can compute from left_column using byte
  // shuffles from a single vector. For steeper angles, the shuffle is also
  // fully reliable when x >= 32.
  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);

  // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops, covers
  // blocks that have a mixture of values computed from top or left. The final
  // stage covers blocks that are only computed from the left.
  int x = 0;
  for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
           xstep_bounds_base -= (8 << 6),
           left_y = vsubq_s16(left_y, increment_left8),
           left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height,
                                xstep, ystep, x, left_offset, xstep_bounds_base,
                                left_y, upsampled_top, upsampled_left);
  }
  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
           xstep_bounds_base -= (8 << 6),
           left_y = vsubq_s16(left_y, increment_left8),
           left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep,
                               ystep, x, left_offset, xstep_bounds_base, left_y,
                               upsampled_top, upsampled_left);
  }
  if (x < width) {
    const int upsample_top_shift = static_cast<int>(upsampled_top);
    DirectionalZone1_WxH(dst + x, stride, width - x, height,
                         top_row + (x << upsample_top_shift), -xstep,
                         upsampled_top);
  }
}

void DirectionalIntraPredictorZone2_NEON(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int xstep, const int ystep,
    const bool upsampled_top, const bool upsampled_left) {
  // Increasing the negative buffer for this function allows more rows to be
  // processed at a time without branching in an inner loop to check the base.
  uint8_t top_buffer[288];
  uint8_t left_buffer[288];
#if LIBGAV1_MSAN
  memset(top_buffer, 0, sizeof(top_buffer));
  memset(left_buffer, 0, sizeof(left_buffer));
#endif  // LIBGAV1_MSAN

  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
  const uint8_t* top_ptr = top_buffer + 144;
  const uint8_t* left_ptr = left_buffer + 144;
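  // |top_ptr| and |left_ptr| line up with the original |top_row| and
  // |left_column|, with 144 bytes of in-buffer headroom below each pointer for
  // the negative indices that steep angles generate.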
  auto* dst = static_cast<uint8_t*>(dest);

  if (width == 4) {
    DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
                         upsampled_top, upsampled_left);
  } else {
    DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep,
                         ystep, upsampled_top, upsampled_left);
  }
}

void DirectionalIntraPredictorZone3_NEON(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int ystep, const bool upsampled_left) {
  const auto* const left = static_cast<const uint8_t*>(left_column);

  assert(ystep > 0);

  const int upsample_shift = static_cast<int>(upsampled_left);
  const int scale_bits = 6 - upsample_shift;
  const int base_step = 1 << upsample_shift;

  if (width == 4 || height == 4) {
    // This block can handle all sizes but the specializations for other sizes
    // are faster.
    const uint8x8_t all = vcreate_u8(0x0706050403020100);
    const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
    const uint8x8_t base_step_v = upsampled_left ? even : all;
    const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1));

    int y = 0;
    do {
      int x = 0;
      do {
        auto* dst = static_cast<uint8_t*>(dest);
        dst += y * stride + x;
        uint8x8_t left_v[4], right_v[4], value_v[4];
        const int ystep_base = ystep * x;
        const int offset = y * base_step;

        const int index_0 = ystep_base + ystep * 1;
        LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v,
                     right_step, &left_v[0], &right_v[0]);
        value_v[0] = WeightedBlend(left_v[0], right_v[0],
                                   ((index_0 << upsample_shift) & 0x3F) >> 1);

        const int index_1 = ystep_base + ystep * 2;
        LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v,
                     right_step, &left_v[1], &right_v[1]);
        value_v[1] = WeightedBlend(left_v[1], right_v[1],
                                   ((index_1 << upsample_shift) & 0x3F) >> 1);

        const int index_2 = ystep_base + ystep * 3;
        LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v,
                     right_step, &left_v[2], &right_v[2]);
        value_v[2] = WeightedBlend(left_v[2], right_v[2],
                                   ((index_2 << upsample_shift) & 0x3F) >> 1);

        const int index_3 = ystep_base + ystep * 4;
        LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v,
                     right_step, &left_v[3], &right_v[3]);
        value_v[3] = WeightedBlend(left_v[3], right_v[3],
                                   ((index_3 << upsample_shift) & 0x3F) >> 1);

        // 8x4 transpose.
        const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]);
        const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]);

        const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]),
                                         vreinterpret_u16_u8(b1.val[0]));
        const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]),
                                         vreinterpret_u16_u8(b1.val[1]));

        StoreLo4(dst, vreinterpret_u8_u16(c0.val[0]));
        dst += stride;
        StoreLo4(dst, vreinterpret_u8_u16(c1.val[0]));
        dst += stride;
        StoreLo4(dst, vreinterpret_u8_u16(c0.val[1]));
        dst += stride;
        StoreLo4(dst, vreinterpret_u8_u16(c1.val[1]));

        if (height > 4) {
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c0.val[0]));
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c1.val[0]));
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c0.val[1]));
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c1.val[1]));
        }
        x += 4;
      } while (x < width);
      y += 8;
    } while (y < height);
  } else {  // 8x8 at a time.
    // Limited improvement for 8x8. ~20% faster for 64x64.
    int y = 0;
    do {
      int x = 0;
      do {
        auto* dst = static_cast<uint8_t*>(dest);
        dst += y * stride + x;
        const int ystep_base = ystep * (x + 1);

        DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift),
                                ystep_base, ystep, upsample_shift);
        x += 8;
      } while (x < width);
      y += 8;
    } while (y < height);
  }
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
}

}  // namespace
}  // namespace low_bitdepth

#if LIBGAV1_MAX_BITDEPTH >= 10
namespace high_bitdepth {
namespace {

// Blend two values based on weights that sum to 32.
inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
                                const int a_weight, const int b_weight) {
  const uint16x4_t a_product = vmul_n_u16(a, a_weight);
  const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);

  return vrshr_n_u16(sum, 5 /*log2(32)*/);
}

// Blend two values based on weights that sum to 32.
inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
                                const uint16_t a_weight,
                                const uint16_t b_weight) {
  const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
  const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);

  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
}

// Blend two values based on weights that sum to 32.
inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
                                const uint16x8_t a_weight,
                                const uint16x8_t b_weight) {
  const uint16x8_t a_product = vmulq_u16(a, a_weight);
  const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);

  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
}

// Each element of |dest| contains values associated with one weight value.
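// When |upsampled| is true the edge is stored at double resolution, so vld2
// deinterleaves it: val[0] receives the even-indexed samples and val[1] the
// odd-indexed samples that pair with them for blending.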
inline void LoadEdgeVals(uint16x4x2_t* dest,
                         const uint16_t* LIBGAV1_RESTRICT const source,
                         const bool upsampled) {
  if (upsampled) {
    *dest = vld2_u16(source);
  } else {
    dest->val[0] = vld1_u16(source);
    dest->val[1] = vld1_u16(source + 1);
  }
}

// Each element of |dest| contains values associated with one weight value.
inline void LoadEdgeVals(uint16x8x2_t* dest,
                         const uint16_t* LIBGAV1_RESTRICT const source,
                         const bool upsampled) {
  if (upsampled) {
    *dest = vld2q_u16(source);
  } else {
    dest->val[0] = vld1q_u16(source);
    dest->val[1] = vld1q_u16(source + 1);
  }
}

// For Wx4 blocks, load the source for 2 columns. The source for the second
// column is held in the high half of each vector.
inline void LoadEdgeVals2x4(uint16x8x2_t* dest,
                            const uint16_t* LIBGAV1_RESTRICT const source_low,
                            const uint16_t* LIBGAV1_RESTRICT const source_high,
                            const bool upsampled) {
  if (upsampled) {
    const uint16x4x2_t low = vld2_u16(source_low);
    const uint16x4x2_t high = vld2_u16(source_high);
    dest->val[0] = vcombine_u16(low.val[0], high.val[0]);
    dest->val[1] = vcombine_u16(low.val[1], high.val[1]);
  } else {
    dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high));
    dest->val[1] =
        vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1));
  }
}

template <bool upsampled>
inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride, const int height,
                                 const uint16_t* LIBGAV1_RESTRICT const top,
                                 const int xstep) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int index_scale_bits = 6 - upsample_shift;

  const int max_base_x = (4 + height - 1) << upsample_shift;
  const int16x4_t max_base = vdup_n_s16(max_base_x);
  const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
  const int16x4_t index_offset = {0, 1, 2, 3};

  // All rows from |min_corner_only_y| down will simply use Memset.
  // |max_base_x| is always greater than |height|, so clipping the denominator
  // to 1 is enough to make the logic work.
  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);

  int top_x = xstep;
  int y = 0;
  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
    const int top_base_x = top_x >> index_scale_bits;

    // To accommodate reuse of this function in Zone2, permit negative values
    // for |xstep|.
    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
    const uint16_t shift_1 = 32 - shift_0;

    // Use signed values to compare |top_base_x| to |max_base_x|.
    const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
    const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);

    uint16x4x2_t sampled_top_row;
    LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
    const uint16x4_t combined = WeightedBlend(
        sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);

    // If |upsampled| is true then extract every other value for output.
    const uint16x4_t masked_result =
        vbsl_u16(max_base_mask, combined, final_top_val);

    vst1_u16(dst, masked_result);
  }
  for (; y < height; ++y) {
    Memset(dst, top[max_base_x], 4 /* width */);
    dst += stride;
  }
}

// Process a multiple of 8 |width| by any |height|. Processes horizontally
// before vertically in the hopes of being a little more cache friendly.
template <bool upsampled>
inline void DirectionalZone1_WxH(uint16_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride, const int width,
                                 const int height,
                                 const uint16_t* LIBGAV1_RESTRICT const top,
                                 const int xstep) {
  assert(width % 8 == 0);
  const int upsample_shift = static_cast<int>(upsampled);
  const int index_scale_bits = 6 - upsample_shift;

  const int max_base_index = (width + height - 1) << upsample_shift;
  const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
  const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
  const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};

  const int base_step = 1 << upsample_shift;
  const int base_step8 = base_step << 3;
  const int16x8_t block_step = vdupq_n_s16(base_step8);
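  // |base_step8| is the number of edge samples consumed by 8 output columns:
  // 8 normally, 16 when the edge is upsampled.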
1032
1033 // All rows from |min_corner_only_y| down will simply use Memset.
1034 // |max_base_x| is always greater than |height|, so clipping the denominator
1035 // to 1 is enough to make the logic work.
1036 const int xstep_units = std::max(xstep >> index_scale_bits, 1);
1037 const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
1038
1039 int top_x = xstep;
1040 int y = 0;
1041 for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
1042 int top_base_x = top_x >> index_scale_bits;
1043
1044 // To accommodate reuse of this function in Zone2, permit negative values
1045 // for |xstep|.
1046 const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1047 const uint16_t shift_1 = 32 - shift_0;
1048
1049 // Use signed values to compare |top_base_x| to |max_base_x|.
1050 int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
1051
1052 int x = 0;
1053 do {
1054 const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
1055
1056 uint16x8x2_t sampled_top_row;
1057 LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
1058 const uint16x8_t combined = WeightedBlend(
1059 sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
1060
1061 const uint16x8_t masked_result =
1062 vbslq_u16(max_base_mask, combined, final_top_val);
1063 vst1q_u16(dst + x, masked_result);
1064
1065 base_x = vaddq_s16(base_x, block_step);
1066 top_base_x += base_step8;
1067 x += 8;
1068 } while (x < width);
1069 }
1070 for (int i = y; i < height; ++i) {
1071 Memset(dst, top[max_base_index], width);
1072 dst += stride;
1073 }
1074 }
1075
1076 // Process a multiple of 8 |width| by any |height|. Processes horizontally
1077 // before vertically in the hopes of being a little more cache friendly.
DirectionalZone1_Large(uint16_t * LIBGAV1_RESTRICT dst,const ptrdiff_t stride,const int width,const int height,const uint16_t * LIBGAV1_RESTRICT const top,const int xstep,const bool upsampled)1078 inline void DirectionalZone1_Large(uint16_t* LIBGAV1_RESTRICT dst,
1079 const ptrdiff_t stride, const int width,
1080 const int height,
1081 const uint16_t* LIBGAV1_RESTRICT const top,
1082 const int xstep, const bool upsampled) {
1083 assert(width % 8 == 0);
1084 const int upsample_shift = static_cast<int>(upsampled);
1085 const int index_scale_bits = 6 - upsample_shift;
1086
1087 const int max_base_index = (width + height - 1) << upsample_shift;
1088 const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
1089 const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
1090 const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
1091
1092 const int base_step = 1 << upsample_shift;
1093 const int base_step8 = base_step << 3;
1094 const int16x8_t block_step = vdupq_n_s16(base_step8);
1095
1096 // All rows from |min_corner_only_y| down will simply use Memset.
1097 // |max_base_x| is always greater than |height|, so clipping the denominator
1098 // to 1 is enough to make the logic work.
1099 const int xstep_units = std::max(xstep >> index_scale_bits, 1);
1100 const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
1101
1102 // Rows up to this y-value can be computed without checking for bounds.
1103 const int max_no_corner_y = std::min(
1104 ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
1105 height);
1106 // No need to check for exceeding |max_base_x| in the first loop.
1107 int y = 0;
1108 int top_x = xstep;
1109 for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
1110 int top_base_x = top_x >> index_scale_bits;
1111 // To accommodate reuse of this function in Zone2, permit negative values
1112 // for |xstep|.
1113 const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1114 const uint16_t shift_1 = 32 - shift_0;
1115
1116 int x = 0;
1117 do {
1118 uint16x8x2_t sampled_top_row;
1119 LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
1120 const uint16x8_t combined = WeightedBlend(
1121 sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
1122
1123 vst1q_u16(dst + x, combined);
1124
1125 top_base_x += base_step8;
1126 x += 8;
1127 } while (x < width);
1128 }
1129
1130 for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
1131 int top_base_x = top_x >> index_scale_bits;
1132
1133 // To accommodate reuse of this function in Zone2, permit negative values
1134 // for |xstep|.
1135 const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
1136 const uint16_t shift_1 = 32 - shift_0;
1137
1138 // Use signed values to compare |top_base_x| to |max_base_x|.
1139 int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
1140
1141 int x = 0;
1142 const int min_corner_only_x =
1143 std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
1144 ~7;
1145 for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
1146 base_x = vaddq_s16(base_x, block_step)) {
1147 const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
1148
1149 uint16x8x2_t sampled_top_row;
1150 LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
1151 const uint16x8_t combined = WeightedBlend(
1152 sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
1153
1154 const uint16x8_t masked_result =
1155 vbslq_u16(max_base_mask, combined, final_top_val);
1156 vst1q_u16(dst + x, masked_result);
1157 }
1158 // Corner-only section of the row.
1159 Memset(dst + x, top[max_base_index], width - x);
1160 }
1161 for (; y < height; ++y) {
1162 Memset(dst, top[max_base_index], width);
1163 dst += stride;
1164 }
1165 }
1166
DirectionalIntraPredictorZone1_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const int width,const int height,const int xstep,const bool upsampled_top)1167 void DirectionalIntraPredictorZone1_NEON(
1168 void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1169 const void* LIBGAV1_RESTRICT const top_row, const int width,
1170 const int height, const int xstep, const bool upsampled_top) {
1171 const auto* const top = static_cast<const uint16_t*>(top_row);
1172 auto* dst = static_cast<uint16_t*>(dest);
1173 stride /= sizeof(top[0]);
1174
1175 assert(xstep > 0);
1176
1177 if (xstep == 64) {
1178 assert(!upsampled_top);
1179 const uint16_t* top_ptr = top + 1;
1180 const int width_bytes = width * sizeof(top[0]);
1181 int y = height;
1182 do {
1183 memcpy(dst, top_ptr, width_bytes);
1184 memcpy(dst + stride, top_ptr + 1, width_bytes);
1185 memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
1186 memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
1187 dst += 4 * stride;
1188 top_ptr += 4;
1189 y -= 4;
1190 } while (y != 0);
1191 } else {
1192 if (width == 4) {
1193 if (upsampled_top) {
1194 DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
1195 } else {
1196 DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
1197 }
1198 } else if (width >= 32) {
1199 if (upsampled_top) {
1200 DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
1201 } else {
1202 DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
1203 }
1204 } else if (upsampled_top) {
1205 DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
1206 } else {
1207 DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
1208 }
1209 }
1210 }
1211
1212 // -----------------------------------------------------------------------------
1213 // Zone 3
1214 // This can be considered "the transpose of Zone 1." In Zone 1, the fractional
1215 // step applies when moving vertically in the destination block, connected to
1216 // the change in |y|, whereas in this mode, the step applies when moving
1217 // horizontally, connected to the change in |x|. This makes vectorization very
1218 // complicated in row-order, because a given vector may need source pixels that
1219 // span 16 or 32 pixels in steep angles, requiring multiple expensive table
1220 // lookups and checked loads. Rather than work in row order, it is simpler to
1221 // compute |dest| in column order, and then store the transposed results.
1222
1223 // Compute 4x4 sub-blocks.
1224 // Example of computed sub-blocks of a 4x8 block before and after transpose:
1225 // 00 10 20 30 00 01 02 03
1226 // 01 11 21 31 10 11 12 13
1227 // 02 12 22 32 20 21 22 23
1228 // 03 13 23 33 30 31 32 33
1229 // ----------- --> -----------
1230 // 40 50 60 70 40 41 42 43
1231 // 41 51 61 71 50 51 52 53
1232 // 42 52 62 72 60 61 62 63
1233 // 43 53 63 73 70 71 72 73
1234 template <bool upsampled>
DirectionalZone3_4x4(uint8_t * LIBGAV1_RESTRICT dst,const ptrdiff_t stride,const uint16_t * LIBGAV1_RESTRICT const left,const int ystep,const int base_left_y=0)1235 inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
1236 const ptrdiff_t stride,
1237 const uint16_t* LIBGAV1_RESTRICT const left,
1238 const int ystep, const int base_left_y = 0) {
1239 const int upsample_shift = static_cast<int>(upsampled);
1240 const int index_scale_bits = 6 - upsample_shift;
1241
1242 // Compute one column at a time, then transpose for storage.
1243 uint16x4_t result[4];
1244
1245 int left_y = base_left_y + ystep;
1246 int left_offset = left_y >> index_scale_bits;
1247 int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
1248 int shift_1 = 32 - shift_0;
1249 uint16x4x2_t sampled_left_col;
1250 LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
1251 result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
1252 shift_1, shift_0);
1253
1254 left_y += ystep;
1255 left_offset = left_y >> index_scale_bits;
1256 shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
1257 shift_1 = 32 - shift_0;
1258 LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
1259 result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
1260 shift_1, shift_0);
1261
1262 left_y += ystep;
1263 left_offset = left_y >> index_scale_bits;
1264 shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
1265 shift_1 = 32 - shift_0;
1266 LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
1267 result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
1268 shift_1, shift_0);
1269
1270 left_y += ystep;
1271 left_offset = left_y >> index_scale_bits;
1272 shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
1273 shift_1 = 32 - shift_0;
1274 LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
1275 result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
1276 shift_1, shift_0);
1277
1278 Transpose4x4(result);
1279 Store4(dst, result[0]);
1280 dst += stride;
1281 Store4(dst, result[1]);
1282 dst += stride;
1283 Store4(dst, result[2]);
1284 dst += stride;
1285 Store4(dst, result[3]);
1286 }
1287
1288 template <bool upsampled>
DirectionalZone3_8x4(uint8_t * LIBGAV1_RESTRICT dst,const ptrdiff_t stride,const uint16_t * LIBGAV1_RESTRICT const left,const int ystep,const int base_left_y=0)1289 inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst,
1290 const ptrdiff_t stride,
1291 const uint16_t* LIBGAV1_RESTRICT const left,
1292 const int ystep, const int base_left_y = 0) {
1293 const int upsample_shift = static_cast<int>(upsampled);
1294 const int index_scale_bits = 6 - upsample_shift;
1295 const uint16x8_t inverter = vdupq_n_u16(32);

  uint16x8x2_t sampled_left_col;
  // Compute two columns at a time, then transpose for storage.
  uint16x8_t result[4];

  // The low half of pre-transpose vectors contains columns 0 through 3.
  int left_y_low = base_left_y + ystep;
  int left_offset_low = left_y_low >> index_scale_bits;
  int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;

  // The high half of pre-transpose vectors contains columns 4 through 7.
  int left_y_high = left_y_low + (ystep << 2);
  int left_offset_high = left_y_high >> index_scale_bits;
  int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
  uint16x8_t weights_0 =
      vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
  uint16x8_t weights_1 = vsubq_u16(inverter, weights_0);
  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
                  &left[left_offset_high], upsampled);
  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            weights_1, weights_0);

  left_y_low += ystep;
  left_offset_low = left_y_low >> index_scale_bits;
  shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;

  left_y_high += ystep;
  left_offset_high = left_y_high >> index_scale_bits;
  shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
  weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
  weights_1 = vsubq_u16(inverter, weights_0);
  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
                  &left[left_offset_high], upsampled);
  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            weights_1, weights_0);

  left_y_low += ystep;
  left_offset_low = left_y_low >> index_scale_bits;
  shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;

  left_y_high += ystep;
  left_offset_high = left_y_high >> index_scale_bits;
  shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
  weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
  weights_1 = vsubq_u16(inverter, weights_0);
  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
                  &left[left_offset_high], upsampled);
  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            weights_1, weights_0);

  left_y_low += ystep;
  left_offset_low = left_y_low >> index_scale_bits;
  shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;

  left_y_high += ystep;
  left_offset_high = left_y_high >> index_scale_bits;
  shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
  weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
  weights_1 = vsubq_u16(inverter, weights_0);
  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
                  &left[left_offset_high], upsampled);
  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            weights_1, weights_0);

  Transpose4x8(result);
  Store8(dst, result[0]);
  dst += stride;
  Store8(dst, result[1]);
  dst += stride;
  Store8(dst, result[2]);
  dst += stride;
  Store8(dst, result[3]);
}

template <bool upsampled>
inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep, const int base_left_y = 0) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int index_scale_bits = 6 - upsample_shift;

  // Compute one column at a time, then transpose for storage.
  uint16x8_t result[4];

  int left_y = base_left_y + ystep;
  int left_offset = left_y >> index_scale_bits;
  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  int shift_1 = 32 - shift_0;
  uint16x8x2_t sampled_left_col;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  Transpose4x8(result);
  Store4(dst, vget_low_u16(result[0]));
  dst += stride;
  Store4(dst, vget_low_u16(result[1]));
  dst += stride;
  Store4(dst, vget_low_u16(result[2]));
  dst += stride;
  Store4(dst, vget_low_u16(result[3]));
  dst += stride;
  Store4(dst, vget_high_u16(result[0]));
  dst += stride;
  Store4(dst, vget_high_u16(result[1]));
  dst += stride;
  Store4(dst, vget_high_u16(result[2]));
  dst += stride;
  Store4(dst, vget_high_u16(result[3]));
}

template <bool upsampled>
inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
                                 const ptrdiff_t stride, const int height,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep) {
  assert(height == 8 || height == 16);
  const int upsample_shift = static_cast<int>(upsampled);
  DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep);
  if (height == 16) {
    dest += stride << 3;
    DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift),
                                    ystep);
  }
}

template <bool upsampled>
inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
                                 const ptrdiff_t stride, const int width,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep) {
  assert(width <= 16);
  if (width == 4) {
    DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep);
    return;
  }
  DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep);
  if (width == 16) {
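    // The second 8-wide block sits 8 columns further along the prediction
    // direction, so its starting position on the left edge is offset by
    // 8 * ystep (in 6-bit fixed point).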
    const int base_left_y = ystep << 3;
    DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left,
                                    ystep, base_left_y);
  }
}

template <bool upsampled>
inline void DirectionalZone3_8x8(uint8_t* LIBGAV1_RESTRICT dest,
                                 const ptrdiff_t stride,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep, const int base_left_y = 0) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int index_scale_bits = 6 - upsample_shift;

  // Compute one column at a time, then transpose for storage.
  uint16x8_t result[8];

  int left_y = base_left_y + ystep;
  uint16x8x2_t sampled_left_col;
  int left_offset = left_y >> index_scale_bits;
  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  int shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);
  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  Transpose8x8(result);
  Store8(dest, result[0]);
  dest += stride;
  Store8(dest, result[1]);
  dest += stride;
  Store8(dest, result[2]);
  dest += stride;
  Store8(dest, result[3]);
  dest += stride;
  Store8(dest, result[4]);
  dest += stride;
  Store8(dest, result[5]);
  dest += stride;
  Store8(dest, result[6]);
  dest += stride;
  Store8(dest, result[7]);
}

template <bool upsampled>
inline void DirectionalZone3_WxH(uint8_t* LIBGAV1_RESTRICT dest,
                                 const ptrdiff_t stride, const int width,
                                 const int height,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep) {
  const int upsample_shift = static_cast<int>(upsampled);
  // Zone3 never runs out of left_column values.
  assert((width + height - 1) << upsample_shift >  // max_base_y
         ((ystep * width) >> (6 - upsample_shift)) +
             (/* base_step */ 1 << upsample_shift) *
                 (height - 1));  // left_base_y
  int y = 0;
  do {
    int x = 0;
    uint8_t* dst_x = dest + y * stride;
    do {
      const int base_left_y = ystep * x;
      DirectionalZone3_8x8<upsampled>(
          dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
      dst_x += 8 * sizeof(uint16_t);
      x += 8;
    } while (x < width);
    y += 8;
  } while (y < height);
}

void DirectionalIntraPredictorZone3_NEON(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int ystep, const bool upsampled_left) {
  const auto* const left = static_cast<const uint16_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);

  if (ystep == 64) {
    assert(!upsampled_left);
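    // ystep == 64 is the pure diagonal: each output row is the left column
    // shifted down by one pixel, i.e. dst(x, y) = left[x + y + 1], so whole
    // rows can be copied with memcpy.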
    const int width_bytes = width * sizeof(left[0]);
    int y = height;
    // |left_ptr| must advance across loop iterations; declaring it inside the
    // loop would copy the same four rows repeatedly.
    const uint16_t* left_ptr = left + 1;
    do {
      memcpy(dst, left_ptr, width_bytes);
      memcpy(dst + stride, left_ptr + 1, width_bytes);
      memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
      memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
      dst += 4 * stride;
      left_ptr += 4;
      y -= 4;
    } while (y != 0);
    return;
  }
  if (height == 4) {
    if (upsampled_left) {
      DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
    } else {
      DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
    }
  } else if (width == 4) {
    if (upsampled_left) {
      DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
    } else {
      DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
    }
  } else {
    if (upsampled_left) {
      // |upsampled_left| can only be true if |width| + |height| <= 16,
      // therefore this is 8x8.
      DirectionalZone3_8x8<true>(dst, stride, left, ystep);
    } else {
      DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
    }
  }
}

// -----------------------------------------------------------------------------
// Zone2
// These functions deal with cases not found in zone 1 or zone 3. The extreme
// angles are 93, which makes for sharp ascents along |left_column| with each
// successive dest row element until reaching |top_row|, and 177, with a
// shallow ascent up |left_column| until reaching large jumps along |top_row|.
// In the extremely steep cases, source vectors can only be loaded one lane at
// a time.
// Fill |left| and |right| with the appropriate values for a given |base_step|.
inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
                         const uint8x8_t left_step, const uint8x8_t right_step,
                         uint16x4_t* left, uint16x4_t* right) {
  const uint8x16x2_t mixed = {
      vld1q_u8(static_cast<const uint8_t*>(source)),
      vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
  *left = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step));
  *right = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step));
}

inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
                         const uint8x8_t left_step_0,
                         const uint8x8_t right_step_0,
                         const uint8x8_t left_step_1,
                         const uint8x8_t right_step_1, uint16x8_t* left,
                         uint16x8_t* right) {
  const uint8x16x2_t mixed = {
      vld1q_u8(static_cast<const uint8_t*>(source)),
      vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
  const uint16x4_t left_low = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_0));
  const uint16x4_t left_high =
      vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_1));
  *left = vcombine_u16(left_low, left_high);
  const uint16x4_t right_low =
      vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_0));
  const uint16x4_t right_high =
      vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_1));
  *right = vcombine_u16(right_low, right_high);
}

// Blend two values based on weight pairs that each sum to 32.
inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
                                const uint16x4_t a_weight,
                                const uint16x4_t b_weight) {
  const uint16x4_t a_product = vmul_u16(a, a_weight);
  const uint16x4_t sum = vmla_u16(a_product, b, b_weight);

  return vrshr_n_u16(sum, 5 /*log2(32)*/);
}

// Because the source values "move backwards" as the row index increases, the
// indices derived from ystep are generally negative in localized functions.
// This is accommodated by making sure the relative indices are within [-15, 0]
// when the function is called, and sliding them into the inclusive range
// [0, 15], relative to a lower base address. 15 is the Pixel offset, so 30 is
// the byte offset for table lookups.

constexpr int kPositiveIndexOffsetPixels = 15;
constexpr int kPositiveIndexOffsetBytes = 30;
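// For example, a relative index of -3 pixels becomes table index
// 15 + (-3) = 12, i.e. bytes 24 and 25 of the 32-byte lookup source, once the
// load base has been lowered by kPositiveIndexOffsetPixels.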

inline void DirectionalZone2FromLeftCol_4xH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x4_t left_y,
    const bool upsampled) {
  const int upsample_shift = static_cast<int>(upsampled);

  const int index_scale_bits = 6;
  // The values in |offset_y| are negative, except for the first element, which
  // is zero.
  int16x4_t offset_y;
  int16x4_t shift_upsampled = left_y;
  // The shift argument of vshr_n_s16 must be an immediate constant, so
  // |upsample_shift| cannot be used in the shift amount directly.
  if (upsampled) {
    offset_y = vshr_n_s16(left_y, index_scale_bits - 1 /*upsample_shift*/);
    shift_upsampled = vshl_n_s16(shift_upsampled, 1);
  } else {
    offset_y = vshr_n_s16(left_y, index_scale_bits);
  }
  offset_y = vshl_n_s16(offset_y, 1);
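  // Doubling converts |offset_y| from pixel offsets to byte offsets, since
  // each 10-bit pixel occupies two bytes in the table lookup.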

  // Select values to the left of the starting point.
  // The 15th element (and 16th) will be all the way at the end, to the
  // right. With a negative ystep everything else will be "left" of them.
  // This supports cumulative steps up to 15. We could support up to 16 by
  // doing separate loads for |left_values| and |right_values|. vtbl
  // supports 2 Q registers as input which would allow for cumulative
  // offsets of 32.
  // |sampler_0| indexes the first byte of each 16-bit value.
  const int16x4_t sampler_0 =
      vadd_s16(offset_y, vdup_n_s16(kPositiveIndexOffsetBytes));
  // |sampler_1| indexes the second byte of each 16-bit value.
  const int16x4_t sampler_1 = vadd_s16(sampler_0, vdup_n_s16(1));
  const int16x4x2_t sampler = vzip_s16(sampler_0, sampler_1);
  const uint8x8_t left_indices =
      vqmovun_s16(vcombine_s16(sampler.val[0], sampler.val[1]));
  const uint8x8_t right_indices =
      vadd_u8(left_indices, vdup_n_u8(sizeof(uint16_t)));

  const int16x4_t shift_masked = vand_s16(shift_upsampled, vdup_n_s16(0x3f));
  const uint16x4_t shift_0 = vreinterpret_u16_s16(vshr_n_s16(shift_masked, 1));
  const uint16x4_t shift_1 = vsub_u16(vdup_n_u16(32), shift_0);

  int y = 0;
  do {
    uint16x4_t src_left, src_right;
    LoadStepwise(
        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
        left_indices, right_indices, &src_left, &src_right);
    const uint16x4_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);

    Store4(dst, val);
    dst += stride;
  } while (++y < height);
}

inline void DirectionalZone2FromLeftCol_8x8(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
    const bool upsampled) {
  const int upsample_shift = static_cast<int>(upsampled);

  const int index_scale_bits = 6;
  // The values in |offset_y| are negative, except for the first element, which
  // is zero.
  int16x8_t offset_y;
  int16x8_t shift_upsampled = left_y;
  // The shift argument of vshrq_n_s16 must be an immediate constant, so
  // |upsample_shift| cannot be used in the shift amount directly.
  if (upsampled) {
    offset_y = vshrq_n_s16(left_y, index_scale_bits - 1);
    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
  } else {
    offset_y = vshrq_n_s16(left_y, index_scale_bits);
  }
  offset_y = vshlq_n_s16(offset_y, 1);

  // Select values to the left of the starting point.
  // The 15th element (and 16th) will be all the way at the end, to the right.
  // With a negative ystep everything else will be "left" of them.
  // This supports cumulative steps up to 15. We could support up to 16 by
  // doing separate loads for |left_values| and |right_values|. vtbl supports
  // 2 Q registers as input which would allow for cumulative offsets of 32.
  // |sampler_0| indexes the first byte of each 16-bit value.
  const int16x8_t sampler_0 =
      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffsetBytes));
  // |sampler_1| indexes the second byte of each 16-bit value.
  const int16x8_t sampler_1 = vaddq_s16(sampler_0, vdupq_n_s16(1));
  const int16x8x2_t sampler = vzipq_s16(sampler_0, sampler_1);
  const uint8x8_t left_values_0 = vqmovun_s16(sampler.val[0]);
  const uint8x8_t left_values_1 = vqmovun_s16(sampler.val[1]);
  const uint8x8_t right_values_0 =
      vadd_u8(left_values_0, vdup_n_u8(sizeof(uint16_t)));
  const uint8x8_t right_values_1 =
      vadd_u8(left_values_1, vdup_n_u8(sizeof(uint16_t)));

  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
  const uint16x8_t shift_0 =
      vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
  const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);

  for (int y = 0; y < 8; ++y) {
    uint16x8_t src_left, src_right;
    LoadStepwise(
        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
        left_values_0, right_values_0, left_values_1, right_values_1, &src_left,
        &src_right);
    const uint16x8_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);

    Store8(dst, val);
    dst += stride;
  }
}

template <bool upsampled>
inline void DirectionalZone1Blend_4xH(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
    const int xstep) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits_x = 6 - upsample_shift;

  // Representing positions along the row, which |zone_bounds| will target for
  // the blending boundary.
  const int16x4_t indices = {0, 1, 2, 3};
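  // Each row, a destination pixel at position x keeps the value already
  // written from |left_column| when x < (zone_bounds >> 6), and takes the
  // Zone1 (top_row) value otherwise, via the vbsl mask below.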

  uint16x4x2_t top_vals;
  int y = height;
  do {
    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
    LoadEdgeVals(&top_vals, src, upsampled);

    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
    const uint16_t shift_1 = 32 - shift_0;

    const uint16x4_t val =
        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);

    const uint16x4_t dst_blend = Load4U16(dest);
    // |zone_bounds| values can be negative.
    const uint16x4_t blend = vcge_s16(indices, vdup_n_s16(zone_bounds >> 6));
    const uint16x4_t output = vbsl_u16(blend, val, dst_blend);

    Store4(dest, output);
    dest += stride;
    zone_bounds += xstep;
    top_x -= xstep;
  } while (--y != 0);
}

template <bool upsampled>
inline void DirectionalZone1Blend_8x8(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
    const int xstep) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits_x = 6 - upsample_shift;

  // Representing positions along the row, which |zone_bounds| will target for
  // the blending boundary.
  const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};

  uint16x8x2_t top_vals;
  for (int y = 0; y < 8; ++y) {
    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
    LoadEdgeVals(&top_vals, src, upsampled);

    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
    const uint16_t shift_1 = 32 - shift_0;

    const uint16x8_t val =
        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);

    const uint16x8_t dst_blend = Load8U16(dest);
    // |zone_bounds| values can be negative.
    const uint16x8_t blend = vcgeq_s16(indices, vdupq_n_s16(zone_bounds >> 6));
    const uint16x8_t output = vbslq_u16(blend, val, dst_blend);

    Store8(dest, output);
    dest += stride;
    zone_bounds += xstep;
    top_x -= xstep;
  }
}

// 7.11.2.4 (8) 90 < angle < 180
// The strategy for these functions (4xH and 8+xH) is to know how many blocks
// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
// then handle only blocks that take from |left_ptr|. Additionally, a fast
// index-shuffle approach is used for pred values from |left_column| in sections
// that permit it.
template <bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_4xH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);

  // Helper vector for index computation.
  const int16x4_t zero_to_three = {0, 1, 2, 3};

  // Loop increments for moving by block (4xN). Vertical still steps by 8. If
  // it's only 4, it will be finished in the first iteration.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  const int min_height = (height == 4) ? 4 : 8;

  // All columns from |min_top_only_x| to the right will only need |top_row| to
  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
  // at least 3.
  assert(xstep >= 3);

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. The following values need the full ystep as a relative offset.
  const int16x4_t left_y =
      vmla_n_s16(vdup_n_s16(-ystep_remainder), zero_to_three, -ystep);
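  // For example, with ystep == 90: left_base_increment is 90 >> 6 = 1 and
  // ystep_remainder is 90 & 0x3F = 26, so left_y = {-26, -116, -206, -296},
  // i.e. each successive column starts a full ystep further up the left edge.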

  // This loop treats the 4 columns in 3 stages with y-value boundaries.
  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops, covers
  // blocks that have a mixture of values computed from top or left. The final
  // stage covers blocks that are only computed from the left.
  // Round down to the nearest multiple of 8 (or 4, if height is 4).
  const int max_top_only_y =
      std::min((1 << 6) / xstep, height) & ~(min_height - 1);
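  // E.g. with xstep == 4, (1 << 6) / 4 = 16 rows come purely from |top_row|
  // (capped by |height|); with xstep >= 64 the quotient is at most 1 and the
  // rounding mask drops it to zero.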
  DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
                                      stride >> 1, max_top_only_y, top_row,
                                      -xstep);

  if (max_top_only_y == height) return;

  int y = max_top_only_y;
  dst += stride * y;
  const int xstep_y = xstep * y;

  // All rows from |min_left_only_y| down for this set of columns only need
  // |left_column| to compute.
  const int min_left_only_y = std::min((4 /*width*/ << 6) / xstep, height);
  int xstep_bounds = xstep_bounds_base + xstep_y;
  int top_x = -xstep - xstep_y;

  // +8 increment is OK because if height is 4 this only runs once.
  for (; y < min_left_only_y;
       y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
    DirectionalZone2FromLeftCol_4xH(
        dst, stride, min_height,
        left_column + ((y - left_base_increment) << upsample_left_shift),
        left_y, upsampled_left);

    DirectionalZone1Blend_4xH<upsampled_top>(dst, stride, min_height, top_row,
                                             xstep_bounds, top_x, xstep);
  }

  // Left-only section. Because all block heights other than 4 are multiples
  // of 8, |height| - |y| == 4 can only occur when y == 0 and height == 4.
  if (height - y == 4) {
    DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep);
    return;
  }
  if (y < height) {
    DirectionalZone3_4xH<upsampled_left>(
        dst, stride, height - y, left_column + (y << upsample_left_shift),
        -ystep);
  }
}

// Process 8x4 and 16x4 blocks. This avoids a lot of overhead and simplifies
// address safety.
template <bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_Wx4(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
    const int xstep, const int ystep) {
  const int upsample_top_shift = static_cast<int>(upsampled_top);
  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int min_top_only_x = std::min((4 * xstep) >> 6, width);
  int x = 0;
  for (; x < min_top_only_x; x += 4, xstep_bounds_base -= (4 << 6)) {
    uint8_t* dst_x = dst + x * sizeof(uint16_t);

    // Round down to the nearest multiple of 4.
    const int max_top_only_y = (((x + 1) << 6) / xstep) & ~3;
    if (max_top_only_y != 0) {
      DirectionalZone1_4xH<upsampled_top>(
          reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 4,
          top_row + (x << upsample_top_shift), -xstep);
      continue;
    }

    DirectionalZone3_4x4<upsampled_left>(dst_x, stride, left_column, -ystep,
                                         -ystep * x);

    const int min_left_only_y = ((x + 4) << 6) / xstep;
    if (min_left_only_y != 0) {
      const int top_x = -xstep;
      DirectionalZone1Blend_4xH<upsampled_top>(
          dst_x, stride, 4, top_row + (x << upsample_top_shift),
          xstep_bounds_base, top_x, xstep);
    }
  }
  // Reached |min_top_only_x|.
  for (; x < width; x += 4) {
    DirectionalZone1_4xH<upsampled_top>(
        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, 4,
        top_row + (x << upsample_top_shift), -xstep);
  }
}

template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_8xH(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep, const int x, const int left_offset,
    const int xstep_bounds_base, const int16x8_t left_y) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Loop increments for moving by block (8x8). This function handles blocks
  // with height 4 as well. They are calculated in one pass so these variables
  // do not get used.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops, covers
  // blocks that have a mixture of values computed from top or left. The final
  // stage covers blocks that are only computed from the left.
  uint8_t* dst_x = dst + x * sizeof(uint16_t);
  // Round down to the nearest multiple of 8.
  const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
  DirectionalZone1_WxH<upsampled_top>(
      reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
      top_row + (x << upsample_top_shift), -xstep);

  if (max_top_only_y == height) return;

  int y = max_top_only_y;
  dst_x += stride * y;
  const int xstep_y = xstep * y;

  // All rows from |min_left_only_y| down for this set of columns only need
  // |left_column| to compute. Round up to the nearest 8.
  const int min_left_only_y =
      Align(std::min(((x + 8) << 6) / xstep, height), 8);
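  // Rounding up to 8 means the mixed loop below may process a few rows that
  // are technically left-only; they still compute correctly, just through the
  // more general blend path.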
  int xstep_bounds = xstep_bounds_base + xstep_y;
  int top_x = -xstep - xstep_y;

  for (; y < min_left_only_y;
       y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
    if (shuffle_left_column) {
      DirectionalZone2FromLeftCol_8x8(
          dst_x, stride,
          left_column + ((left_offset + y) << upsample_left_shift), left_y,
          upsampled_left);
    } else {
      DirectionalZone3_8x8<upsampled_left>(
          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
          -ystep * x);
    }

    DirectionalZone1Blend_8x8<upsampled_top>(
        dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x,
        xstep);
  }

  // Loop over y for left-only rows.
  for (; y < height; y += 8, dst_x += stride8) {
    DirectionalZone3_8x8<upsampled_left>(
        dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
        -ystep * x);
  }
}

// Process a multiple of 8 |width|.
template <bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_NEON(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int xstep, const int ystep) {
  if (height == 4) {
    DirectionalZone2_Wx4<upsampled_top, upsampled_left>(
        dst, stride, top_row, left_column, width, xstep, ystep);
    return;
  }
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Helper vector.
  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};

  const int ystep8 = ystep << 3;

  // All columns from |min_top_only_x| to the right will only need |top_row| to
  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
  // at least 3.
  assert(xstep >= 3);
  const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8);
  // Analysis finds that, for most angles (ystep < 132), all segments that use
  // both top_row and left_column can compute from left_column using byte
  // shuffles from a single vector. For steeper angles, the shuffle is also
  // fully reliable when x >= 32.
  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  const int left_base_increment8 = ystep8 >> 6;
  const int ystep_remainder8 = ystep8 & 0x3F;
  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. Following values need the full ystep as a relative offset.
  int16x8_t left_y =
      vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);

  int x = 0;
  for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
       xstep_bounds_base -= (8 << 6),
       left_y = vsubq_s16(left_y, increment_left8),
       left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<false, upsampled_top, upsampled_left>(
        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
        xstep_bounds_base, left_y);
  }
  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
       xstep_bounds_base -= (8 << 6),
       left_y = vsubq_s16(left_y, increment_left8),
       left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<true, upsampled_top, upsampled_left>(
        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
        xstep_bounds_base, left_y);
  }
  // Reached |min_top_only_x|.
  if (x < width) {
    DirectionalZone1_WxH<upsampled_top>(
        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, width - x, height,
        top_row + (x << upsample_top_shift), -xstep);
  }
}

// At this angle, neither edge is upsampled.
// |min_width| is either 4 or 8.
template <int min_width>
void DirectionalAngle135(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
                         const uint16_t* LIBGAV1_RESTRICT const top,
                         const uint16_t* LIBGAV1_RESTRICT const left,
                         const int width, const int height) {
  // y = 0 is more trivial than the other rows.
  memcpy(dst, top - 1, width * sizeof(top[0]));
  dst += stride;

  // If |height| > |width|, then there is a point at which top_row is no longer
  // used in each row.
  const int min_left_only_y = std::min(width, height);

  int y = 1;
  do {
    // Example: If y is 4 (min_width), the dest row starts with left[3],
    // left[2], left[1], left[0], because the angle points up. Therefore, load
    // starts at left[0] and is then reversed. If y is 2, the load starts at
    // left[-2], and is reversed to store left[1], left[0], with negative values
    // overwritten from |top_row|.
    const uint16_t* const load_left = left + y - min_width;
    auto* dst16 = reinterpret_cast<uint16_t*>(dst);

    // Some values will be overwritten when |y| is not a multiple of
    // |min_width|.
    if (min_width == 4) {
      const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left));
      vst1_u16(dst16, left_toward_corner);
    } else {
      int x = 0;
      do {
        const uint16x8_t left_toward_corner =
            vrev64q_u16(vld1q_u16(load_left - x));
        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
        x += 8;
      } while (x < y);
    }
    // Entering |top|.
    memcpy(dst16 + y, top - 1, (width - y) * sizeof(top[0]));
    dst += stride;
  } while (++y < min_left_only_y);

  // Left only.
  for (; y < height; ++y, dst += stride) {
    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
    const uint16_t* const load_left = left + y - min_width;

    int x = 0;
    if (min_width == 4) {
      const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left - x));
      vst1_u16(dst16 + x, left_toward_corner);
    } else {
      do {
        const uint16x8_t left_toward_corner =
            vrev64q_u16(vld1q_u16(load_left - x));
        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
        x += 8;
      } while (x < width);
    }
  }
}

void DirectionalIntraPredictorZone2_NEON(
    void* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int xstep, const int ystep,
    const bool upsampled_top, const bool upsampled_left) {
  // Increasing the negative buffer for this function allows more rows to be
  // processed at a time without branching in an inner loop to check the base.
  uint16_t top_buffer[288];
  uint16_t left_buffer[288];
#if LIBGAV1_MSAN
  memset(top_buffer, 0, sizeof(top_buffer));
  memset(left_buffer, 0, sizeof(left_buffer));
#endif  // LIBGAV1_MSAN
  memcpy(top_buffer + 128, static_cast<const uint16_t*>(top_row) - 16, 160);
  memcpy(left_buffer + 128, static_cast<const uint16_t*>(left_column) - 16,
         160);
  const uint16_t* top_ptr = top_buffer + 144;
  const uint16_t* left_ptr = left_buffer + 144;
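  // Each memcpy places 80 values (160 bytes) starting 16 values before the
  // original array, so |top_ptr| and |left_ptr| see initialized data for
  // relative indices in [-16, 64). Reads further below the base still land
  // inside the 288-entry buffers rather than out of bounds.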
  auto* dst = static_cast<uint8_t*>(dest);

  if (width == 4) {
    if (xstep == 64) {
      assert(ystep == 64);
      DirectionalAngle135<4>(dst, stride, top_ptr, left_ptr, width, height);
      return;
    }
    if (upsampled_top) {
      if (upsampled_left) {
        DirectionalZone2_4xH<true, true>(dst, stride, top_ptr, left_ptr, height,
                                         xstep, ystep);
      } else {
        DirectionalZone2_4xH<true, false>(dst, stride, top_ptr, left_ptr,
                                          height, xstep, ystep);
      }
    } else if (upsampled_left) {
      DirectionalZone2_4xH<false, true>(dst, stride, top_ptr, left_ptr, height,
                                        xstep, ystep);
    } else {
      DirectionalZone2_4xH<false, false>(dst, stride, top_ptr, left_ptr, height,
                                         xstep, ystep);
    }
    return;
  }

  if (xstep == 64) {
    assert(ystep == 64);
    DirectionalAngle135<8>(dst, stride, top_ptr, left_ptr, width, height);
    return;
  }
  if (upsampled_top) {
    if (upsampled_left) {
      DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width,
                                        height, xstep, ystep);
    } else {
      DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width,
                                         height, xstep, ystep);
    }
  } else if (upsampled_left) {
    DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width,
                                       height, xstep, ystep);
  } else {
    DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width,
                                        height, xstep, ystep);
  }
}

void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);
  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
}

}  // namespace
}  // namespace high_bitdepth
#endif  // LIBGAV1_MAX_BITDEPTH >= 10

void IntraPredDirectionalInit_NEON() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
}

}  // namespace dsp
}  // namespace libgav1

#else   // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {

void IntraPredDirectionalInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_NEON