xref: /aosp_15_r20/external/libgav1/src/dsp/arm/intrapred_directional_neon.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/intrapred_directional.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_NEON
19 
20 #include <arm_neon.h>
21 
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26 #include <cstring>
27 
28 #include "src/dsp/arm/common_neon.h"
29 #include "src/dsp/constants.h"
30 #include "src/dsp/dsp.h"
31 #include "src/utils/common.h"
32 #include "src/utils/compiler_attributes.h"
33 
34 namespace libgav1 {
35 namespace dsp {
36 namespace low_bitdepth {
37 namespace {
38 
// Blend two values based on weights that sum to 32.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                               const uint8x8_t a_weight,
                               const uint8x8_t b_weight) {
  // Widen to 16 bits while multiplying b by its weight, then accumulate
  // a * a_weight. Because the weights sum to 32, a rounded narrowing shift
  // by log2(32) restores the 8-bit range.
  const uint16x8_t b_product = vmull_u8(b, b_weight);
  const uint16x8_t total = vmlal_u8(b_product, a, a_weight);

  return vrshrn_n_u16(total, 5 /*log2(32)*/);
}
48 
// For vertical operations the weights are one constant value.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                               const uint8_t weight) {
  // Complementary weights: |a| gets (32 - weight), |b| gets |weight|.
  const uint8x8_t a_weight = vdup_n_u8(32 - weight);
  const uint8x8_t b_weight = vdup_n_u8(weight);
  return WeightedBlend(a, b, a_weight, b_weight);
}
54 
// Fill |left| and |right| with the appropriate values for a given |base_step|.
inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
                         const uint8x8_t left_step, const uint8x8_t right_step,
                         uint8x8_t* left, uint8x8_t* right) {
  // A single 16-byte load feeds both table lookups.
  const uint8x16_t source_block = vld1q_u8(source);
  *left = VQTbl1U8(source_block, left_step);
  *right = VQTbl1U8(source_block, right_step);
}
63 
// Handle signed step arguments by ignoring the sign. Negative values are
// considered out of range and overwritten later.
inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
                         const int8x8_t left_step, const int8x8_t right_step,
                         uint8x8_t* left, uint8x8_t* right) {
  const uint8x8_t left_unsigned = vreinterpret_u8_s8(left_step);
  const uint8x8_t right_unsigned = vreinterpret_u8_s8(right_step);
  LoadStepwise(source, left_unsigned, right_unsigned, left, right);
}
72 
// Process 4 or 8 |width| by any |height|.
template <int width>
inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride, const int height,
                                 const uint8_t* LIBGAV1_RESTRICT const top,
                                 const int xstep, const bool upsampled) {
  assert(width == 4 || width == 8);

  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits = 6 - upsample_shift;

  // Highest index into |top| that this block may read.
  const int max_base_x = (width + height - 1) << upsample_shift;
  const int8x8_t max_base = vdup_n_s8(max_base_x);
  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);

  // Lane indices: |all| passes input straight through, |even| de-interleaves
  // upsampled input.
  const int8x8_t all = vcreate_s8(0x0706050403020100);
  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
  const int8x8_t base_step = upsampled ? even : all;
  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));

  int top_x = xstep;
  int y = 0;
  do {
    const int top_base_x = top_x >> scale_bits;

    if (top_base_x >= max_base_x) {
      // The source index has run off the end of |top|; every remaining row
      // is a copy of the last valid top value. Fill the full |width| rather
      // than a hard-coded 4 bytes so the width == 8 instantiation is fully
      // covered.
      for (int i = y; i < height; ++i) {
        memset(dst, top[max_base_x], width);
        dst += stride;
      }
      return;
    }

    // Interpolation weight for this row, in the range [0, 31].
    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;

    // Zone2 uses negative values for xstep. Use signed values to compare
    // |top_base_x| to |max_base_x|.
    const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);

    const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);

    // 4 wide subsamples the output. 8 wide subsamples the input.
    if (width == 4) {
      const uint8x8_t left_values = vld1_u8(top + top_base_x);
      const uint8x8_t right_values = RightShiftVector<8>(left_values);
      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);

      // If |upsampled| is true then extract every other value for output.
      const uint8x8_t value_stepped =
          vtbl1_u8(value, vreinterpret_u8_s8(base_step));
      const uint8x8_t masked_value =
          vbsl_u8(max_base_mask, value_stepped, top_max_base);

      StoreLo4(dst, masked_value);
    } else /* width == 8 */ {
      uint8x8_t left_values, right_values;
      // WeightedBlend() steps up to Q registers. Downsample the input to avoid
      // doing extra calculations.
      LoadStepwise(top + top_base_x, base_step, right_step, &left_values,
                   &right_values);

      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
      const uint8x8_t masked_value =
          vbsl_u8(max_base_mask, value, top_max_base);

      vst1_u8(dst, masked_value);
    }
    dst += stride;
    top_x += xstep;
  } while (++y < height);
}
144 
// Process a multiple of 8 |width| by any |height|. Processes horizontally
// before vertically in the hopes of being a little more cache friendly.
inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride, const int width,
                                 const int height,
                                 const uint8_t* LIBGAV1_RESTRICT const top,
                                 const int xstep, const bool upsampled) {
  assert(width % 8 == 0);
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits = 6 - upsample_shift;

  // Highest index into |top| that this block may read.
  const int max_base_x = (width + height - 1) << upsample_shift;
  const int8x8_t max_base = vdup_n_s8(max_base_x);
  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);

  // Lane indices: |all| passes input straight through, |even| de-interleaves
  // upsampled input.
  const int8x8_t all = vcreate_s8(0x0706050403020100);
  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
  const int8x8_t base_step = upsampled ? even : all;
  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
  const int8x8_t block_step = vdup_n_s8(8 << upsample_shift);

  int top_x = xstep;
  int y = 0;
  do {
    const int top_base_x = top_x >> scale_bits;

    if (top_base_x >= max_base_x) {
      // The source index has run off the end of |top|; every remaining row
      // is a copy of the last valid top value. |width| is a runtime multiple
      // of 8 here, so fill the whole row rather than a hard-coded 4 bytes.
      for (int i = y; i < height; ++i) {
        memset(dst, top[max_base_x], width);
        dst += stride;
      }
      return;
    }

    // Interpolation weight for this row, in the range [0, 31].
    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;

    // Zone2 uses negative values for xstep. Use signed values to compare
    // |top_base_x| to |max_base_x|.
    int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);

    int x = 0;
    do {
      const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);

      // Extract the input values based on |upsampled| here to avoid doing twice
      // as many calculations.
      uint8x8_t left_values, right_values;
      LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values,
                   &right_values);

      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
      const uint8x8_t masked_value =
          vbsl_u8(max_base_mask, value, top_max_base);

      vst1_u8(dst + x, masked_value);

      base_v = vadd_s8(base_v, block_step);
      x += 8;
    } while (x < width);
    top_x += xstep;
    dst += stride;
  } while (++y < height);
}
208 
// Zone 1 directional predictor (7.11.2.4, angle < 90): every predicted
// value is derived from |top_row| only. Dispatches between a straight-copy
// fast path (45 degrees), a 4-wide path, a simplified non-upsampled path for
// steep steps, and the general multiple-of-8 path.
void DirectionalIntraPredictorZone1_NEON(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row, const int width,
    const int height, const int xstep, const bool upsampled_top) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  auto* dst = static_cast<uint8_t*>(dest);

  assert(xstep > 0);

  const int upsample_shift = static_cast<int>(upsampled_top);

  // Lane indices 0..7, used to build per-lane base indices below.
  const uint8x8_t all = vcreate_u8(0x0706050403020100);

  if (xstep == 64) {
    // 45 degree angle: each row is the top row shifted one further right, so
    // four rows at a time can be plain memcpy.
    assert(!upsampled_top);
    const uint8_t* top_ptr = top + 1;
    int y = 0;
    do {
      memcpy(dst, top_ptr, width);
      memcpy(dst + stride, top_ptr + 1, width);
      memcpy(dst + 2 * stride, top_ptr + 2, width);
      memcpy(dst + 3 * stride, top_ptr + 3, width);
      dst += 4 * stride;
      top_ptr += 4;
      y += 4;
    } while (y < height);
  } else if (width == 4) {
    DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top);
  } else if (xstep > 51) {
    // 7.11.2.10. Intra edge upsample selection process
    // if ( d <= 0 || d >= 40 ) useUpsample = 0
    // For |upsample_top| the delta is from vertical so |prediction_angle - 90|.
    // In |kDirectionalIntraPredictorDerivative[]| angles less than 51 will meet
    // this criteria. The |xstep| value for angle 51 happens to be 51 as well.
    // Shallower angles have greater xstep values.
    assert(!upsampled_top);
    const int max_base_x = ((width + height) - 1);
    const uint8x8_t max_base = vdup_n_u8(max_base_x);
    const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
    const uint8x8_t block_step = vdup_n_u8(8);

    int top_x = xstep;
    int y = 0;
    do {
      const int top_base_x = top_x >> 6;
      // Interpolation weight for this row, in the range [0, 31].
      const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
      uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all);
      int x = 0;
      // Only calculate a block of 8 when at least one of the output values is
      // within range. Otherwise it can read off the end of |top|.
      const int must_calculate_width =
          std::min(width, max_base_x - top_base_x + 7) & ~7;
      for (; x < must_calculate_width; x += 8) {
        const uint8x8_t max_base_mask = vclt_u8(base_v, max_base);

        // Since these |xstep| values can not be upsampled the load is
        // simplified.
        const uint8x8_t left_values = vld1_u8(top + top_base_x + x);
        const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1);
        const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
        const uint8x8_t masked_value =
            vbsl_u8(max_base_mask, value, top_max_base);

        vst1_u8(dst + x, masked_value);
        base_v = vadd_u8(base_v, block_step);
      }
      // Remaining columns are entirely out of range: replicate the last
      // valid top value.
      memset(dst + x, top[max_base_x], width - x);
      dst += stride;
      top_x += xstep;
    } while (++y < height);
  } else {
    DirectionalZone1_WxH(dst, stride, width, height, top, xstep, upsampled_top);
  }
}
283 
// Process 4 or 8 |width| by 4 or 8 |height|.
// Zone 3 predicts from |left_column| only. The block is computed column by
// column (8 unrolled steps along ystep), then an 8x8 transpose puts the
// results into row order for the stores.
template <int width>
inline void DirectionalZone3_WxH(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int base_left_y,
    const int ystep, const int upsample_shift) {
  assert(width == 4 || width == 8);
  assert(height == 4 || height == 8);
  const int scale_bits = 6 - upsample_shift;

  // Zone3 never runs out of left_column values.
  assert((width + height - 1) << upsample_shift >  // max_base_y
         ((ystep * width) >> scale_bits) +
             (/* base_step */ 1 << upsample_shift) *
                 (height - 1));  // left_base_y

  // Limited improvement for 8x8. ~20% faster for 64x64.
  // Lane indices: |all| passes input straight through, |even| de-interleaves
  // upsampled input.
  const uint8x8_t all = vcreate_u8(0x0706050403020100);
  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
  const uint8x8_t base_step = upsample_shift ? even : all;
  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));

  uint8_t* dst = dest;
  uint8x8_t left_v[8], right_v[8], value_v[8];
  const uint8_t* const left = left_column;

  // Unrolled: column i samples |left| starting at base_left_y + ystep * i,
  // blending adjacent values with a weight taken from the fractional bits.
  const int index_0 = base_left_y;
  LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step,
               &left_v[0], &right_v[0]);
  value_v[0] = WeightedBlend(left_v[0], right_v[0],
                             ((index_0 << upsample_shift) & 0x3F) >> 1);

  const int index_1 = base_left_y + ystep;
  LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step,
               &left_v[1], &right_v[1]);
  value_v[1] = WeightedBlend(left_v[1], right_v[1],
                             ((index_1 << upsample_shift) & 0x3F) >> 1);

  const int index_2 = base_left_y + ystep * 2;
  LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step,
               &left_v[2], &right_v[2]);
  value_v[2] = WeightedBlend(left_v[2], right_v[2],
                             ((index_2 << upsample_shift) & 0x3F) >> 1);

  const int index_3 = base_left_y + ystep * 3;
  LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step,
               &left_v[3], &right_v[3]);
  value_v[3] = WeightedBlend(left_v[3], right_v[3],
                             ((index_3 << upsample_shift) & 0x3F) >> 1);

  const int index_4 = base_left_y + ystep * 4;
  LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step,
               &left_v[4], &right_v[4]);
  value_v[4] = WeightedBlend(left_v[4], right_v[4],
                             ((index_4 << upsample_shift) & 0x3F) >> 1);

  const int index_5 = base_left_y + ystep * 5;
  LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step,
               &left_v[5], &right_v[5]);
  value_v[5] = WeightedBlend(left_v[5], right_v[5],
                             ((index_5 << upsample_shift) & 0x3F) >> 1);

  const int index_6 = base_left_y + ystep * 6;
  LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step,
               &left_v[6], &right_v[6]);
  value_v[6] = WeightedBlend(left_v[6], right_v[6],
                             ((index_6 << upsample_shift) & 0x3F) >> 1);

  const int index_7 = base_left_y + ystep * 7;
  LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step,
               &left_v[7], &right_v[7]);
  value_v[7] = WeightedBlend(left_v[7], right_v[7],
                             ((index_7 << upsample_shift) & 0x3F) >> 1);

  // 8x8 transpose.
  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]),
                                   vcombine_u8(value_v[1], value_v[5]));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]),
                                   vcombine_u8(value_v[3], value_v[7]));

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  // Store the transposed rows; a 4-high block stops after the first four.
  if (width == 4) {
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
    if (height == 4) return;
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
    dst += stride;
    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
  } else {
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
    if (height == 4) return;
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
    dst += stride;
    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
  }
}
410 
// Because the source values "move backwards" as the row index increases, the
// indices derived from ystep are generally negative. This is accommodated by
// making sure the relative indices are within [-15, 0] when the function is
// called, and sliding them into the inclusive range [0, 15], relative to a
// lower base address.
constexpr int kPositiveIndexOffset = 15;
417 
// Process 4 or 8 |width| by any |height|.
// Computes a zone-2 block purely from |left_column| by turning the per-lane
// y offsets in |left_y| into byte-shuffle indices, so each row needs only one
// LoadStepwise plus one blend.
template <int width>
inline void DirectionalZone2FromLeftCol_WxH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
    const int upsample_shift) {
  assert(width == 4 || width == 8);

  // The shift argument must be a constant.
  int16x8_t offset_y, shift_upsampled = left_y;
  if (upsample_shift) {
    offset_y = vshrq_n_s16(left_y, 5);
    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
  } else {
    offset_y = vshrq_n_s16(left_y, 6);
  }

  // Select values to the left of the starting point.
  // The 15th element (and 16th) will be all the way at the end, to the right.
  // With a negative ystep everything else will be "left" of them.
  // This supports cumulative steps up to 15. We could support up to 16 by doing
  // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
  // registers as input which would allow for cumulative offsets of 32.
  const int16x8_t sampler =
      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset));
  const uint8x8_t left_values = vqmovun_s16(sampler);
  const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1));

  // Per-lane blend weights come from the fractional (low 6) bits of the
  // upsample-adjusted y positions.
  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
  const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1));
  const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul);

  int y = 0;
  do {
    uint8x8_t src_left, src_right;
    LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift),
                 left_values, right_values, &src_left, &src_right);
    const uint8x8_t val =
        WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul);

    if (width == 4) {
      StoreLo4(dst, val);
    } else {
      vst1_u8(dst, val);
    }
    dst += stride;
  } while (++y < height);
}
466 
// Process 4 or 8 |width| by any |height|.
// Computes zone-1 (top-row) values for each row and blends them into |dest|:
// lanes to the left of the per-row |zone_bounds| boundary keep the existing
// destination values, lanes at or beyond it take the newly computed ones.
template <int width>
inline void DirectionalZone1Blend_WxH(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
    const uint8_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
    const int xstep, const int upsample_shift) {
  assert(width == 4 || width == 8);

  const int scale_bits_x = 6 - upsample_shift;

  // Lane indices: |all| passes input straight through, |even| de-interleaves
  // upsampled input.
  const uint8x8_t all = vcreate_u8(0x0706050403020100);
  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
  const uint8x8_t base_step = upsample_shift ? even : all;
  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));

  int y = 0;
  do {
    const uint8_t* const src = top_row + (top_x >> scale_bits_x);
    uint8x8_t left, right;
    LoadStepwise(src, base_step, right_step, &left, &right);

    // Interpolation weight for this row, in the range [0, 31].
    const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1;
    const uint8x8_t val = WeightedBlend(left, right, shift);

    uint8x8_t dst_blend = vld1_u8(dest);
    // |zone_bounds| values can be negative.
    uint8x8_t blend =
        vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6)));
    uint8x8_t output = vbsl_u8(blend, val, dst_blend);

    if (width == 4) {
      StoreLo4(dest, output);
    } else {
      vst1_u8(dest, output);
    }
    dest += stride;
    zone_bounds += xstep;
    top_x -= xstep;
  } while (++y < height);
}
507 
//  7.11.2.4 (8) 90 < angle > 180
//  The strategy for these functions (4xH and 8+xH) is to know how many blocks
//  can be processed with just pixels from |top_ptr|, then handle mixed blocks,
//  then handle only blocks that take from |left_ptr|. Additionally, a fast
//  index-shuffle approach is used for pred values from |left_column| in
//  sections that permit it.
inline void DirectionalZone2_4xH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
    const uint8_t* LIBGAV1_RESTRICT const top_row,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep, const bool upsampled_top,
    const bool upsampled_left) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Helper vector.
  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};

  // Loop incrementers for moving by block (4xN). Vertical still steps by 8. If
  // it's only 4, it will be finished in the first iteration.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  const int min_height = (height == 4) ? 4 : 8;

  // All columns from |min_top_only_x| to the right will only need |top_row| to
  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
  // at least 3.
  assert(xstep >= 3);
  const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. The following values need the full ystep as a relative offset.
  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
  const int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);

  // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops, covers
  // blocks that have a mixture of values computed from top or left. The final
  // stage covers blocks that are only computed from the left.
  if (min_top_only_x > 0) {
    // Round down to the nearest multiple of 8 (or 4, if height is 4).
    const int max_top_only_y =
        std::min((1 << 6) / xstep, height) & ~(min_height - 1);
    DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
                            upsampled_top);

    if (max_top_only_y == height) return;

    int y = max_top_only_y;
    dst += stride * y;
    const int xstep_y = xstep * y;

    // All rows from |min_left_only_y| down for this set of columns only need
    // |left_column| to compute.
    const int min_left_only_y = std::min((4 << 6) / xstep, height);
    int xstep_bounds = xstep_bounds_base + xstep_y;
    int top_x = -xstep - xstep_y;

    // Stage 2: mixed rows. Left-derived values are written first, then the
    // zone-1 blend overwrites the lanes that belong to the top.
    // +8 increment is OK because if height is 4 this only goes once.
    for (; y < min_left_only_y;
         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
      DirectionalZone2FromLeftCol_WxH<4>(
          dst, stride, min_height,
          left_column + ((y - left_base_increment) << upsample_left_shift),
          left_y, upsample_left_shift);

      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
                                   xstep_bounds, top_x, xstep,
                                   upsample_top_shift);
    }

    // Loop over y for left_only rows.
    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
    for (; y < height; y += 8, dst += stride8) {
      DirectionalZone3_WxH<4>(
          dst, stride, min_height,
          left_column + ((y - left_base_increment) << upsample_left_shift),
          base_left_y, -ystep, upsample_left_shift);
    }
  } else {
    // Every column can be computed from the top row alone.
    DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep,
                            upsampled_top);
  }
}
601 
// Processes one 8-wide column strip of a zone-2 block at horizontal offset
// |x|. |shuffle_left_column| selects the fast byte-shuffle path
// (DirectionalZone2FromLeftCol_WxH) for left-derived values; otherwise the
// general Zone3 routine is used.
template <bool shuffle_left_column>
inline void DirectionalZone2_8xH(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint8_t* LIBGAV1_RESTRICT const top_row,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep, const int x, const int left_offset,
    const int xstep_bounds_base, const int16x8_t left_y,
    const bool upsampled_top, const bool upsampled_left) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Loop incrementers for moving by block (8x8). This function handles blocks
  // with height 4 as well. They are calculated in one pass so these variables
  // do not get used.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  // Cover 8x4 case.
  const int min_height = (height == 4) ? 4 : 8;

  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops, covers
  // blocks that have a mixture of values computed from top or left. The final
  // stage covers blocks that are only computed from the left.
  uint8_t* dst_x = dst + x;
  // Round down to the nearest multiple of 8 (or 4, if height is 4).
  const int max_top_only_y =
      std::min((1 << 6) / xstep, height) & ~(min_height - 1);
  DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
                          top_row + (x << upsample_top_shift), -xstep,
                          upsampled_top);

  if (max_top_only_y == height) return;

  int y = max_top_only_y;
  dst_x += stride * y;
  const int xstep_y = xstep * y;

  // All rows from |min_left_only_y| down for this set of columns only need
  // |left_column| to compute. Round up to the nearest 8.
  const int min_left_only_y =
      Align(std::min(((x + 8) << 6) / xstep, height), 8);
  int xstep_bounds = xstep_bounds_base + xstep_y;
  int top_x = -xstep - xstep_y;

  const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
  // Stage 2: mixed rows. Left-derived values are written first, then the
  // zone-1 blend overwrites the lanes that belong to the top.
  for (; y < min_left_only_y;
       y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
    if (shuffle_left_column) {
      DirectionalZone2FromLeftCol_WxH<8>(
          dst_x, stride, min_height,
          left_column + ((left_offset + y) << upsample_left_shift), left_y,
          upsample_left_shift);
    } else {
      DirectionalZone3_WxH<8>(
          dst_x, stride, min_height,
          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
          -ystep, upsample_left_shift);
    }

    DirectionalZone1Blend_WxH<8>(
        dst_x, stride, min_height, top_row + (x << upsample_top_shift),
        xstep_bounds, top_x, xstep, upsample_top_shift);
  }

  // Loop over y for left_only rows.
  for (; y < height; y += 8, dst_x += stride8) {
    DirectionalZone3_WxH<8>(
        dst_x, stride, min_height,
        left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
        -ystep, upsample_left_shift);
  }
}
675 
// Process a multiple of 8 |width|.
inline void DirectionalZone2_WxH(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint8_t* LIBGAV1_RESTRICT const top_row,
    const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int xstep, const int ystep,
    const bool upsampled_top, const bool upsampled_left) {
  // |ystep| scaled to advance 8 columns at a time.
  const int ystep8 = ystep << 3;

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  // Whole-pixel and fractional (1/64th-pel) components of |ystep| for
  // stepping a single column.
  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  // The same decomposition for an 8-column step.
  const int left_base_increment8 = ystep8 >> 6;
  const int ystep_remainder8 = ystep8 & 0x3F;
  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. Following values need the full ystep as a relative offset.
  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
  int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);

  // For ystep > 90, at least two sets of 8 columns can be fully computed from
  // top_row only.
  const int min_top_only_x = std::min((height * xstep) >> 6, width);
  // Analysis finds that, for most angles (ystep < 132), all segments that use
  // both top_row and left_column can compute from left_column using byte
  // shuffles from a single vector. For steeper angles, the shuffle is also
  // fully reliable when x >= 32.
  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);

  // This loop treats each set of 8 columns in 3 stages with y-value
  // boundaries. The first stage, before the first y-loop, covers blocks that
  // are only computed from the top row. The second stage, comprising two
  // y-loops, covers blocks that have a mixture of values computed from top or
  // left. The final stage covers blocks that are only computed from the left.
  // Stage handled per column-set inside DirectionalZone2_8xH; the template
  // parameter selects whether the left column may be read via byte shuffle.
  int x = 0;
  for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
           xstep_bounds_base -= (8 << 6),
           left_y = vsubq_s16(left_y, increment_left8),
           left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height,
                                xstep, ystep, x, left_offset, xstep_bounds_base,
                                left_y, upsampled_top, upsampled_left);
  }
  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
           xstep_bounds_base -= (8 << 6),
           left_y = vsubq_s16(left_y, increment_left8),
           left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep,
                               ystep, x, left_offset, xstep_bounds_base, left_y,
                               upsampled_top, upsampled_left);
  }
  // Any remaining columns never reference |left_column| and reduce to Zone 1.
  if (x < width) {
    const int upsample_top_shift = static_cast<int>(upsampled_top);
    DirectionalZone1_WxH(dst + x, stride, width - x, height,
                         top_row + (x << upsample_top_shift), -xstep,
                         upsampled_top);
  }
}
741 
void DirectionalIntraPredictorZone2_NEON(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int xstep, const int ystep,
    const bool upsampled_top, const bool upsampled_left) {
  // Increasing the negative buffer for this function allows more rows to be
  // processed at a time without branching in an inner loop to check the base.
  uint8_t top_buffer[288];
  uint8_t left_buffer[288];
#if LIBGAV1_MSAN
  // Quiet MSAN reports for reads of intentionally unwritten padding bytes.
  memset(top_buffer, 0, sizeof(top_buffer));
  memset(left_buffer, 0, sizeof(left_buffer));
#endif  // LIBGAV1_MSAN

  // Copy each edge (plus 16 pixels preceding it) into the padded buffers so
  // that negative base indexes computed by the zone kernels stay in bounds.
  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
  // The kernels see the edge origin at offset 144 (128 + the 16 copied-ahead
  // pixels).
  const uint8_t* top_ptr = top_buffer + 144;
  const uint8_t* left_ptr = left_buffer + 144;
  auto* dst = static_cast<uint8_t*>(dest);

  // Dispatch on block width; 4-wide blocks use a narrower kernel.
  if (width == 4) {
    DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
                         upsampled_top, upsampled_left);
  } else {
    DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep,
                         ystep, upsampled_top, upsampled_left);
  }
}
771 
void DirectionalIntraPredictorZone3_NEON(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int ystep, const bool upsampled_left) {
  const auto* const left = static_cast<const uint8_t*>(left_column);

  assert(ystep > 0);

  const int upsample_shift = static_cast<int>(upsampled_left);
  // Positions along |left| are tracked in 1/64th-pel units; upsampling halves
  // the fractional bits and doubles the pixel step.
  const int scale_bits = 6 - upsample_shift;
  const int base_step = 1 << upsample_shift;

  if (width == 4 || height == 4) {
    // This block can handle all sizes but the specializations for other sizes
    // are faster.
    // Lane-index vectors for LoadStepwise: consecutive pixels normally, every
    // other pixel when the left edge is upsampled (interleaved samples).
    const uint8x8_t all = vcreate_u8(0x0706050403020100);
    const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
    const uint8x8_t base_step_v = upsampled_left ? even : all;
    const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1));

    int y = 0;
    do {
      int x = 0;
      do {
        auto* dst = static_cast<uint8_t*>(dest);
        dst += y * stride + x;
        uint8x8_t left_v[4], right_v[4], value_v[4];
        const int ystep_base = ystep * x;
        const int offset = y * base_step;

        // Compute 4 output columns (8 rows each) in column order; column i
        // blends the pair of left-edge pixels selected by (x + i + 1)*ystep.
        const int index_0 = ystep_base + ystep * 1;
        LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v,
                     right_step, &left_v[0], &right_v[0]);
        value_v[0] = WeightedBlend(left_v[0], right_v[0],
                                   ((index_0 << upsample_shift) & 0x3F) >> 1);

        const int index_1 = ystep_base + ystep * 2;
        LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v,
                     right_step, &left_v[1], &right_v[1]);
        value_v[1] = WeightedBlend(left_v[1], right_v[1],
                                   ((index_1 << upsample_shift) & 0x3F) >> 1);

        const int index_2 = ystep_base + ystep * 3;
        LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v,
                     right_step, &left_v[2], &right_v[2]);
        value_v[2] = WeightedBlend(left_v[2], right_v[2],
                                   ((index_2 << upsample_shift) & 0x3F) >> 1);

        const int index_3 = ystep_base + ystep * 4;
        LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v,
                     right_step, &left_v[3], &right_v[3]);
        value_v[3] = WeightedBlend(left_v[3], right_v[3],
                                   ((index_3 << upsample_shift) & 0x3F) >> 1);

        // 8x4 transpose.
        const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]);
        const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]);

        const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]),
                                         vreinterpret_u16_u8(b1.val[0]));
        const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]),
                                         vreinterpret_u16_u8(b1.val[1]));

        // Low halves hold the first 4 transposed rows.
        StoreLo4(dst, vreinterpret_u8_u16(c0.val[0]));
        dst += stride;
        StoreLo4(dst, vreinterpret_u8_u16(c1.val[0]));
        dst += stride;
        StoreLo4(dst, vreinterpret_u8_u16(c0.val[1]));
        dst += stride;
        StoreLo4(dst, vreinterpret_u8_u16(c1.val[1]));

        // High halves hold rows 4..7 when the block is taller than 4.
        if (height > 4) {
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c0.val[0]));
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c1.val[0]));
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c0.val[1]));
          dst += stride;
          StoreHi4(dst, vreinterpret_u8_u16(c1.val[1]));
        }
        x += 4;
      } while (x < width);
      y += 8;
    } while (y < height);
  } else {  // 8x8 at a time.
    // Limited improvement for 8x8. ~20% faster for 64x64.
    int y = 0;
    do {
      int x = 0;
      do {
        auto* dst = static_cast<uint8_t*>(dest);
        dst += y * stride + x;
        const int ystep_base = ystep * (x + 1);

        DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift),
                                ystep_base, ystep, upsample_shift);
        x += 8;
      } while (x < width);
      y += 8;
    } while (y < height);
  }
}
875 
// Registers the 8-bit NEON directional intra predictors in the dsp table.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
}
883 
884 }  // namespace
885 }  // namespace low_bitdepth
886 
887 #if LIBGAV1_MAX_BITDEPTH >= 10
888 namespace high_bitdepth {
889 namespace {
890 
// Blend two values based on weights that sum to 32.
inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
                                const int a_weight, const int b_weight) {
  // Accumulate b*b_weight first, then fold in a*a_weight; integer addition is
  // commutative, so the rounded result is unchanged.
  const uint16x4_t b_product = vmul_n_u16(b, b_weight);
  const uint16x4_t weighted_sum = vmla_n_u16(b_product, a, a_weight);

  // Divide by 32 with rounding.
  return vrshr_n_u16(weighted_sum, 5 /*log2(32)*/);
}
899 
// Blend two values based on weights that sum to 32.
inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
                                const uint16_t a_weight,
                                const uint16_t b_weight) {
  // Accumulate b*b_weight first, then fold in a*a_weight; the order of the
  // two products does not affect the rounded sum.
  const uint16x8_t b_product = vmulq_n_u16(b, b_weight);
  const uint16x8_t weighted_sum = vmlaq_n_u16(b_product, a, a_weight);

  // Divide by 32 with rounding.
  return vrshrq_n_u16(weighted_sum, 5 /*log2(32)*/);
}
909 
// Blend two values based on per-lane weights that sum to 32.
inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
                                const uint16x8_t a_weight,
                                const uint16x8_t b_weight) {
  // Accumulate b*b_weight first, then fold in a*a_weight; the order of the
  // two products does not affect the rounded sum.
  const uint16x8_t b_product = vmulq_u16(b, b_weight);
  const uint16x8_t weighted_sum = vmlaq_u16(b_product, a, a_weight);

  // Divide by 32 with rounding.
  return vrshrq_n_u16(weighted_sum, 5 /*log2(32)*/);
}
919 
// Each element of |dest| contains values associated with one weight value.
inline void LoadEdgeVals(uint16x4x2_t* dest,
                         const uint16_t* LIBGAV1_RESTRICT const source,
                         const bool upsampled) {
  if (!upsampled) {
    // Adjacent pixel pairs: val[0] holds pixels i, val[1] holds pixels i+1.
    dest->val[0] = vld1_u16(source);
    dest->val[1] = vld1_u16(source + 1);
    return;
  }
  // Upsampled edges interleave the two sample sets; deinterleave on load.
  *dest = vld2_u16(source);
}
931 
// Each element of |dest| contains values associated with one weight value.
inline void LoadEdgeVals(uint16x8x2_t* dest,
                         const uint16_t* LIBGAV1_RESTRICT const source,
                         const bool upsampled) {
  if (!upsampled) {
    // Adjacent pixel pairs: val[0] holds pixels i, val[1] holds pixels i+1.
    dest->val[0] = vld1q_u16(source);
    dest->val[1] = vld1q_u16(source + 1);
    return;
  }
  // Upsampled edges interleave the two sample sets; deinterleave on load.
  *dest = vld2q_u16(source);
}
943 
// For Wx4 blocks, load the source for 2 columns. The source for the second
// column is held in the high half of each vector.
inline void LoadEdgeVals2x4(uint16x8x2_t* dest,
                            const uint16_t* LIBGAV1_RESTRICT const source_low,
                            const uint16_t* LIBGAV1_RESTRICT const source_high,
                            const bool upsampled) {
  if (!upsampled) {
    // Adjacent pixel pairs from each column, packed low|high.
    dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high));
    dest->val[1] =
        vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1));
    return;
  }
  // Upsampled edges interleave the two sample sets; deinterleave each
  // column's load, then pack low|high.
  const uint16x4x2_t lo_col = vld2_u16(source_low);
  const uint16x4x2_t hi_col = vld2_u16(source_high);
  dest->val[0] = vcombine_u16(lo_col.val[0], hi_col.val[0]);
  dest->val[1] = vcombine_u16(lo_col.val[1], hi_col.val[1]);
}
961 
// Produce a 4-wide by |height|-tall Zone 1 prediction from |top|.
template <bool upsampled>
inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride, const int height,
                                 const uint16_t* LIBGAV1_RESTRICT const top,
                                 const int xstep) {
  const int upsample_shift = static_cast<int>(upsampled);
  // Positions along |top| are in 1/64th-pel units before this shift.
  const int index_scale_bits = 6 - upsample_shift;

  // Beyond this index every output pixel is a copy of the last top pixel.
  const int max_base_x = (4 + height - 1) << upsample_shift;
  const int16x4_t max_base = vdup_n_s16(max_base_x);
  const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
  const int16x4_t index_offset = {0, 1, 2, 3};

  // All rows from |min_corner_only_y| down will simply use Memset.
  // |max_base_x| is always greater than |height|, so clipping the denominator
  // to 1 is enough to make the logic work.
  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);

  int top_x = xstep;
  int y = 0;
  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
    const int top_base_x = top_x >> index_scale_bits;

    // To accommodate reuse of this function in Zone2, permit negative values
    // for |xstep|.
    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
    const uint16_t shift_1 = 32 - shift_0;

    // Use signed values to compare |top_base_x| to |max_base_x|.
    const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
    const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);

    uint16x4x2_t sampled_top_row;
    LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
    const uint16x4_t combined = WeightedBlend(
        sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);

    // If |upsampled| is true then extract every other value for output.
    // Lanes past the edge are replaced by the replicated final top pixel.
    const uint16x4_t masked_result =
        vbsl_u16(max_base_mask, combined, final_top_val);

    vst1_u16(dst, masked_result);
  }
  // Remaining rows lie entirely past the edge; replicate the corner pixel.
  for (; y < height; ++y) {
    Memset(dst, top[max_base_x], 4 /* width */);
    dst += stride;
  }
}
1011 
// Process a multiple of 8 |width| by any |height|. Processes horizontally
// before vertically in the hopes of being a little more cache friendly.
template <bool upsampled>
inline void DirectionalZone1_WxH(uint16_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride, const int width,
                                 const int height,
                                 const uint16_t* LIBGAV1_RESTRICT const top,
                                 const int xstep) {
  assert(width % 8 == 0);
  const int upsample_shift = static_cast<int>(upsampled);
  // Positions along |top| are in 1/64th-pel units before this shift.
  const int index_scale_bits = 6 - upsample_shift;

  // Beyond this index every output pixel is a copy of the last top pixel.
  const int max_base_index = (width + height - 1) << upsample_shift;
  const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
  const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
  const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};

  // Pixel step per output column, and per 8-column vector.
  const int base_step = 1 << upsample_shift;
  const int base_step8 = base_step << 3;
  const int16x8_t block_step = vdupq_n_s16(base_step8);

  // All rows from |min_corner_only_y| down will simply use Memset.
  // |max_base_x| is always greater than |height|, so clipping the denominator
  // to 1 is enough to make the logic work.
  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
  const int min_corner_only_y = std::min(max_base_index / xstep_units, height);

  int top_x = xstep;
  int y = 0;
  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
    int top_base_x = top_x >> index_scale_bits;

    // To accommodate reuse of this function in Zone2, permit negative values
    // for |xstep|.
    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
    const uint16_t shift_1 = 32 - shift_0;

    // Use signed values to compare |top_base_x| to |max_base_x|.
    int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);

    int x = 0;
    do {
      // Replace lanes that index past the edge with the final top pixel.
      const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);

      uint16x8x2_t sampled_top_row;
      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
      const uint16x8_t combined = WeightedBlend(
          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);

      const uint16x8_t masked_result =
          vbslq_u16(max_base_mask, combined, final_top_val);
      vst1q_u16(dst + x, masked_result);

      base_x = vaddq_s16(base_x, block_step);
      top_base_x += base_step8;
      x += 8;
    } while (x < width);
  }
  // Remaining rows lie entirely past the edge; replicate the corner pixel.
  for (int i = y; i < height; ++i) {
    Memset(dst, top[max_base_index], width);
    dst += stride;
  }
}
1075 
// Process a multiple of 8 |width| by any |height|. Processes horizontally
// before vertically in the hopes of being a little more cache friendly.
// "Large" variant (width >= 32): splits rows into a fast phase with no edge
// checks, a checked phase, and a corner-fill phase.
inline void DirectionalZone1_Large(uint16_t* LIBGAV1_RESTRICT dst,
                                   const ptrdiff_t stride, const int width,
                                   const int height,
                                   const uint16_t* LIBGAV1_RESTRICT const top,
                                   const int xstep, const bool upsampled) {
  assert(width % 8 == 0);
  const int upsample_shift = static_cast<int>(upsampled);
  // Positions along |top| are in 1/64th-pel units before this shift.
  const int index_scale_bits = 6 - upsample_shift;

  // Beyond this index every output pixel is a copy of the last top pixel.
  const int max_base_index = (width + height - 1) << upsample_shift;
  const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
  const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
  const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};

  // Pixel step per output column, and per 8-column vector.
  const int base_step = 1 << upsample_shift;
  const int base_step8 = base_step << 3;
  const int16x8_t block_step = vdupq_n_s16(base_step8);

  // All rows from |min_corner_only_y| down will simply use Memset.
  // |max_base_x| is always greater than |height|, so clipping the denominator
  // to 1 is enough to make the logic work.
  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
  const int min_corner_only_y = std::min(max_base_index / xstep_units, height);

  // Rows up to this y-value can be computed without checking for bounds.
  const int max_no_corner_y = std::min(
      ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
      height);
  // No need to check for exceeding |max_base_x| in the first loop.
  int y = 0;
  int top_x = xstep;
  for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
    int top_base_x = top_x >> index_scale_bits;
    // To accommodate reuse of this function in Zone2, permit negative values
    // for |xstep|.
    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
    const uint16_t shift_1 = 32 - shift_0;

    int x = 0;
    do {
      uint16x8x2_t sampled_top_row;
      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
      const uint16x8_t combined = WeightedBlend(
          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);

      vst1q_u16(dst + x, combined);

      top_base_x += base_step8;
      x += 8;
    } while (x < width);
  }

  // Rows that blend edge pixels for part of the row and fill the remainder
  // with the corner pixel.
  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
    int top_base_x = top_x >> index_scale_bits;

    // To accommodate reuse of this function in Zone2, permit negative values
    // for |xstep|.
    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
    const uint16_t shift_1 = 32 - shift_0;

    // Use signed values to compare |top_base_x| to |max_base_x|.
    int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);

    int x = 0;
    // Width of this row that still references edge pixels, rounded up to a
    // multiple of 8.
    const int min_corner_only_x =
        std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
        ~7;
    for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
                                  base_x = vaddq_s16(base_x, block_step)) {
      const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);

      uint16x8x2_t sampled_top_row;
      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
      const uint16x8_t combined = WeightedBlend(
          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);

      const uint16x8_t masked_result =
          vbslq_u16(max_base_mask, combined, final_top_val);
      vst1q_u16(dst + x, masked_result);
    }
    // Corner-only section of the row.
    Memset(dst + x, top[max_base_index], width - x);
  }
  // Remaining rows lie entirely past the edge; replicate the corner pixel.
  for (; y < height; ++y) {
    Memset(dst, top[max_base_index], width);
    dst += stride;
  }
}
1166 
DirectionalIntraPredictorZone1_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const int width,const int height,const int xstep,const bool upsampled_top)1167 void DirectionalIntraPredictorZone1_NEON(
1168     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1169     const void* LIBGAV1_RESTRICT const top_row, const int width,
1170     const int height, const int xstep, const bool upsampled_top) {
1171   const auto* const top = static_cast<const uint16_t*>(top_row);
1172   auto* dst = static_cast<uint16_t*>(dest);
1173   stride /= sizeof(top[0]);
1174 
1175   assert(xstep > 0);
1176 
1177   if (xstep == 64) {
1178     assert(!upsampled_top);
1179     const uint16_t* top_ptr = top + 1;
1180     const int width_bytes = width * sizeof(top[0]);
1181     int y = height;
1182     do {
1183       memcpy(dst, top_ptr, width_bytes);
1184       memcpy(dst + stride, top_ptr + 1, width_bytes);
1185       memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
1186       memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
1187       dst += 4 * stride;
1188       top_ptr += 4;
1189       y -= 4;
1190     } while (y != 0);
1191   } else {
1192     if (width == 4) {
1193       if (upsampled_top) {
1194         DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
1195       } else {
1196         DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
1197       }
1198     } else if (width >= 32) {
1199       if (upsampled_top) {
1200         DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
1201       } else {
1202         DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
1203       }
1204     } else if (upsampled_top) {
1205       DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
1206     } else {
1207       DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
1208     }
1209   }
1210 }
1211 
1212 // -----------------------------------------------------------------------------
1213 // Zone 3
1214 // This can be considered "the transpose of Zone 1." In Zone 1, the fractional
1215 // step applies when moving vertically in the destination block, connected to
1216 // the change in |y|, whereas in this mode, the step applies when moving
1217 // horizontally, connected to the change in |x|. This makes vectorization very
1218 // complicated in row-order, because a given vector may need source pixels that
1219 // span 16 or 32 pixels in steep angles, requiring multiple expensive table
1220 // lookups and checked loads. Rather than work in row order, it is simpler to
1221 // compute |dest| in column order, and then store the transposed results.
1222 
// Compute 4x4 sub-blocks.
// Example of computed sub-blocks of a 4x8 block before and after transpose:
// 00 10 20 30             00 01 02 03
// 01 11 21 31             10 11 12 13
// 02 12 22 32             20 21 22 23
// 03 13 23 33             30 31 32 33
// -----------     -->     -----------
// 40 50 60 70             40 41 42 43
// 41 51 61 71             50 51 52 53
// 42 52 62 72             60 61 62 63
// 43 53 63 73             70 71 72 73
template <bool upsampled>
inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep, const int base_left_y = 0) {
  const int upsample_shift = static_cast<int>(upsampled);
  // Positions along |left| are in 1/64th-pel units before this shift.
  const int index_scale_bits = 6 - upsample_shift;

  // Compute one column at a time, then transpose for storage.
  uint16x4_t result[4];

  // Column 0: the integer part of |left_y| selects the pixel pair and the
  // fractional part supplies the blend weights (shift_0 + shift_1 == 32).
  int left_y = base_left_y + ystep;
  int left_offset = left_y >> index_scale_bits;
  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  int shift_1 = 32 - shift_0;
  uint16x4x2_t sampled_left_col;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  // Column 1.
  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  // Column 2.
  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  // Column 3.
  left_y += ystep;
  left_offset = left_y >> index_scale_bits;
  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
  shift_1 = 32 - shift_0;
  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                            shift_1, shift_0);

  // Columns were computed in registers; transpose to row order for storage.
  Transpose4x4(result);
  Store4(dst, result[0]);
  dst += stride;
  Store4(dst, result[1]);
  dst += stride;
  Store4(dst, result[2]);
  dst += stride;
  Store4(dst, result[3]);
}
1287 
template <bool upsampled>
inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep, const int base_left_y = 0) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int index_scale_bits = 6 - upsample_shift;
  const uint16x8_t max_weight = vdupq_n_u16(32);

  // Each vector covers two pre-transpose columns of 4 values: the low half
  // holds columns 0 through 3 and the high half holds columns 4 through 7.
  uint16x8_t result[4];
  uint16x8x2_t sampled_left_col;

  // Fixed-point (1/64 pel) positions along the left edge for the low and high
  // halves; the high half starts 4 steps further along.
  int left_y_low = base_left_y + ystep;
  int left_y_high = left_y_low + (ystep << 2);
  for (auto& row : result) {
    const int left_offset_low = left_y_low >> index_scale_bits;
    const int left_offset_high = left_y_high >> index_scale_bits;
    const int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
    const int shift_high =
        (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
    // Interpolation weights per half; weights_0 + weights_1 == 32.
    const uint16x8_t weights_0 =
        vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
    const uint16x8_t weights_1 = vsubq_u16(max_weight, weights_0);
    LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
                    &left[left_offset_high], upsampled);
    row = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                        weights_1, weights_0);
    left_y_low += ystep;
    left_y_high += ystep;
  }

  Transpose4x8(result);
  Store8(dst, result[0]);
  dst += stride;
  Store8(dst, result[1]);
  dst += stride;
  Store8(dst, result[2]);
  dst += stride;
  Store8(dst, result[3]);
}
1369 
template <bool upsampled>
inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep, const int base_left_y = 0) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int index_scale_bits = 6 - upsample_shift;

  // Each vector is one pre-transpose column of 8 values.
  uint16x8_t result[4];
  uint16x8x2_t sampled_left_col;

  // Walk the left edge in fixed-point (1/64 pel) steps, one column per step.
  int left_y = base_left_y + ystep;
  for (auto& col : result) {
    const int left_offset = left_y >> index_scale_bits;
    const int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
    const int shift_1 = 32 - shift_0;
    LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
    col = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                        shift_1, shift_0);
    left_y += ystep;
  }

  Transpose4x8(result);
  // Low halves cover rows 0-3, high halves rows 4-7.
  for (const auto& col : result) {
    Store4(dst, vget_low_u16(col));
    dst += stride;
  }
  for (const auto& col : result) {
    Store4(dst, vget_high_u16(col));
    dst += stride;
  }
}
1431 
template <bool upsampled>
inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
                                 const ptrdiff_t stride, const int height,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep) {
  assert(height == 8 || height == 16);
  // First (or only) 4x8 block reads from the start of |left|.
  DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep);
  if (height != 16) return;
  // Second 4x8 block: advance 8 output rows and 8 (possibly upsampled)
  // source pixels.
  const int upsample_shift = static_cast<int>(upsampled);
  DirectionalZone3_4x8<upsampled>(dest + (stride << 3), stride,
                                  left + (8 << upsample_shift), ystep);
}
1446 
template <bool upsampled>
inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
                                 const ptrdiff_t stride, const int width,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep) {
  assert(width <= 16);
  if (width == 4) {
    DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep);
    return;
  }
  // 8 or 16 wide: process 8 columns at a time.
  DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep);
  if (width == 16) {
    // The second 8x4 block starts 8 columns over, so its sampling positions
    // along |left| begin 8 steps further on.
    DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left,
                                    ystep, /*base_left_y=*/ystep << 3);
  }
}
1464 
template <bool upsampled>
inline void DirectionalZone3_8x8(uint8_t* LIBGAV1_RESTRICT dest,
                                 const ptrdiff_t stride,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep, const int base_left_y = 0) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int index_scale_bits = 6 - upsample_shift;

  // Each vector is one pre-transpose column of 8 values.
  uint16x8_t result[8];
  uint16x8x2_t sampled_left_col;

  // Walk the left edge in fixed-point (1/64 pel) steps, one column per step.
  int left_y = base_left_y + ystep;
  for (auto& col : result) {
    const int left_offset = left_y >> index_scale_bits;
    const int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
    const int shift_1 = 32 - shift_0;
    LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
    col = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
                        shift_1, shift_0);
    left_y += ystep;
  }

  Transpose8x8(result);
  for (const auto& row : result) {
    Store8(dest, row);
    dest += stride;
  }
}
1557 
template <bool upsampled>
inline void DirectionalZone3_WxH(uint8_t* LIBGAV1_RESTRICT dest,
                                 const ptrdiff_t stride, const int width,
                                 const int height,
                                 const uint16_t* LIBGAV1_RESTRICT const left,
                                 const int ystep) {
  const int upsample_shift = static_cast<int>(upsampled);
  // Zone3 never runs out of left_column values.
  assert((width + height - 1) << upsample_shift >  // max_base_y
         ((ystep * width) >> (6 - upsample_shift)) +
             (/* base_step */ 1 << upsample_shift) *
                 (height - 1));  // left_base_y
  // Tile the block in 8x8 pieces. Both dimensions are at least 8 here.
  for (int y = 0; y < height; y += 8) {
    uint8_t* dst_x = dest + y * stride;
    for (int x = 0; x < width; x += 8) {
      // Each column group starts |x| steps further along the left edge.
      DirectionalZone3_8x8<upsampled>(dst_x, stride,
                                      left + (y << upsample_shift), ystep,
                                      /*base_left_y=*/ystep * x);
      dst_x += 8 * sizeof(uint16_t);
    }
  }
}
1584 
DirectionalIntraPredictorZone3_NEON(void * LIBGAV1_RESTRICT const dest,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const left_column,const int width,const int height,const int ystep,const bool upsampled_left)1585 void DirectionalIntraPredictorZone3_NEON(
1586     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1587     const void* LIBGAV1_RESTRICT const left_column, const int width,
1588     const int height, const int ystep, const bool upsampled_left) {
1589   const auto* const left = static_cast<const uint16_t*>(left_column);
1590   auto* dst = static_cast<uint8_t*>(dest);
1591 
1592   if (ystep == 64) {
1593     assert(!upsampled_left);
1594     const int width_bytes = width * sizeof(left[0]);
1595     int y = height;
1596     do {
1597       const uint16_t* left_ptr = left + 1;
1598       memcpy(dst, left_ptr, width_bytes);
1599       memcpy(dst + stride, left_ptr + 1, width_bytes);
1600       memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
1601       memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
1602       dst += 4 * stride;
1603       left_ptr += 4;
1604       y -= 4;
1605     } while (y != 0);
1606     return;
1607   }
1608   if (height == 4) {
1609     if (upsampled_left) {
1610       DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
1611     } else {
1612       DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
1613     }
1614   } else if (width == 4) {
1615     if (upsampled_left) {
1616       DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
1617     } else {
1618       DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
1619     }
1620   } else {
1621     if (upsampled_left) {
1622       // |upsampled_left| can only be true if |width| + |height| <= 16,
1623       // therefore this is 8x8.
1624       DirectionalZone3_8x8<true>(dst, stride, left, ystep);
1625     } else {
1626       DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
1627     }
1628   }
1629 }
1630 
1631 // -----------------------------------------------------------------------------
1632 // Zone2
1633 // This function deals with cases not found in zone 1 or zone 3. The extreme
1634 // angles are 93, which makes for sharp ascents along |left_column| with each
1635 // successive dest row element until reaching |top_row|, and 177, with a shallow
1636 // ascent up |left_column| until reaching large jumps along |top_row|. In the
1637 // extremely steep cases, source vectors can only be loaded one lane at a time.
1638 
1639 // Fill |left| and |right| with the appropriate values for a given |base_step|.
inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
                         const uint8x8_t left_step, const uint8x8_t right_step,
                         uint16x4_t* left, uint16x4_t* right) {
  // Load 32 bytes of source and gather pixels via byte-shuffle (vtbl).
  const auto* const src_bytes = static_cast<const uint8_t*>(source);
  const uint8x16x2_t mixed = {vld1q_u8(src_bytes), vld1q_u8(src_bytes + 16)};
  *left = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step));
  *right = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step));
}
1649 
// 8-lane variant: gathers two uint16x8_t vectors using four table lookups
// into the same 32 bytes of source.
inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
                         const uint8x8_t left_step_0,
                         const uint8x8_t right_step_0,
                         const uint8x8_t left_step_1,
                         const uint8x8_t right_step_1, uint16x8_t* left,
                         uint16x8_t* right) {
  const auto* const src_bytes = static_cast<const uint8_t*>(source);
  const uint8x16x2_t mixed = {vld1q_u8(src_bytes), vld1q_u8(src_bytes + 16)};
  *left = vcombine_u16(vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_0)),
                       vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_1)));
  *right = vcombine_u16(vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_0)),
                        vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_1)));
}
1669 
1670 // Blend two values based on weight pairs that each sum to 32.
inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
                                const uint16x4_t a_weight,
                                const uint16x4_t b_weight) {
  // (a * a_weight + b * b_weight + 16) >> 5, i.e. a rounded average with
  // per-lane weights summing to 32.
  const uint16x4_t weighted_sum =
      vmla_u16(vmul_u16(a, a_weight), b, b_weight);
  return vrshr_n_u16(weighted_sum, 5 /*log2(32)*/);
}
1679 
1680 // Because the source values "move backwards" as the row index increases, the
1681 // indices derived from ystep are generally negative in localized functions.
1682 // This is accommodated by making sure the relative indices are within [-15, 0]
1683 // when the function is called, and sliding them into the inclusive range
1684 // [0, 15], relative to a lower base address. 15 is the Pixel offset, so 30 is
1685 // the byte offset for table lookups.
1686 
// Pixel offset added to the relative indices so that values in [-15, 0] land
// in [0, 15] for the table lookups.
constexpr int kPositiveIndexOffsetPixels = 15;
// The same offset expressed in bytes (2 bytes per uint16_t pixel).
constexpr int kPositiveIndexOffsetBytes = 30;
1689 
// Computes a 4-wide strip of Zone 2 pred values sourced entirely from
// |left_column|. |left_y| holds the per-column fixed-point (1/64 pel)
// positions along the left edge; each output row advances the source base
// address by one (optionally upsampled) pixel.
inline void DirectionalZone2FromLeftCol_4xH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x4_t left_y,
    const bool upsampled) {
  const int upsample_shift = static_cast<int>(upsampled);

  const int index_scale_bits = 6;
  // The values in |offset_y| are negative, except for the first element, which
  // is zero.
  int16x4_t offset_y;
  int16x4_t shift_upsampled = left_y;
  // The shift argument must be a constant, otherwise use upsample_shift
  // directly.
  if (upsampled) {
    offset_y = vshr_n_s16(left_y, index_scale_bits - 1 /*upsample_shift*/);
    shift_upsampled = vshl_n_s16(shift_upsampled, 1);
  } else {
    offset_y = vshr_n_s16(left_y, index_scale_bits);
  }
  // Scale the pixel offsets to byte offsets (2 bytes per uint16_t pixel).
  offset_y = vshl_n_s16(offset_y, 1);

  // Select values to the left of the starting point.
  // The 15th element (and 16th) will be all the way at the end, to the
  // right. With a negative ystep everything else will be "left" of them.
  // This supports cumulative steps up to 15. We could support up to 16 by
  // doing separate loads for |left_values| and |right_values|. vtbl
  // supports 2 Q registers as input which would allow for cumulative
  // offsets of 32.
  // |sampler_0| indexes the first byte of each 16-bit value.
  const int16x4_t sampler_0 =
      vadd_s16(offset_y, vdup_n_s16(kPositiveIndexOffsetBytes));
  // |sampler_1| indexes the second byte of each 16-bit value.
  const int16x4_t sampler_1 = vadd_s16(sampler_0, vdup_n_s16(1));
  const int16x4x2_t sampler = vzip_s16(sampler_0, sampler_1);
  const uint8x8_t left_indices =
      vqmovun_s16(vcombine_s16(sampler.val[0], sampler.val[1]));
  const uint8x8_t right_indices =
      vadd_u8(left_indices, vdup_n_u8(sizeof(uint16_t)));

  // Per-column interpolation weights; |shift_0| + |shift_1| == 32.
  const int16x4_t shift_masked = vand_s16(shift_upsampled, vdup_n_s16(0x3f));
  const uint16x4_t shift_0 = vreinterpret_u16_s16(vshr_n_s16(shift_masked, 1));
  const uint16x4_t shift_1 = vsub_u16(vdup_n_u16(32), shift_0);

  int y = 0;
  do {
    uint16x4_t src_left, src_right;
    LoadStepwise(
        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
        left_indices, right_indices, &src_left, &src_right);
    const uint16x4_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);

    Store4(dst, val);
    dst += stride;
  } while (++y < height);
}
1745 
// 8-wide variant of DirectionalZone2FromLeftCol_4xH: computes an 8x8 block of
// pred values sourced entirely from |left_column|, using byte-shuffle gathers
// driven by per-column offsets derived from |left_y|.
inline void DirectionalZone2FromLeftCol_8x8(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
    const bool upsampled) {
  const int upsample_shift = static_cast<int>(upsampled);

  const int index_scale_bits = 6;
  // The values in |offset_y| are negative, except for the first element, which
  // is zero.
  int16x8_t offset_y;
  int16x8_t shift_upsampled = left_y;
  // The shift argument must be a constant, otherwise use upsample_shift
  // directly.
  if (upsampled) {
    offset_y = vshrq_n_s16(left_y, index_scale_bits - 1);
    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
  } else {
    offset_y = vshrq_n_s16(left_y, index_scale_bits);
  }
  // Scale the pixel offsets to byte offsets (2 bytes per uint16_t pixel).
  offset_y = vshlq_n_s16(offset_y, 1);

  // Select values to the left of the starting point.
  // The 15th element (and 16th) will be all the way at the end, to the right.
  // With a negative ystep everything else will be "left" of them.
  // This supports cumulative steps up to 15. We could support up to 16 by doing
  // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
  // registers as input which would allow for cumulative offsets of 32.
  // |sampler_0| indexes the first byte of each 16-bit value.
  const int16x8_t sampler_0 =
      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffsetBytes));
  // |sampler_1| indexes the second byte of each 16-bit value.
  const int16x8_t sampler_1 = vaddq_s16(sampler_0, vdupq_n_s16(1));
  const int16x8x2_t sampler = vzipq_s16(sampler_0, sampler_1);
  const uint8x8_t left_values_0 = vqmovun_s16(sampler.val[0]);
  const uint8x8_t left_values_1 = vqmovun_s16(sampler.val[1]);
  const uint8x8_t right_values_0 =
      vadd_u8(left_values_0, vdup_n_u8(sizeof(uint16_t)));
  const uint8x8_t right_values_1 =
      vadd_u8(left_values_1, vdup_n_u8(sizeof(uint16_t)));

  // Per-column interpolation weights; |shift_0| + |shift_1| == 32.
  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
  const uint16x8_t shift_0 =
      vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
  const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);

  for (int y = 0; y < 8; ++y) {
    uint16x8_t src_left, src_right;
    LoadStepwise(
        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
        left_values_0, right_values_0, left_values_1, right_values_1, &src_left,
        &src_right);
    const uint16x8_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);

    Store8(dst, val);
    dst += stride;
  }
}
1803 
template <bool upsampled>
inline void DirectionalZone1Blend_4xH(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
    const int xstep) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits_x = 6 - upsample_shift;

  // Positions along the row, compared against |zone_bounds| to find the
  // blending boundary.
  const int16x4_t indices = {0, 1, 2, 3};

  uint16x4x2_t top_vals;
  for (int rows_left = height; rows_left != 0; --rows_left) {
    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
    LoadEdgeVals(&top_vals, src, upsampled);

    // Interpolation weights along the top row; weight_0 + weight_1 == 32.
    const uint16_t weight_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
    const uint16_t weight_1 = 32 - weight_0;

    const uint16x4_t pred =
        WeightedBlend(top_vals.val[0], top_vals.val[1], weight_1, weight_0);

    const uint16x4_t existing = Load4U16(dest);
    // |zone_bounds| values can be negative.
    const uint16x4_t use_pred =
        vcge_s16(indices, vdup_n_s16(zone_bounds >> 6));
    Store4(dest, vbsl_u16(use_pred, pred, existing));

    dest += stride;
    zone_bounds += xstep;
    top_x -= xstep;
  }
}
1839 
template <bool upsampled>
inline void DirectionalZone1Blend_8x8(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
    const int xstep) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits_x = 6 - upsample_shift;

  // Positions along the row, compared against |zone_bounds| to find the
  // blending boundary.
  const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};

  uint16x8x2_t top_vals;
  int rows_left = 8;
  do {
    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
    LoadEdgeVals(&top_vals, src, upsampled);

    // Interpolation weights along the top row; weight_0 + weight_1 == 32.
    const uint16_t weight_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
    const uint16_t weight_1 = 32 - weight_0;

    const uint16x8_t pred =
        WeightedBlend(top_vals.val[0], top_vals.val[1], weight_1, weight_0);

    const uint16x8_t existing = Load8U16(dest);
    // |zone_bounds| values can be negative.
    const uint16x8_t use_pred =
        vcgeq_s16(indices, vdupq_n_s16(zone_bounds >> 6));
    Store8(dest, vbslq_u16(use_pred, pred, existing));

    dest += stride;
    zone_bounds += xstep;
    top_x -= xstep;
  } while (--rows_left != 0);
}
1874 
// 7.11.2.4 (8) 90 < angle < 180
1876 // The strategy for these functions (4xH and 8+xH) is to know how many blocks
1877 // can be processed with just pixels from |top_ptr|, then handle mixed blocks,
1878 // then handle only blocks that take from |left_ptr|. Additionally, a fast
1879 // index-shuffle approach is used for pred values from |left_column| in sections
1880 // that permit it.
template <bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_4xH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);

  // Helper vector for index computation.
  const int16x4_t zero_to_three = {0, 1, 2, 3};

  // Loop increments for moving by block (4xN). Vertical still steps by 8. If
  // it's only 4, it will be finished in the first iteration.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  const int min_height = (height == 4) ? 4 : 8;

  // All columns from |min_top_only_x| to the right will only need |top_row| to
  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
  // at least 3.
  assert(xstep >= 3);

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  // Whole-pixel and fractional (1/64 pel) parts of |ystep|, used to position
  // reads from |left_column|.
  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. The following values need the full ystep as a relative offset.
  const int16x4_t left_y =
      vmla_n_s16(vdup_n_s16(-ystep_remainder), zero_to_three, -ystep);

  // This loop treats the 4 columns in 3 stages with y-value boundaries.
  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops, covers
  // blocks that have a mixture of values computed from top or left. The final
  // stage covers blocks that are only computed from the left.
  // Round down to the nearest multiple of 8 (or 4, if height is 4).
  const int max_top_only_y =
      std::min((1 << 6) / xstep, height) & ~(min_height - 1);
  // Stage 1: rows sourced purely from |top_row|. The stride is halved because
  // this helper takes a uint16_t* destination.
  DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
                                      stride >> 1, max_top_only_y, top_row,
                                      -xstep);

  if (max_top_only_y == height) return;

  int y = max_top_only_y;
  dst += stride * y;
  const int xstep_y = xstep * y;

  // All rows from |min_left_only_y| down for this set of columns only need
  // |left_column| to compute.
  const int min_left_only_y = std::min((4 /*width*/ << 6) / xstep, height);
  int xstep_bounds = xstep_bounds_base + xstep_y;
  int top_x = -xstep - xstep_y;

  // Stage 2: mixed rows. Fill from |left_column| first, then overwrite the
  // top-sourced region with a masked blend.
  // +8 increment is OK because if height is 4 this only runs once.
  for (; y < min_left_only_y;
       y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
    DirectionalZone2FromLeftCol_4xH(
        dst, stride, min_height,
        left_column + ((y - left_base_increment) << upsample_left_shift),
        left_y, upsampled_left);

    DirectionalZone1Blend_4xH<upsampled_top>(dst, stride, min_height, top_row,
                                             xstep_bounds, top_x, xstep);
  }

  // Stage 3: left-only section. |height| - |y| is assumed equivalent to:
  // (y == 0) && (height == 4)
  if (height - y == 4) {
    DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep);
    return;
  }
  if (y < height) {
    DirectionalZone3_4xH<upsampled_left>(
        dst, stride, height - y, left_column + (y << upsample_left_shift),
        -ystep);
  }
}
1964 
1965 // Process 8x4 and 16x4 blocks. This avoids a lot of overhead and simplifies
1966 // address safety.
1967 template <bool upsampled_top, bool upsampled_left>
DirectionalZone2_Wx4(uint8_t * LIBGAV1_RESTRICT const dst,const ptrdiff_t stride,const uint16_t * LIBGAV1_RESTRICT const top_row,const uint16_t * LIBGAV1_RESTRICT const left_column,const int width,const int xstep,const int ystep)1968 inline void DirectionalZone2_Wx4(
1969     uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
1970     const uint16_t* LIBGAV1_RESTRICT const top_row,
1971     const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
1972     const int xstep, const int ystep) {
1973   const int upsample_top_shift = static_cast<int>(upsampled_top);
1974   // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
1975   int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
1976 
1977   const int min_top_only_x = std::min((4 * xstep) >> 6, width);
1978   int x = 0;
1979   for (; x < min_top_only_x; x += 4, xstep_bounds_base -= (4 << 6)) {
1980     uint8_t* dst_x = dst + x * sizeof(uint16_t);
1981 
1982     // Round down to the nearest multiple of 4.
1983     const int max_top_only_y = (((x + 1) << 6) / xstep) & ~3;
1984     if (max_top_only_y != 0) {
1985       DirectionalZone1_4xH<upsampled_top>(
1986           reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 4,
1987           top_row + (x << upsample_top_shift), -xstep);
1988       continue;
1989     }
1990 
1991     DirectionalZone3_4x4<upsampled_left>(dst_x, stride, left_column, -ystep,
1992                                          -ystep * x);
1993 
1994     const int min_left_only_y = ((x + 4) << 6) / xstep;
1995     if (min_left_only_y != 0) {
1996       const int top_x = -xstep;
1997       DirectionalZone1Blend_4xH<upsampled_top>(
1998           dst_x, stride, 4, top_row + (x << upsample_top_shift),
1999           xstep_bounds_base, top_x, xstep);
2000     }
2001   }
2002   // Reached |min_top_only_x|.
2003   for (; x < width; x += 4) {
2004     DirectionalZone1_4xH<upsampled_top>(
2005         reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, 4,
2006         top_row + (x << upsample_top_shift), -xstep);
2007   }
2008 }
2009 
// Computes one 8-wide column strip of a zone 2 block, anchored at column |x|.
// The strip is produced in three stages: rows computed only from |top_row|,
// rows that blend both edges, and rows computed only from |left_column|.
// When |shuffle_left_column| is true the left-edge rows are gathered with
// byte shuffles from a single vector of |left_column| values; otherwise the
// zone 3 path is used. |left_y| carries per-lane y offsets in 1/64 units and
// |left_offset| is the whole-pixel base adjustment that pairs with it.
template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_8xH(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep, const int x, const int left_offset,
    const int xstep_bounds_base, const int16x8_t left_y) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Loop incrementers for moving by block (8x8). This function handles blocks
  // with height 4 as well. They are calculated in one pass so these variables
  // do not get used.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops, covers
  // blocks that have a mixture of values computed from top or left. The final
  // stage covers blocks that are only computed from the left.
  uint8_t* dst_x = dst + x * sizeof(uint16_t);
  // Round down to the nearest multiple of 8.
  const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
  DirectionalZone1_WxH<upsampled_top>(
      reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
      top_row + (x << upsample_top_shift), -xstep);

  if (max_top_only_y == height) return;

  int y = max_top_only_y;
  dst_x += stride * y;
  const int xstep_y = xstep * y;

  // All rows from |min_left_only_y| down for this set of columns only need
  // |left_column| to compute. Round up to the nearest 8.
  const int min_left_only_y =
      Align(std::min(((x + 8) << 6) / xstep, height), 8);
  int xstep_bounds = xstep_bounds_base + xstep_y;
  int top_x = -xstep - xstep_y;

  // Mixed stage: fill each 8x8 block from the left edge, then blend in the
  // top-row contribution where the projection dictates.
  for (; y < min_left_only_y;
       y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
    if (shuffle_left_column) {
      DirectionalZone2FromLeftCol_8x8(
          dst_x, stride,
          left_column + ((left_offset + y) << upsample_left_shift), left_y,
          upsampled_left);
    } else {
      DirectionalZone3_8x8<upsampled_left>(
          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
          -ystep * x);
    }

    DirectionalZone1Blend_8x8<upsampled_top>(
        dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x,
        xstep);
  }

  // Loop over y for left_only rows.
  for (; y < height; y += 8, dst_x += stride8) {
    DirectionalZone3_8x8<upsampled_left>(
        dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
        -ystep * x);
  }
}
2075 
// Process a multiple of 8 |width|. Columns are handled 8 at a time by
// DirectionalZone2_8xH; any remaining top-only columns on the right are
// handled by DirectionalZone1_WxH. Heights of 4 divert to the Wx4 variant.
template <bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_NEON(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int xstep, const int ystep) {
  if (height == 4) {
    DirectionalZone2_Wx4<upsampled_top, upsampled_left>(
        dst, stride, top_row, left_column, width, xstep, ystep);
    return;
  }
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Helper vector.
  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};

  const int ystep8 = ystep << 3;

  // All columns from |min_top_only_x| to the right will only need |top_row| to
  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
  // at least 3.
  assert(xstep >= 3);
  const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8);
  // Analysis finds that, for most angles (ystep < 132), all segments that use
  // both top_row and left_column can compute from left_column using byte
  // shuffles from a single vector. For steeper angles, the shuffle is also
  // fully reliable when x >= 32.
  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  // Whole-pixel (>> 6) and fractional (& 0x3F) components of |ystep| and of
  // 8 * |ystep|, with 64 units per pixel.
  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  const int left_base_increment8 = ystep8 >> 6;
  const int ystep_remainder8 = ystep8 & 0x3F;
  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. Following values need the full ystep as a relative offset.
  int16x8_t left_y =
      vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);

  // Columns left of |min_shuffle_x| cannot use the left-column shuffle yet;
  // DirectionalZone2_8xH falls back to the zone 3 path for their left rows.
  int x = 0;
  for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
           xstep_bounds_base -= (8 << 6),
           left_y = vsubq_s16(left_y, increment_left8),
           left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<false, upsampled_top, upsampled_left>(
        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
        xstep_bounds_base, left_y);
  }
  // NOTE(review): |left_offset| restarts at -left_base_increment here while
  // |left_y| keeps the remainders accumulated by the loop above. If the first
  // loop executed at all (ystep >= 132 and min_top_only_x > 32), confirm the
  // whole-pixel offsets dropped by this reset are accounted for elsewhere.
  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
           xstep_bounds_base -= (8 << 6),
           left_y = vsubq_s16(left_y, increment_left8),
           left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<true, upsampled_top, upsampled_left>(
        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
        xstep_bounds_base, left_y);
  }
  // Reached |min_top_only_x|.
  if (x < width) {
    DirectionalZone1_WxH<upsampled_top>(
        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, width - x, height,
        top_row + (x << upsample_top_shift), -xstep);
  }
}
2147 
2148 // At this angle, neither edges are upsampled.
2149 // |min_width| is either 4 or 8.
2150 template <int min_width>
DirectionalAngle135(uint8_t * LIBGAV1_RESTRICT dst,const ptrdiff_t stride,const uint16_t * LIBGAV1_RESTRICT const top,const uint16_t * LIBGAV1_RESTRICT const left,const int width,const int height)2151 void DirectionalAngle135(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
2152                          const uint16_t* LIBGAV1_RESTRICT const top,
2153                          const uint16_t* LIBGAV1_RESTRICT const left,
2154                          const int width, const int height) {
2155   // y = 0 is more trivial than the other rows.
2156   memcpy(dst, top - 1, width * sizeof(top[0]));
2157   dst += stride;
2158 
2159   // If |height| > |width|, then there is a point at which top_row is no longer
2160   // used in each row.
2161   const int min_left_only_y = std::min(width, height);
2162 
2163   int y = 1;
2164   do {
2165     // Example: If y is 4 (min_width), the dest row starts with left[3],
2166     // left[2], left[1], left[0], because the angle points up. Therefore, load
2167     // starts at left[0] and is then reversed. If y is 2, the load starts at
2168     // left[-2], and is reversed to store left[1], left[0], with negative values
2169     // overwritten from |top_row|.
2170     const uint16_t* const load_left = left + y - min_width;
2171     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
2172 
2173     // Some values will be overwritten when |y| is not a multiple of
2174     // |min_width|.
2175     if (min_width == 4) {
2176       const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left));
2177       vst1_u16(dst16, left_toward_corner);
2178     } else {
2179       int x = 0;
2180       do {
2181         const uint16x8_t left_toward_corner =
2182             vrev64q_u16(vld1q_u16(load_left - x));
2183         vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
2184         vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
2185         x += 8;
2186       } while (x < y);
2187     }
2188     // Entering |top|.
2189     memcpy(dst16 + y, top - 1, (width - y) * sizeof(top[0]));
2190     dst += stride;
2191   } while (++y < min_left_only_y);
2192 
2193   // Left only.
2194   for (; y < height; ++y, dst += stride) {
2195     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
2196     const uint16_t* const load_left = left + y - min_width;
2197 
2198     int x = 0;
2199     if (min_width == 4) {
2200       const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left - x));
2201       vst1_u16(dst16 + x, left_toward_corner);
2202     } else {
2203       do {
2204         const uint16x8_t left_toward_corner =
2205             vrev64q_u16(vld1q_u16(load_left - x));
2206         vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
2207         vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
2208         x += 8;
2209       } while (x < width);
2210     }
2211   }
2212 }
2213 
DirectionalIntraPredictorZone2_NEON(void * LIBGAV1_RESTRICT dest,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column,const int width,const int height,const int xstep,const int ystep,const bool upsampled_top,const bool upsampled_left)2214 void DirectionalIntraPredictorZone2_NEON(
2215     void* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
2216     const void* LIBGAV1_RESTRICT const top_row,
2217     const void* LIBGAV1_RESTRICT const left_column, const int width,
2218     const int height, const int xstep, const int ystep,
2219     const bool upsampled_top, const bool upsampled_left) {
2220   // Increasing the negative buffer for this function allows more rows to be
2221   // processed at a time without branching in an inner loop to check the base.
2222   uint16_t top_buffer[288];
2223   uint16_t left_buffer[288];
2224 #if LIBGAV1_MSAN
2225   memset(top_buffer, 0, sizeof(top_buffer));
2226   memset(left_buffer, 0, sizeof(left_buffer));
2227 #endif  // LIBGAV1_MSAN
2228   memcpy(top_buffer + 128, static_cast<const uint16_t*>(top_row) - 16, 160);
2229   memcpy(left_buffer + 128, static_cast<const uint16_t*>(left_column) - 16,
2230          160);
2231   const uint16_t* top_ptr = top_buffer + 144;
2232   const uint16_t* left_ptr = left_buffer + 144;
2233   auto* dst = static_cast<uint8_t*>(dest);
2234 
2235   if (width == 4) {
2236     if (xstep == 64) {
2237       assert(ystep == 64);
2238       DirectionalAngle135<4>(dst, stride, top_ptr, left_ptr, width, height);
2239       return;
2240     }
2241     if (upsampled_top) {
2242       if (upsampled_left) {
2243         DirectionalZone2_4xH<true, true>(dst, stride, top_ptr, left_ptr, height,
2244                                          xstep, ystep);
2245       } else {
2246         DirectionalZone2_4xH<true, false>(dst, stride, top_ptr, left_ptr,
2247                                           height, xstep, ystep);
2248       }
2249     } else if (upsampled_left) {
2250       DirectionalZone2_4xH<false, true>(dst, stride, top_ptr, left_ptr, height,
2251                                         xstep, ystep);
2252     } else {
2253       DirectionalZone2_4xH<false, false>(dst, stride, top_ptr, left_ptr, height,
2254                                          xstep, ystep);
2255     }
2256     return;
2257   }
2258 
2259   if (xstep == 64) {
2260     assert(ystep == 64);
2261     DirectionalAngle135<8>(dst, stride, top_ptr, left_ptr, width, height);
2262     return;
2263   }
2264   if (upsampled_top) {
2265     if (upsampled_left) {
2266       DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width,
2267                                         height, xstep, ystep);
2268     } else {
2269       DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width,
2270                                          height, xstep, ystep);
2271     }
2272   } else if (upsampled_left) {
2273     DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width,
2274                                        height, xstep, ystep);
2275   } else {
2276     DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width,
2277                                         height, xstep, ystep);
2278   }
2279 }
2280 
Init10bpp()2281 void Init10bpp() {
2282   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
2283   assert(dsp != nullptr);
2284   dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
2285   dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
2286   dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
2287 }
2288 
2289 }  // namespace
2290 }  // namespace high_bitdepth
2291 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
2292 
// Registers the NEON directional intra predictors for every bitdepth
// compiled into this build.
void IntraPredDirectionalInit_NEON() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
}
2299 
2300 }  // namespace dsp
2301 }  // namespace libgav1
2302 
2303 #else   // !LIBGAV1_ENABLE_NEON
2304 namespace libgav1 {
2305 namespace dsp {
2306 
IntraPredDirectionalInit_NEON()2307 void IntraPredDirectionalInit_NEON() {}
2308 
2309 }  // namespace dsp
2310 }  // namespace libgav1
2311 #endif  // LIBGAV1_ENABLE_NEON
2312