xref: /aosp_15_r20/external/libgav1/src/dsp/arm/intrapred_smooth_neon.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/intrapred_smooth.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_NEON
19 
20 #include <arm_neon.h>
21 
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25 
26 #include "src/dsp/arm/common_neon.h"
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/utils/common.h"
30 #include "src/utils/constants.h"
31 
32 namespace libgav1 {
33 namespace dsp {
34 namespace low_bitdepth {
35 namespace {
36 
37 // Note these constants are duplicated from intrapred.cc to allow the compiler
38 // to have visibility of the values. This helps reduce loads and aids in the
39 // creation of the inverse weights.
// The predictors in this namespace index the table by block dimension: the
// weights for a dimension d are read starting at kSmoothWeights + d - 4.
40 constexpr uint8_t kSmoothWeights[] = {
41 #include "src/dsp/smooth_weights.inc"
42 };
43 
44 // 256 - v = vneg_s8(v)
NegateS8(const uint8x8_t v)45 inline uint8x8_t NegateS8(const uint8x8_t v) {
46   return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
47 }
48 
49 template <int height>
Smooth4xN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)50 void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
51                     const void* LIBGAV1_RESTRICT const top_row,
52                     const void* LIBGAV1_RESTRICT const left_column) {
53   constexpr int width = 4;
54   const auto* const top = static_cast<const uint8_t*>(top_row);
55   const auto* const left = static_cast<const uint8_t*>(left_column);
56   const uint8_t top_right = top[width - 1];
57   const uint8_t bottom_left = left[height - 1];
58   const uint8_t* const weights_y = kSmoothWeights + height - 4;
59   auto* dst = static_cast<uint8_t*>(dest);
60 
61   const uint8x8_t top_v = Load4(top);
62   const uint8x8_t top_right_v = vdup_n_u8(top_right);
63   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
64   const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4);
65   const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
66   const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
67 
68   for (int y = 0; y < height; ++y) {
69     const uint8x8_t left_v = vdup_n_u8(left[y]);
70     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
71     const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
72     const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
73     const uint16x8_t weighted_top_bl =
74         vmlal_u8(weighted_bl, weights_y_v, top_v);
75     const uint16x8_t weighted_left_tr =
76         vmlal_u8(weighted_tr, weights_x_v, left_v);
77     // Maximum value of each parameter: 0xFF00
78     const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
79     const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale);
80 
81     StoreLo4(dst, result);
82     dst += stride;
83   }
84 }
85 
CalculatePred(const uint16x8_t weighted_top_bl,const uint16x8_t weighted_left_tr)86 inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl,
87                                const uint16x8_t weighted_left_tr) {
88   // Maximum value of each parameter: 0xFF00
89   const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
90   return vrshrn_n_u16(avg, kSmoothWeightScale);
91 }
92 
// Produces 8 smooth-predicted pixels for a single row.
//   top, left:          top-row pixels and the (broadcast) left pixel.
//   weighted_tr:        precomputed scaled_weights_x * top_right.
//   bottom_left:        broadcast bottom-left pixel.
//   weights_x:          horizontal weights for these 8 columns.
//   weights_y, scaled_weights_y: broadcast vertical weight for this row and
//                       the matching inverse weight.
inline uint8x8_t CalculateWeightsAndPred(
    const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
    const uint8x8_t bottom_left, const uint8x8_t weights_x,
    const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
  // Horizontal sum: left * weights_x accumulated onto the top-right term.
  const uint16x8_t horizontal_sum = vmlal_u8(weighted_tr, weights_x, left);
  // Vertical sum: top * weights_y + bottom_left * scaled_weights_y.
  const uint16x8_t vertical_sum =
      vmlal_u8(vmull_u8(weights_y, top), scaled_weights_y, bottom_left);
  return CalculatePred(vertical_sum, horizontal_sum);
}
103 
104 template <int height>
Smooth8xN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)105 void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
106                     const void* LIBGAV1_RESTRICT const top_row,
107                     const void* LIBGAV1_RESTRICT const left_column) {
108   constexpr int width = 8;
109   const auto* const top = static_cast<const uint8_t*>(top_row);
110   const auto* const left = static_cast<const uint8_t*>(left_column);
111   const uint8_t top_right = top[width - 1];
112   const uint8_t bottom_left = left[height - 1];
113   const uint8_t* const weights_y = kSmoothWeights + height - 4;
114   auto* dst = static_cast<uint8_t*>(dest);
115 
116   const uint8x8_t top_v = vld1_u8(top);
117   const uint8x8_t top_right_v = vdup_n_u8(top_right);
118   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
119   const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
120   const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
121   const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
122 
123   for (int y = 0; y < height; ++y) {
124     const uint8x8_t left_v = vdup_n_u8(left[y]);
125     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
126     const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
127     const uint8x8_t result =
128         CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v,
129                                 weights_x_v, scaled_weights_y, weights_y_v);
130 
131     vst1_u8(dst, result);
132     dst += stride;
133   }
134 }
135 
// Produces 16 smooth-predicted pixels for a single row by processing the low
// and high 8 lanes separately and recombining them.
//   top:                16 top-row pixels.
//   left, top_right:    broadcast left pixel for this row / top-right pixel.
//   weights_y:          broadcast vertical weight for this row.
//   weights_x, scaled_weights_x: horizontal weights for these 16 columns and
//                       their inverse weights.
//   weighted_bl:        precomputed inverse vertical weight * bottom_left.
inline uint8x16_t CalculateWeightsAndPred(
    const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
    const uint8x8_t weights_y, const uint8x16_t weights_x,
    const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
  // Columns 0..7.
  const uint16x8_t vertical_lo =
      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
  const uint16x8_t horizontal_lo =
      vmlal_u8(vmull_u8(vget_low_u8(weights_x), left),
               vget_low_u8(scaled_weights_x), top_right);
  const uint8x8_t pred_lo = CalculatePred(vertical_lo, horizontal_lo);

  // Columns 8..15.
  const uint16x8_t vertical_hi =
      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
  const uint16x8_t horizontal_hi =
      vmlal_u8(vmull_u8(vget_high_u8(weights_x), left),
               vget_high_u8(scaled_weights_x), top_right);
  const uint8x8_t pred_hi = CalculatePred(vertical_hi, horizontal_hi);

  return vcombine_u8(pred_lo, pred_hi);
}
158 
159 // 256 - v = vneg_s8(v)
NegateS8(const uint8x16_t v)160 inline uint8x16_t NegateS8(const uint8x16_t v) {
161   return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
162 }
163 
164 template <int width, int height>
Smooth16PlusxN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)165 void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
166                          const void* LIBGAV1_RESTRICT const top_row,
167                          const void* LIBGAV1_RESTRICT const left_column) {
168   const auto* const top = static_cast<const uint8_t*>(top_row);
169   const auto* const left = static_cast<const uint8_t*>(left_column);
170   const uint8_t top_right = top[width - 1];
171   const uint8_t bottom_left = left[height - 1];
172   const uint8_t* const weights_y = kSmoothWeights + height - 4;
173   auto* dst = static_cast<uint8_t*>(dest);
174 
175   uint8x16_t top_v[4];
176   top_v[0] = vld1q_u8(top);
177   if (width > 16) {
178     top_v[1] = vld1q_u8(top + 16);
179     if (width == 64) {
180       top_v[2] = vld1q_u8(top + 32);
181       top_v[3] = vld1q_u8(top + 48);
182     }
183   }
184 
185   const uint8x8_t top_right_v = vdup_n_u8(top_right);
186   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
187 
188   uint8x16_t weights_x_v[4];
189   weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
190   if (width > 16) {
191     weights_x_v[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
192     if (width == 64) {
193       weights_x_v[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
194       weights_x_v[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
195     }
196   }
197 
198   uint8x16_t scaled_weights_x[4];
199   scaled_weights_x[0] = NegateS8(weights_x_v[0]);
200   if (width > 16) {
201     scaled_weights_x[1] = NegateS8(weights_x_v[1]);
202     if (width == 64) {
203       scaled_weights_x[2] = NegateS8(weights_x_v[2]);
204       scaled_weights_x[3] = NegateS8(weights_x_v[3]);
205     }
206   }
207 
208   for (int y = 0; y < height; ++y) {
209     const uint8x8_t left_v = vdup_n_u8(left[y]);
210     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
211     const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
212     const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
213 
214     vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
215                                           weights_y_v, weights_x_v[0],
216                                           scaled_weights_x[0], weighted_bl));
217 
218     if (width > 16) {
219       vst1q_u8(dst + 16, CalculateWeightsAndPred(
220                              top_v[1], left_v, top_right_v, weights_y_v,
221                              weights_x_v[1], scaled_weights_x[1], weighted_bl));
222       if (width == 64) {
223         vst1q_u8(dst + 32,
224                  CalculateWeightsAndPred(top_v[2], left_v, top_right_v,
225                                          weights_y_v, weights_x_v[2],
226                                          scaled_weights_x[2], weighted_bl));
227         vst1q_u8(dst + 48,
228                  CalculateWeightsAndPred(top_v[3], left_v, top_right_v,
229                                          weights_y_v, weights_x_v[3],
230                                          scaled_weights_x[3], weighted_bl));
231       }
232     }
233 
234     dst += stride;
235   }
236 }
237 
238 template <int width, int height>
SmoothVertical4Or8xN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)239 void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
240                                ptrdiff_t stride,
241                                const void* LIBGAV1_RESTRICT const top_row,
242                                const void* LIBGAV1_RESTRICT const left_column) {
243   const auto* const top = static_cast<const uint8_t*>(top_row);
244   const auto* const left = static_cast<const uint8_t*>(left_column);
245   const uint8_t bottom_left = left[height - 1];
246   const uint8_t* const weights_y = kSmoothWeights + height - 4;
247   auto* dst = static_cast<uint8_t*>(dest);
248 
249   uint8x8_t top_v;
250   if (width == 4) {
251     top_v = Load4(top);
252   } else {  // width == 8
253     top_v = vld1_u8(top);
254   }
255 
256   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
257 
258   for (int y = 0; y < height; ++y) {
259     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
260     const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
261 
262     const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
263     const uint16x8_t weighted_top_bl =
264         vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);
265     const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale);
266 
267     if (width == 4) {
268       StoreLo4(dst, pred);
269     } else {  // width == 8
270       vst1_u8(dst, pred);
271     }
272     dst += stride;
273   }
274 }
275 
// Produces 16 smooth-vertical pixels for a single row.
//   top:         16 top-row pixels.
//   weights_y:   broadcast vertical weight for this row.
//   weighted_bl: precomputed inverse vertical weight * bottom_left.
// The low and high 8 lanes are accumulated and narrowed separately, then
// recombined.
inline uint8x16_t CalculateVerticalWeightsAndPred(
    const uint8x16_t top, const uint8x8_t weights_y,
    const uint16x8_t weighted_bl) {
  const uint16x8_t sum_lo = vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
  const uint16x8_t sum_hi =
      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
  return vcombine_u8(vrshrn_n_u16(sum_lo, kSmoothWeightScale),
                     vrshrn_n_u16(sum_hi, kSmoothWeightScale));
}
288 
289 template <int width, int height>
SmoothVertical16PlusxN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)290 void SmoothVertical16PlusxN_NEON(
291     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
292     const void* LIBGAV1_RESTRICT const top_row,
293     const void* LIBGAV1_RESTRICT const left_column) {
294   const auto* const top = static_cast<const uint8_t*>(top_row);
295   const auto* const left = static_cast<const uint8_t*>(left_column);
296   const uint8_t bottom_left = left[height - 1];
297   const uint8_t* const weights_y = kSmoothWeights + height - 4;
298   auto* dst = static_cast<uint8_t*>(dest);
299 
300   uint8x16_t top_v[4];
301   top_v[0] = vld1q_u8(top);
302   if (width > 16) {
303     top_v[1] = vld1q_u8(top + 16);
304     if (width == 64) {
305       top_v[2] = vld1q_u8(top + 32);
306       top_v[3] = vld1q_u8(top + 48);
307     }
308   }
309 
310   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
311 
312   for (int y = 0; y < height; ++y) {
313     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
314     const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
315     const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
316 
317     const uint8x16_t pred_0 =
318         CalculateVerticalWeightsAndPred(top_v[0], weights_y_v, weighted_bl);
319     vst1q_u8(dst, pred_0);
320 
321     if (width > 16) {
322       const uint8x16_t pred_1 =
323           CalculateVerticalWeightsAndPred(top_v[1], weights_y_v, weighted_bl);
324       vst1q_u8(dst + 16, pred_1);
325 
326       if (width == 64) {
327         const uint8x16_t pred_2 =
328             CalculateVerticalWeightsAndPred(top_v[2], weights_y_v, weighted_bl);
329         vst1q_u8(dst + 32, pred_2);
330 
331         const uint8x16_t pred_3 =
332             CalculateVerticalWeightsAndPred(top_v[3], weights_y_v, weighted_bl);
333         vst1q_u8(dst + 48, pred_3);
334       }
335     }
336 
337     dst += stride;
338   }
339 }
340 
341 template <int width, int height>
SmoothHorizontal4Or8xN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)342 void SmoothHorizontal4Or8xN_NEON(
343     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
344     const void* LIBGAV1_RESTRICT const top_row,
345     const void* LIBGAV1_RESTRICT const left_column) {
346   const auto* const top = static_cast<const uint8_t*>(top_row);
347   const auto* const left = static_cast<const uint8_t*>(left_column);
348   const uint8_t top_right = top[width - 1];
349   auto* dst = static_cast<uint8_t*>(dest);
350 
351   const uint8x8_t top_right_v = vdup_n_u8(top_right);
352   // Over-reads for 4xN but still within the array.
353   const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
354   const uint8x8_t scaled_weights_x = NegateS8(weights_x);
355   const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
356 
357   for (int y = 0; y < height; ++y) {
358     const uint8x8_t left_v = vdup_n_u8(left[y]);
359     const uint16x8_t weighted_left_tr =
360         vmlal_u8(weighted_tr, weights_x, left_v);
361     const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale);
362 
363     if (width == 4) {
364       StoreLo4(dst, pred);
365     } else {  // width == 8
366       vst1_u8(dst, pred);
367     }
368     dst += stride;
369   }
370 }
371 
// Produces 16 smooth-horizontal pixels for a single row.
//   left, top_right:  broadcast left pixel for this row / top-right pixel.
//   weights_x, scaled_weights_x: horizontal weights for these 16 columns and
//                     their inverse weights.
// The low and high 8 lanes are computed and narrowed separately, then
// recombined.
inline uint8x16_t CalculateHorizontalWeightsAndPred(
    const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
    const uint8x16_t scaled_weights_x) {
  // Columns 0..7.
  const uint16x8_t sum_lo =
      vmlal_u8(vmull_u8(vget_low_u8(weights_x), left),
               vget_low_u8(scaled_weights_x), top_right);
  // Columns 8..15.
  const uint16x8_t sum_hi =
      vmlal_u8(vmull_u8(vget_high_u8(weights_x), left),
               vget_high_u8(scaled_weights_x), top_right);

  return vcombine_u8(vrshrn_n_u16(sum_lo, kSmoothWeightScale),
                     vrshrn_n_u16(sum_hi, kSmoothWeightScale));
}
389 
390 template <int width, int height>
SmoothHorizontal16PlusxN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)391 void SmoothHorizontal16PlusxN_NEON(
392     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
393     const void* LIBGAV1_RESTRICT const top_row,
394     const void* LIBGAV1_RESTRICT const left_column) {
395   const auto* const top = static_cast<const uint8_t*>(top_row);
396   const auto* const left = static_cast<const uint8_t*>(left_column);
397   const uint8_t top_right = top[width - 1];
398   auto* dst = static_cast<uint8_t*>(dest);
399 
400   const uint8x8_t top_right_v = vdup_n_u8(top_right);
401 
402   uint8x16_t weights_x[4];
403   weights_x[0] = vld1q_u8(kSmoothWeights + width - 4);
404   if (width > 16) {
405     weights_x[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
406     if (width == 64) {
407       weights_x[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
408       weights_x[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
409     }
410   }
411 
412   uint8x16_t scaled_weights_x[4];
413   scaled_weights_x[0] = NegateS8(weights_x[0]);
414   if (width > 16) {
415     scaled_weights_x[1] = NegateS8(weights_x[1]);
416     if (width == 64) {
417       scaled_weights_x[2] = NegateS8(weights_x[2]);
418       scaled_weights_x[3] = NegateS8(weights_x[3]);
419     }
420   }
421 
422   for (int y = 0; y < height; ++y) {
423     const uint8x8_t left_v = vdup_n_u8(left[y]);
424 
425     const uint8x16_t pred_0 = CalculateHorizontalWeightsAndPred(
426         left_v, top_right_v, weights_x[0], scaled_weights_x[0]);
427     vst1q_u8(dst, pred_0);
428 
429     if (width > 16) {
430       const uint8x16_t pred_1 = CalculateHorizontalWeightsAndPred(
431           left_v, top_right_v, weights_x[1], scaled_weights_x[1]);
432       vst1q_u8(dst + 16, pred_1);
433 
434       if (width == 64) {
435         const uint8x16_t pred_2 = CalculateHorizontalWeightsAndPred(
436             left_v, top_right_v, weights_x[2], scaled_weights_x[2]);
437         vst1q_u8(dst + 32, pred_2);
438 
439         const uint8x16_t pred_3 = CalculateHorizontalWeightsAndPred(
440             left_v, top_right_v, weights_x[3], scaled_weights_x[3]);
441         vst1q_u8(dst + 48, pred_3);
442       }
443     }
444     dst += stride;
445   }
446 }
447 
Init8bpp()448 void Init8bpp() {
449   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
450   assert(dsp != nullptr);
451   // 4x4
452   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
453       Smooth4xN_NEON<4>;
454   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
455       SmoothVertical4Or8xN_NEON<4, 4>;
456   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
457       SmoothHorizontal4Or8xN_NEON<4, 4>;
458 
459   // 4x8
460   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
461       Smooth4xN_NEON<8>;
462   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
463       SmoothVertical4Or8xN_NEON<4, 8>;
464   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
465       SmoothHorizontal4Or8xN_NEON<4, 8>;
466 
467   // 4x16
468   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
469       Smooth4xN_NEON<16>;
470   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
471       SmoothVertical4Or8xN_NEON<4, 16>;
472   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
473       SmoothHorizontal4Or8xN_NEON<4, 16>;
474 
475   // 8x4
476   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
477       Smooth8xN_NEON<4>;
478   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
479       SmoothVertical4Or8xN_NEON<8, 4>;
480   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
481       SmoothHorizontal4Or8xN_NEON<8, 4>;
482 
483   // 8x8
484   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
485       Smooth8xN_NEON<8>;
486   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
487       SmoothVertical4Or8xN_NEON<8, 8>;
488   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
489       SmoothHorizontal4Or8xN_NEON<8, 8>;
490 
491   // 8x16
492   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
493       Smooth8xN_NEON<16>;
494   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
495       SmoothVertical4Or8xN_NEON<8, 16>;
496   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
497       SmoothHorizontal4Or8xN_NEON<8, 16>;
498 
499   // 8x32
500   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
501       Smooth8xN_NEON<32>;
502   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
503       SmoothVertical4Or8xN_NEON<8, 32>;
504   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
505       SmoothHorizontal4Or8xN_NEON<8, 32>;
506 
507   // 16x4
508   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
509       Smooth16PlusxN_NEON<16, 4>;
510   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
511       SmoothVertical16PlusxN_NEON<16, 4>;
512   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
513       SmoothHorizontal16PlusxN_NEON<16, 4>;
514 
515   // 16x8
516   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
517       Smooth16PlusxN_NEON<16, 8>;
518   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
519       SmoothVertical16PlusxN_NEON<16, 8>;
520   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
521       SmoothHorizontal16PlusxN_NEON<16, 8>;
522 
523   // 16x16
524   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
525       Smooth16PlusxN_NEON<16, 16>;
526   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
527       SmoothVertical16PlusxN_NEON<16, 16>;
528   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
529       SmoothHorizontal16PlusxN_NEON<16, 16>;
530 
531   // 16x32
532   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
533       Smooth16PlusxN_NEON<16, 32>;
534   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
535       SmoothVertical16PlusxN_NEON<16, 32>;
536   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
537       SmoothHorizontal16PlusxN_NEON<16, 32>;
538 
539   // 16x64
540   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
541       Smooth16PlusxN_NEON<16, 64>;
542   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
543       SmoothVertical16PlusxN_NEON<16, 64>;
544   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
545       SmoothHorizontal16PlusxN_NEON<16, 64>;
546 
547   // 32x8
548   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
549       Smooth16PlusxN_NEON<32, 8>;
550   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
551       SmoothVertical16PlusxN_NEON<32, 8>;
552   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
553       SmoothHorizontal16PlusxN_NEON<32, 8>;
554 
555   // 32x16
556   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
557       Smooth16PlusxN_NEON<32, 16>;
558   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
559       SmoothVertical16PlusxN_NEON<32, 16>;
560   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
561       SmoothHorizontal16PlusxN_NEON<32, 16>;
562 
563   // 32x32
564   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
565       Smooth16PlusxN_NEON<32, 32>;
566   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
567       SmoothVertical16PlusxN_NEON<32, 32>;
568   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
569       SmoothHorizontal16PlusxN_NEON<32, 32>;
570 
571   // 32x64
572   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
573       Smooth16PlusxN_NEON<32, 64>;
574   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
575       SmoothVertical16PlusxN_NEON<32, 64>;
576   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
577       SmoothHorizontal16PlusxN_NEON<32, 64>;
578 
579   // 64x16
580   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
581       Smooth16PlusxN_NEON<64, 16>;
582   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
583       SmoothVertical16PlusxN_NEON<64, 16>;
584   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
585       SmoothHorizontal16PlusxN_NEON<64, 16>;
586 
587   // 64x32
588   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
589       Smooth16PlusxN_NEON<64, 32>;
590   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
591       SmoothVertical16PlusxN_NEON<64, 32>;
592   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
593       SmoothHorizontal16PlusxN_NEON<64, 32>;
594 
595   // 64x64
596   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
597       Smooth16PlusxN_NEON<64, 64>;
598   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
599       SmoothVertical16PlusxN_NEON<64, 64>;
600   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
601       SmoothHorizontal16PlusxN_NEON<64, 64>;
602 }
603 
604 }  // namespace
605 }  // namespace low_bitdepth
606 
607 #if LIBGAV1_MAX_BITDEPTH >= 10
608 namespace high_bitdepth {
609 namespace {
610 
611 // Note these constants are duplicated from intrapred.cc to allow the compiler
612 // to have visibility of the values. This helps reduce loads and aids in the
613 // creation of the inverse weights.
// This copy is declared as uint16_t; the predictors in this namespace load it
// with vld1_u16 and pass individual entries as 16-bit scalars. As in the 8-bit
// table, the weights for a block dimension d start at kSmoothWeights + d - 4.
614 constexpr uint16_t kSmoothWeights[] = {
615 #include "src/dsp/smooth_weights.inc"
616 };
617 
618 // 256 - v = vneg_s8(v)
NegateS8(const uint16x4_t v)619 inline uint16x4_t NegateS8(const uint16x4_t v) {
620   return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
621 }
622 
623 template <int height>
Smooth4xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)624 void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
625                     const void* LIBGAV1_RESTRICT const top_row,
626                     const void* LIBGAV1_RESTRICT const left_column) {
627   const auto* const top = static_cast<const uint16_t*>(top_row);
628   const auto* const left = static_cast<const uint16_t*>(left_column);
629   const uint16_t top_right = top[3];
630   const uint16_t bottom_left = left[height - 1];
631   const uint16_t* const weights_y = kSmoothWeights + height - 4;
632   auto* dst = static_cast<uint8_t*>(dest);
633 
634   const uint16x4_t top_v = vld1_u16(top);
635   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
636   const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
637   const uint16x4_t scaled_weights_x = NegateS8(weights_x_v);
638   const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
639 
640   for (int y = 0; y < height; ++y) {
641     // Each variable in the running summation is named for the last item to be
642     // accumulated.
643     const uint32x4_t weighted_top =
644         vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
645     const uint32x4_t weighted_left =
646         vmlal_n_u16(weighted_top, weights_x_v, left[y]);
647     const uint32x4_t weighted_bl =
648         vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
649 
650     const uint16x4_t pred = vrshrn_n_u32(weighted_bl, kSmoothWeightScale + 1);
651     vst1_u16(reinterpret_cast<uint16_t*>(dst), pred);
652     dst += stride;
653   }
654 }
655 
656 // Common code between 8xH and [16|32|64]xH.
CalculatePred8(uint16_t * LIBGAV1_RESTRICT dst,const uint32x4_t weighted_corners_low,const uint32x4_t weighted_corners_high,const uint16x4x2_t top_vals,const uint16x4x2_t weights_x,const uint16_t left_y,const uint16_t weight_y)657 inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
658                            const uint32x4_t weighted_corners_low,
659                            const uint32x4_t weighted_corners_high,
660                            const uint16x4x2_t top_vals,
661                            const uint16x4x2_t weights_x, const uint16_t left_y,
662                            const uint16_t weight_y) {
663   // Each variable in the running summation is named for the last item to be
664   // accumulated.
665   const uint32x4_t weighted_top_low =
666       vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
667   const uint32x4_t weighted_edges_low =
668       vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
669 
670   const uint16x4_t pred_low =
671       vrshrn_n_u32(weighted_edges_low, kSmoothWeightScale + 1);
672   vst1_u16(dst, pred_low);
673 
674   const uint32x4_t weighted_top_high =
675       vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
676   const uint32x4_t weighted_edges_high =
677       vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
678 
679   const uint16x4_t pred_high =
680       vrshrn_n_u32(weighted_edges_high, kSmoothWeightScale + 1);
681   vst1_u16(dst + 4, pred_high);
682 }
683 
684 template <int height>
Smooth8xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)685 void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
686                     const void* LIBGAV1_RESTRICT const top_row,
687                     const void* LIBGAV1_RESTRICT const left_column) {
688   const auto* const top = static_cast<const uint16_t*>(top_row);
689   const auto* const left = static_cast<const uint16_t*>(left_column);
690   const uint16_t top_right = top[7];
691   const uint16_t bottom_left = left[height - 1];
692   const uint16_t* const weights_y = kSmoothWeights + height - 4;
693 
694   auto* dst = static_cast<uint8_t*>(dest);
695 
696   const uint16x4x2_t top_vals = {vld1_u16(top), vld1_u16(top + 4)};
697   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
698   const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
699                                   vld1_u16(kSmoothWeights + 8)};
700   const uint32x4_t weighted_tr_low =
701       vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
702   const uint32x4_t weighted_tr_high =
703       vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
704 
705   for (int y = 0; y < height; ++y) {
706     const uint32x4_t weighted_bl =
707         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
708     const uint32x4_t weighted_corners_low =
709         vaddq_u32(weighted_bl, weighted_tr_low);
710     const uint32x4_t weighted_corners_high =
711         vaddq_u32(weighted_bl, weighted_tr_high);
712     CalculatePred8(reinterpret_cast<uint16_t*>(dst), weighted_corners_low,
713                    weighted_corners_high, top_vals, weights_x, left[y],
714                    weights_y[y]);
715     dst += stride;
716   }
717 }
718 
719 // For width 16 and above.
720 template <int width, int height>
SmoothWxH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)721 void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
722                     const void* LIBGAV1_RESTRICT const top_row,
723                     const void* LIBGAV1_RESTRICT const left_column) {
724   const auto* const top = static_cast<const uint16_t*>(top_row);
725   const auto* const left = static_cast<const uint16_t*>(left_column);
726   const uint16_t top_right = top[width - 1];
727   const uint16_t bottom_left = left[height - 1];
728   const uint16_t* const weights_y = kSmoothWeights + height - 4;
729 
730   auto* dst = static_cast<uint8_t*>(dest);
731 
732   // Precompute weighted values that don't vary with |y|.
733   uint32x4_t weighted_tr_low[width >> 3];
734   uint32x4_t weighted_tr_high[width >> 3];
735   for (int i = 0; i < width >> 3; ++i) {
736     const int x = i << 3;
737     const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x);
738     weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right);
739     const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x);
740     weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right);
741   }
742 
743   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
744   for (int y = 0; y < height; ++y) {
745     const uint32x4_t weighted_bl =
746         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
747     auto* dst_x = reinterpret_cast<uint16_t*>(dst);
748     for (int i = 0; i < width >> 3; ++i) {
749       const int x = i << 3;
750       const uint16x4x2_t top_vals = {vld1_u16(top + x), vld1_u16(top + x + 4)};
751       const uint32x4_t weighted_corners_low =
752           vaddq_u32(weighted_bl, weighted_tr_low[i]);
753       const uint32x4_t weighted_corners_high =
754           vaddq_u32(weighted_bl, weighted_tr_high[i]);
755       // Accumulate weighted edge values and store.
756       const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + width - 4 + x),
757                                       vld1_u16(kSmoothWeights + width + x)};
758       CalculatePred8(dst_x, weighted_corners_low, weighted_corners_high,
759                      top_vals, weights_x, left[y], weights_y[y]);
760       dst_x += 8;
761     }
762     dst += stride;
763   }
764 }
765 
766 template <int height>
SmoothVertical4xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)767 void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
768                             const void* LIBGAV1_RESTRICT const top_row,
769                             const void* LIBGAV1_RESTRICT const left_column) {
770   const auto* const top = static_cast<const uint16_t*>(top_row);
771   const auto* const left = static_cast<const uint16_t*>(left_column);
772   const uint16_t bottom_left = left[height - 1];
773   const uint16_t* const weights_y = kSmoothWeights + height - 4;
774 
775   auto* dst = static_cast<uint8_t*>(dest);
776 
777   const uint16x4_t top_v = vld1_u16(top);
778   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
779 
780   for (int y = 0; y < height; ++y) {
781     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
782     const uint32x4_t weighted_bl =
783         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
784     const uint32x4_t weighted_top =
785         vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
786     vst1_u16(dst16, vrshrn_n_u32(weighted_top, kSmoothWeightScale));
787 
788     dst += stride;
789   }
790 }
791 
792 template <int height>
SmoothVertical8xH_NEON(void * LIBGAV1_RESTRICT const dest,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)793 void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest,
794                             const ptrdiff_t stride,
795                             const void* LIBGAV1_RESTRICT const top_row,
796                             const void* LIBGAV1_RESTRICT const left_column) {
797   const auto* const top = static_cast<const uint16_t*>(top_row);
798   const auto* const left = static_cast<const uint16_t*>(left_column);
799   const uint16_t bottom_left = left[height - 1];
800   const uint16_t* const weights_y = kSmoothWeights + height - 4;
801 
802   auto* dst = static_cast<uint8_t*>(dest);
803 
804   const uint16x4_t top_low = vld1_u16(top);
805   const uint16x4_t top_high = vld1_u16(top + 4);
806   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
807 
808   for (int y = 0; y < height; ++y) {
809     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
810     const uint32x4_t weighted_bl =
811         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
812 
813     const uint32x4_t weighted_top_low =
814         vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
815     vst1_u16(dst16, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
816 
817     const uint32x4_t weighted_top_high =
818         vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
819     vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
820     dst += stride;
821   }
822 }
823 
824 // For width 16 and above.
825 template <int width, int height>
SmoothVerticalWxH_NEON(void * LIBGAV1_RESTRICT const dest,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)826 void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
827                             const ptrdiff_t stride,
828                             const void* LIBGAV1_RESTRICT const top_row,
829                             const void* LIBGAV1_RESTRICT const left_column) {
830   const auto* const top = static_cast<const uint16_t*>(top_row);
831   const auto* const left = static_cast<const uint16_t*>(left_column);
832   const uint16_t bottom_left = left[height - 1];
833   const uint16_t* const weights_y = kSmoothWeights + height - 4;
834 
835   auto* dst = static_cast<uint8_t*>(dest);
836 
837   uint16x4x2_t top_vals[width >> 3];
838   for (int i = 0; i < width >> 3; ++i) {
839     const int x = i << 3;
840     top_vals[i] = {vld1_u16(top + x), vld1_u16(top + x + 4)};
841   }
842 
843   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
844   for (int y = 0; y < height; ++y) {
845     const uint32x4_t weighted_bl =
846         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
847 
848     auto* dst_x = reinterpret_cast<uint16_t*>(dst);
849     for (int i = 0; i < width >> 3; ++i) {
850       const uint32x4_t weighted_top_low =
851           vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);
852       vst1_u16(dst_x, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
853 
854       const uint32x4_t weighted_top_high =
855           vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);
856       vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
857       dst_x += 8;
858     }
859     dst += stride;
860   }
861 }
862 
863 template <int height>
SmoothHorizontal4xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)864 void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest,
865                               ptrdiff_t stride,
866                               const void* LIBGAV1_RESTRICT const top_row,
867                               const void* LIBGAV1_RESTRICT const left_column) {
868   const auto* const top = static_cast<const uint16_t*>(top_row);
869   const auto* const left = static_cast<const uint16_t*>(left_column);
870   const uint16_t top_right = top[3];
871 
872   auto* dst = static_cast<uint8_t*>(dest);
873 
874   const uint16x4_t weights_x = vld1_u16(kSmoothWeights);
875   const uint16x4_t scaled_weights_x = NegateS8(weights_x);
876 
877   const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
878   for (int y = 0; y < height; ++y) {
879     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
880     const uint32x4_t weighted_left =
881         vmlal_n_u16(weighted_tr, weights_x, left[y]);
882     vst1_u16(dst16, vrshrn_n_u32(weighted_left, kSmoothWeightScale));
883     dst += stride;
884   }
885 }
886 
887 template <int height>
SmoothHorizontal8xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)888 void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest,
889                               ptrdiff_t stride,
890                               const void* LIBGAV1_RESTRICT const top_row,
891                               const void* LIBGAV1_RESTRICT const left_column) {
892   const auto* const top = static_cast<const uint16_t*>(top_row);
893   const auto* const left = static_cast<const uint16_t*>(left_column);
894   const uint16_t top_right = top[7];
895 
896   auto* dst = static_cast<uint8_t*>(dest);
897 
898   const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
899                                   vld1_u16(kSmoothWeights + 8)};
900 
901   const uint32x4_t weighted_tr_low =
902       vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
903   const uint32x4_t weighted_tr_high =
904       vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
905 
906   for (int y = 0; y < height; ++y) {
907     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
908     const uint16_t left_y = left[y];
909     const uint32x4_t weighted_left_low =
910         vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
911     vst1_u16(dst16, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
912 
913     const uint32x4_t weighted_left_high =
914         vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
915     vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
916     dst += stride;
917   }
918 }
919 
920 // For width 16 and above.
921 template <int width, int height>
SmoothHorizontalWxH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)922 void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
923                               ptrdiff_t stride,
924                               const void* LIBGAV1_RESTRICT const top_row,
925                               const void* LIBGAV1_RESTRICT const left_column) {
926   const auto* const top = static_cast<const uint16_t*>(top_row);
927   const auto* const left = static_cast<const uint16_t*>(left_column);
928   const uint16_t top_right = top[width - 1];
929 
930   auto* dst = static_cast<uint8_t*>(dest);
931 
932   uint16x4_t weights_x_low[width >> 3];
933   uint16x4_t weights_x_high[width >> 3];
934   uint32x4_t weighted_tr_low[width >> 3];
935   uint32x4_t weighted_tr_high[width >> 3];
936   for (int i = 0; i < width >> 3; ++i) {
937     const int x = i << 3;
938     weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x);
939     weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right);
940     weights_x_high[i] = vld1_u16(kSmoothWeights + width + x);
941     weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right);
942   }
943 
944   for (int y = 0; y < height; ++y) {
945     auto* dst_x = reinterpret_cast<uint16_t*>(dst);
946     const uint16_t left_y = left[y];
947     for (int i = 0; i < width >> 3; ++i) {
948       const uint32x4_t weighted_left_low =
949           vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);
950       vst1_u16(dst_x, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
951 
952       const uint32x4_t weighted_left_high =
953           vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);
954       vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
955       dst_x += 8;
956     }
957     dst += stride;
958   }
959 }
960 
Init10bpp()961 void Init10bpp() {
962   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
963   assert(dsp != nullptr);
964   // 4x4
965   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
966       Smooth4xH_NEON<4>;
967   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
968       SmoothVertical4xH_NEON<4>;
969   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
970       SmoothHorizontal4xH_NEON<4>;
971 
972   // 4x8
973   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
974       Smooth4xH_NEON<8>;
975   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
976       SmoothVertical4xH_NEON<8>;
977   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
978       SmoothHorizontal4xH_NEON<8>;
979 
980   // 4x16
981   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
982       Smooth4xH_NEON<16>;
983   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
984       SmoothVertical4xH_NEON<16>;
985   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
986       SmoothHorizontal4xH_NEON<16>;
987 
988   // 8x4
989   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
990       Smooth8xH_NEON<4>;
991   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
992       SmoothVertical8xH_NEON<4>;
993   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
994       SmoothHorizontal8xH_NEON<4>;
995 
996   // 8x8
997   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
998       Smooth8xH_NEON<8>;
999   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
1000       SmoothVertical8xH_NEON<8>;
1001   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
1002       SmoothHorizontal8xH_NEON<8>;
1003 
1004   // 8x16
1005   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
1006       Smooth8xH_NEON<16>;
1007   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
1008       SmoothVertical8xH_NEON<16>;
1009   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
1010       SmoothHorizontal8xH_NEON<16>;
1011 
1012   // 8x32
1013   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
1014       Smooth8xH_NEON<32>;
1015   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
1016       SmoothVertical8xH_NEON<32>;
1017   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
1018       SmoothHorizontal8xH_NEON<32>;
1019 
1020   // 16x4
1021   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
1022       SmoothWxH_NEON<16, 4>;
1023   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
1024       SmoothVerticalWxH_NEON<16, 4>;
1025   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
1026       SmoothHorizontalWxH_NEON<16, 4>;
1027 
1028   // 16x8
1029   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
1030       SmoothWxH_NEON<16, 8>;
1031   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
1032       SmoothVerticalWxH_NEON<16, 8>;
1033   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
1034       SmoothHorizontalWxH_NEON<16, 8>;
1035 
1036   // 16x16
1037   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
1038       SmoothWxH_NEON<16, 16>;
1039   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
1040       SmoothVerticalWxH_NEON<16, 16>;
1041   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
1042       SmoothHorizontalWxH_NEON<16, 16>;
1043 
1044   // 16x32
1045   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
1046       SmoothWxH_NEON<16, 32>;
1047   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
1048       SmoothVerticalWxH_NEON<16, 32>;
1049   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
1050       SmoothHorizontalWxH_NEON<16, 32>;
1051 
1052   // 16x64
1053   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
1054       SmoothWxH_NEON<16, 64>;
1055   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
1056       SmoothVerticalWxH_NEON<16, 64>;
1057   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
1058       SmoothHorizontalWxH_NEON<16, 64>;
1059 
1060   // 32x8
1061   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
1062       SmoothWxH_NEON<32, 8>;
1063   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
1064       SmoothVerticalWxH_NEON<32, 8>;
1065   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
1066       SmoothHorizontalWxH_NEON<32, 8>;
1067 
1068   // 32x16
1069   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
1070       SmoothWxH_NEON<32, 16>;
1071   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
1072       SmoothVerticalWxH_NEON<32, 16>;
1073   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
1074       SmoothHorizontalWxH_NEON<32, 16>;
1075 
1076   // 32x32
1077   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
1078       SmoothWxH_NEON<32, 32>;
1079   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
1080       SmoothVerticalWxH_NEON<32, 32>;
1081   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
1082       SmoothHorizontalWxH_NEON<32, 32>;
1083 
1084   // 32x64
1085   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
1086       SmoothWxH_NEON<32, 64>;
1087   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
1088       SmoothVerticalWxH_NEON<32, 64>;
1089   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
1090       SmoothHorizontalWxH_NEON<32, 64>;
1091 
1092   // 64x16
1093   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
1094       SmoothWxH_NEON<64, 16>;
1095   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
1096       SmoothVerticalWxH_NEON<64, 16>;
1097   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
1098       SmoothHorizontalWxH_NEON<64, 16>;
1099 
1100   // 64x32
1101   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
1102       SmoothWxH_NEON<64, 32>;
1103   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
1104       SmoothVerticalWxH_NEON<64, 32>;
1105   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
1106       SmoothHorizontalWxH_NEON<64, 32>;
1107 
1108   // 64x64
1109   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
1110       SmoothWxH_NEON<64, 64>;
1111   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
1112       SmoothVerticalWxH_NEON<64, 64>;
1113   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
1114       SmoothHorizontalWxH_NEON<64, 64>;
1115 }
1116 
1117 }  // namespace
1118 }  // namespace high_bitdepth
1119 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
1120 
IntraPredSmoothInit_NEON()1121 void IntraPredSmoothInit_NEON() {
1122   low_bitdepth::Init8bpp();
1123 #if LIBGAV1_MAX_BITDEPTH >= 10
1124   high_bitdepth::Init10bpp();
1125 #endif
1126 }
1127 
1128 }  // namespace dsp
1129 }  // namespace libgav1
1130 
1131 #else   // !LIBGAV1_ENABLE_NEON
1132 namespace libgav1 {
1133 namespace dsp {
1134 
// NEON is disabled in this build, so there is nothing to register.
void IntraPredSmoothInit_NEON() {
}
1136 
1137 }  // namespace dsp
1138 }  // namespace libgav1
1139 #endif  // LIBGAV1_ENABLE_NEON
1140