1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/intrapred_smooth.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_NEON
19
20 #include <arm_neon.h>
21
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25
26 #include "src/dsp/arm/common_neon.h"
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/utils/common.h"
30 #include "src/utils/constants.h"
31
32 namespace libgav1 {
33 namespace dsp {
34 namespace low_bitdepth {
35 namespace {
36
37 // Note these constants are duplicated from intrapred.cc to allow the compiler
38 // to have visibility of the values. This helps reduce loads and in the
39 // creation of the inverse weights.
// Smooth-predictor blending weights, indexed by block dimension - 4.
constexpr uint8_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
43
// Computes 256 - v per lane, modulo 256: two's-complement byte negation,
// i.e. 256 - v = vneg_s8(v). For nonzero weights this yields 256 - v exactly,
// since the result fits in 8 bits.
inline uint8x8_t NegateS8(const uint8x8_t v) {
  return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
}
48
// Smooth predictor for 4-wide blocks: blends |top_row| toward the bottom-left
// pixel and |left_column| toward the top-right pixel, then averages the two
// directional predictions. |stride| is in bytes.
template <int height>
void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                    const void* LIBGAV1_RESTRICT const top_row,
                    const void* LIBGAV1_RESTRICT const left_column) {
  constexpr int width = 4;
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const auto* const left = static_cast<const uint8_t*>(left_column);
  const uint8_t* const weights_y = kSmoothWeights + height - 4;
  auto* dst = static_cast<uint8_t*>(dest);

  // Row-invariant values are hoisted out of the loop.
  const uint8x8_t top_v = Load4(top);
  const uint8x8_t top_right_v = vdup_n_u8(top[width - 1]);
  const uint8x8_t bottom_left_v = vdup_n_u8(left[height - 1]);
  const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4);
  const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
  // (256 - w_x) * top_right, shared by every row.
  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);

  for (int y = 0; y < height; ++y) {
    const uint8x8_t left_dup = vdup_n_u8(left[y]);
    const uint8x8_t w_y = vdup_n_u8(weights_y[y]);
    const uint8x8_t inv_w_y = NegateS8(w_y);
    // Vertical blend: w_y * top + (256 - w_y) * bottom_left.
    const uint16x8_t vert =
        vmlal_u8(vmull_u8(inv_w_y, bottom_left_v), w_y, top_v);
    // Horizontal blend: w_x * left + (256 - w_x) * top_right.
    const uint16x8_t horz = vmlal_u8(weighted_tr, weights_x_v, left_dup);
    // Each term is at most 0xFF00, so the halving add cannot overflow.
    const uint16x8_t avg = vhaddq_u16(vert, horz);
    const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale);

    StoreLo4(dst, result);
    dst += stride;
  }
}
85
// Averages the vertical and horizontal blended sums and narrows to 8 bits.
// Each input lane is at most 0xFF00, so the halving add is overflow-safe.
inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl,
                               const uint16x8_t weighted_left_tr) {
  return vrshrn_n_u16(vhaddq_u16(weighted_top_bl, weighted_left_tr),
                      kSmoothWeightScale);
}
92
// Produces 8 smooth-predicted pixels. |weighted_tr| already holds the
// row-invariant (256 - w_x) * top_right term.
inline uint8x8_t CalculateWeightsAndPred(
    const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
    const uint8x8_t bottom_left, const uint8x8_t weights_x,
    const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
  // Vertical blend: w_y * top + (256 - w_y) * bottom_left.
  const uint16x8_t vert =
      vmlal_u8(vmull_u8(weights_y, top), scaled_weights_y, bottom_left);
  // Horizontal blend: w_x * left + (256 - w_x) * top_right.
  const uint16x8_t horz = vmlal_u8(weighted_tr, weights_x, left);
  return CalculatePred(vert, horz);
}
103
// Smooth predictor for 8-wide blocks. |stride| is in bytes.
template <int height>
void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                    const void* LIBGAV1_RESTRICT const top_row,
                    const void* LIBGAV1_RESTRICT const left_column) {
  constexpr int width = 8;
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const auto* const left = static_cast<const uint8_t*>(left_column);
  const uint8_t* const weights_y = kSmoothWeights + height - 4;
  auto* dst = static_cast<uint8_t*>(dest);

  // Row-invariant values.
  const uint8x8_t top_v = vld1_u8(top);
  const uint8x8_t top_right_v = vdup_n_u8(top[width - 1]);
  const uint8x8_t bottom_left_v = vdup_n_u8(left[height - 1]);
  const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
  // (256 - w_x) * top_right, shared by every row.
  const uint16x8_t weighted_tr = vmull_u8(NegateS8(weights_x_v), top_right_v);

  for (int y = 0; y < height; ++y) {
    const uint8x8_t left_dup = vdup_n_u8(left[y]);
    const uint8x8_t w_y = vdup_n_u8(weights_y[y]);
    vst1_u8(dst, CalculateWeightsAndPred(top_v, left_dup, weighted_tr,
                                         bottom_left_v, weights_x_v,
                                         NegateS8(w_y), w_y));
    dst += stride;
  }
}
135
// Produces 16 smooth-predicted pixels, processed as low/high 8-pixel halves.
// |weighted_bl| already holds the row-invariant (256 - w_y) * bottom_left
// term shared by both halves.
inline uint8x16_t CalculateWeightsAndPred(
    const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
    const uint8x8_t weights_y, const uint8x16_t weights_x,
    const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
  // Low 8 pixels.
  const uint16x8_t vert_lo = vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
  const uint16x8_t horz_lo =
      vmlal_u8(vmull_u8(vget_low_u8(weights_x), left),
               vget_low_u8(scaled_weights_x), top_right);
  const uint8x8_t pred_lo = CalculatePred(vert_lo, horz_lo);

  // High 8 pixels.
  const uint16x8_t vert_hi =
      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
  const uint16x8_t horz_hi =
      vmlal_u8(vmull_u8(vget_high_u8(weights_x), left),
               vget_high_u8(scaled_weights_x), top_right);
  const uint8x8_t pred_hi = CalculatePred(vert_hi, horz_hi);

  return vcombine_u8(pred_lo, pred_hi);
}
158
// Computes 256 - v per lane, modulo 256: two's-complement byte negation,
// i.e. 256 - v = vneg_s8(v). For nonzero weights this yields 256 - v exactly.
inline uint8x16_t NegateS8(const uint8x16_t v) {
  return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
}
163
// Smooth predictor for 16-, 32- and 64-wide blocks. Each row is processed as
// |width| / 16 vectors of 16 pixels. |stride| is in bytes.
template <int width, int height>
void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                         const void* LIBGAV1_RESTRICT const top_row,
                         const void* LIBGAV1_RESTRICT const left_column) {
  // width is 16, 32 or 64.
  constexpr int num_vecs = width >> 4;
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const auto* const left = static_cast<const uint8_t*>(left_column);
  const uint8_t* const weights_y = kSmoothWeights + height - 4;
  auto* dst = static_cast<uint8_t*>(dest);

  uint8x16_t top_v[num_vecs];
  for (int i = 0; i < num_vecs; ++i) {
    top_v[i] = vld1q_u8(top + 16 * i);
  }

  const uint8x8_t top_right_v = vdup_n_u8(top[width - 1]);
  const uint8x8_t bottom_left_v = vdup_n_u8(left[height - 1]);

  // The x weights and their complements are row-invariant.
  uint8x16_t weights_x_v[num_vecs];
  uint8x16_t scaled_weights_x[num_vecs];
  for (int i = 0; i < num_vecs; ++i) {
    weights_x_v[i] = vld1q_u8(kSmoothWeights + width - 4 + 16 * i);
    scaled_weights_x[i] = NegateS8(weights_x_v[i]);
  }

  for (int y = 0; y < height; ++y) {
    const uint8x8_t left_dup = vdup_n_u8(left[y]);
    const uint8x8_t w_y = vdup_n_u8(weights_y[y]);
    // (256 - w_y) * bottom_left is shared by every vector in the row.
    const uint16x8_t weighted_bl = vmull_u8(NegateS8(w_y), bottom_left_v);

    for (int i = 0; i < num_vecs; ++i) {
      vst1q_u8(dst + 16 * i,
               CalculateWeightsAndPred(top_v[i], left_dup, top_right_v, w_y,
                                       weights_x_v[i], scaled_weights_x[i],
                                       weighted_bl));
    }

    dst += stride;
  }
}
237
// Smooth-vertical predictor for 4- and 8-wide blocks: every row is a blend of
// the top row and the bottom-left pixel only. |stride| is in bytes.
template <int width, int height>
void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
                               ptrdiff_t stride,
                               const void* LIBGAV1_RESTRICT const top_row,
                               const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const auto* const left = static_cast<const uint8_t*>(left_column);
  const uint8_t* const weights_y = kSmoothWeights + height - 4;
  auto* dst = static_cast<uint8_t*>(dest);

  // width is 4 or 8.
  const uint8x8_t top_v = (width == 4) ? Load4(top) : vld1_u8(top);
  const uint8x8_t bottom_left_v = vdup_n_u8(left[height - 1]);

  for (int y = 0; y < height; ++y) {
    const uint8x8_t w_y = vdup_n_u8(weights_y[y]);
    // pred = (w_y * top + (256 - w_y) * bottom_left + rounding) >> 8.
    const uint16x8_t blended =
        vmlal_u8(vmull_u8(w_y, top_v), NegateS8(w_y), bottom_left_v);
    const uint8x8_t pred = vrshrn_n_u16(blended, kSmoothWeightScale);

    if (width == 4) {
      StoreLo4(dst, pred);
    } else {  // width == 8
      vst1_u8(dst, pred);
    }
    dst += stride;
  }
}
275
// Vertical-only blend of one 16-pixel vector. |weighted_bl| already holds the
// (256 - w_y) * bottom_left term shared by both halves.
inline uint8x16_t CalculateVerticalWeightsAndPred(
    const uint8x16_t top, const uint8x8_t weights_y,
    const uint16x8_t weighted_bl) {
  const uint16x8_t sum_lo = vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
  const uint16x8_t sum_hi = vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
  return vcombine_u8(vrshrn_n_u16(sum_lo, kSmoothWeightScale),
                     vrshrn_n_u16(sum_hi, kSmoothWeightScale));
}
288
// Smooth-vertical predictor for 16-, 32- and 64-wide blocks. |stride| is in
// bytes.
template <int width, int height>
void SmoothVertical16PlusxN_NEON(
    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  // width is 16, 32 or 64.
  constexpr int num_vecs = width >> 4;
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const auto* const left = static_cast<const uint8_t*>(left_column);
  const uint8_t* const weights_y = kSmoothWeights + height - 4;
  auto* dst = static_cast<uint8_t*>(dest);

  uint8x16_t top_v[num_vecs];
  for (int i = 0; i < num_vecs; ++i) {
    top_v[i] = vld1q_u8(top + 16 * i);
  }

  const uint8x8_t bottom_left_v = vdup_n_u8(left[height - 1]);

  for (int y = 0; y < height; ++y) {
    const uint8x8_t w_y = vdup_n_u8(weights_y[y]);
    // (256 - w_y) * bottom_left is shared by every vector in the row.
    const uint16x8_t weighted_bl = vmull_u8(NegateS8(w_y), bottom_left_v);

    for (int i = 0; i < num_vecs; ++i) {
      vst1q_u8(dst + 16 * i,
               CalculateVerticalWeightsAndPred(top_v[i], w_y, weighted_bl));
    }

    dst += stride;
  }
}
340
// Smooth-horizontal predictor for 4- and 8-wide blocks: every row is a blend
// of the row's left pixel and the top-right pixel only. |stride| is in bytes.
template <int width, int height>
void SmoothHorizontal4Or8xN_NEON(
    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const auto* const left = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);

  const uint8x8_t top_right_v = vdup_n_u8(top[width - 1]);
  // The 4xN case over-reads 4 bytes, but stays within kSmoothWeights.
  const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
  // (256 - w_x) * top_right is row-invariant.
  const uint16x8_t weighted_tr = vmull_u8(NegateS8(weights_x), top_right_v);

  for (int y = 0; y < height; ++y) {
    // pred = (w_x * left + (256 - w_x) * top_right + rounding) >> 8.
    const uint16x8_t blended =
        vmlal_u8(weighted_tr, weights_x, vdup_n_u8(left[y]));
    const uint8x8_t pred = vrshrn_n_u16(blended, kSmoothWeightScale);

    if (width == 4) {
      StoreLo4(dst, pred);
    } else {  // width == 8
      vst1_u8(dst, pred);
    }
    dst += stride;
  }
}
371
// Horizontal-only blend of one 16-pixel vector:
// pred = (w_x * left + (256 - w_x) * top_right + rounding) >> 8.
inline uint8x16_t CalculateHorizontalWeightsAndPred(
    const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
    const uint8x16_t scaled_weights_x) {
  const uint16x8_t sum_lo =
      vmlal_u8(vmull_u8(vget_low_u8(weights_x), left),
               vget_low_u8(scaled_weights_x), top_right);
  const uint16x8_t sum_hi =
      vmlal_u8(vmull_u8(vget_high_u8(weights_x), left),
               vget_high_u8(scaled_weights_x), top_right);
  return vcombine_u8(vrshrn_n_u16(sum_lo, kSmoothWeightScale),
                     vrshrn_n_u16(sum_hi, kSmoothWeightScale));
}
389
// Smooth-horizontal predictor for 16-, 32- and 64-wide blocks. |stride| is in
// bytes.
template <int width, int height>
void SmoothHorizontal16PlusxN_NEON(
    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  // width is 16, 32 or 64.
  constexpr int num_vecs = width >> 4;
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const auto* const left = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);

  const uint8x8_t top_right_v = vdup_n_u8(top[width - 1]);

  // The x weights and their complements are row-invariant.
  uint8x16_t weights_x[num_vecs];
  uint8x16_t scaled_weights_x[num_vecs];
  for (int i = 0; i < num_vecs; ++i) {
    weights_x[i] = vld1q_u8(kSmoothWeights + width - 4 + 16 * i);
    scaled_weights_x[i] = NegateS8(weights_x[i]);
  }

  for (int y = 0; y < height; ++y) {
    const uint8x8_t left_dup = vdup_n_u8(left[y]);

    for (int i = 0; i < num_vecs; ++i) {
      vst1q_u8(dst + 16 * i,
               CalculateHorizontalWeightsAndPred(left_dup, top_right_v,
                                                 weights_x[i],
                                                 scaled_weights_x[i]));
    }
    dst += stride;
  }
}
447
Init8bpp()448 void Init8bpp() {
449 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
450 assert(dsp != nullptr);
451 // 4x4
452 dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
453 Smooth4xN_NEON<4>;
454 dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
455 SmoothVertical4Or8xN_NEON<4, 4>;
456 dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
457 SmoothHorizontal4Or8xN_NEON<4, 4>;
458
459 // 4x8
460 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
461 Smooth4xN_NEON<8>;
462 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
463 SmoothVertical4Or8xN_NEON<4, 8>;
464 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
465 SmoothHorizontal4Or8xN_NEON<4, 8>;
466
467 // 4x16
468 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
469 Smooth4xN_NEON<16>;
470 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
471 SmoothVertical4Or8xN_NEON<4, 16>;
472 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
473 SmoothHorizontal4Or8xN_NEON<4, 16>;
474
475 // 8x4
476 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
477 Smooth8xN_NEON<4>;
478 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
479 SmoothVertical4Or8xN_NEON<8, 4>;
480 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
481 SmoothHorizontal4Or8xN_NEON<8, 4>;
482
483 // 8x8
484 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
485 Smooth8xN_NEON<8>;
486 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
487 SmoothVertical4Or8xN_NEON<8, 8>;
488 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
489 SmoothHorizontal4Or8xN_NEON<8, 8>;
490
491 // 8x16
492 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
493 Smooth8xN_NEON<16>;
494 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
495 SmoothVertical4Or8xN_NEON<8, 16>;
496 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
497 SmoothHorizontal4Or8xN_NEON<8, 16>;
498
499 // 8x32
500 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
501 Smooth8xN_NEON<32>;
502 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
503 SmoothVertical4Or8xN_NEON<8, 32>;
504 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
505 SmoothHorizontal4Or8xN_NEON<8, 32>;
506
507 // 16x4
508 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
509 Smooth16PlusxN_NEON<16, 4>;
510 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
511 SmoothVertical16PlusxN_NEON<16, 4>;
512 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
513 SmoothHorizontal16PlusxN_NEON<16, 4>;
514
515 // 16x8
516 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
517 Smooth16PlusxN_NEON<16, 8>;
518 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
519 SmoothVertical16PlusxN_NEON<16, 8>;
520 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
521 SmoothHorizontal16PlusxN_NEON<16, 8>;
522
523 // 16x16
524 dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
525 Smooth16PlusxN_NEON<16, 16>;
526 dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
527 SmoothVertical16PlusxN_NEON<16, 16>;
528 dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
529 SmoothHorizontal16PlusxN_NEON<16, 16>;
530
531 // 16x32
532 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
533 Smooth16PlusxN_NEON<16, 32>;
534 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
535 SmoothVertical16PlusxN_NEON<16, 32>;
536 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
537 SmoothHorizontal16PlusxN_NEON<16, 32>;
538
539 // 16x64
540 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
541 Smooth16PlusxN_NEON<16, 64>;
542 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
543 SmoothVertical16PlusxN_NEON<16, 64>;
544 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
545 SmoothHorizontal16PlusxN_NEON<16, 64>;
546
547 // 32x8
548 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
549 Smooth16PlusxN_NEON<32, 8>;
550 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
551 SmoothVertical16PlusxN_NEON<32, 8>;
552 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
553 SmoothHorizontal16PlusxN_NEON<32, 8>;
554
555 // 32x16
556 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
557 Smooth16PlusxN_NEON<32, 16>;
558 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
559 SmoothVertical16PlusxN_NEON<32, 16>;
560 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
561 SmoothHorizontal16PlusxN_NEON<32, 16>;
562
563 // 32x32
564 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
565 Smooth16PlusxN_NEON<32, 32>;
566 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
567 SmoothVertical16PlusxN_NEON<32, 32>;
568 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
569 SmoothHorizontal16PlusxN_NEON<32, 32>;
570
571 // 32x64
572 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
573 Smooth16PlusxN_NEON<32, 64>;
574 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
575 SmoothVertical16PlusxN_NEON<32, 64>;
576 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
577 SmoothHorizontal16PlusxN_NEON<32, 64>;
578
579 // 64x16
580 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
581 Smooth16PlusxN_NEON<64, 16>;
582 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
583 SmoothVertical16PlusxN_NEON<64, 16>;
584 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
585 SmoothHorizontal16PlusxN_NEON<64, 16>;
586
587 // 64x32
588 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
589 Smooth16PlusxN_NEON<64, 32>;
590 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
591 SmoothVertical16PlusxN_NEON<64, 32>;
592 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
593 SmoothHorizontal16PlusxN_NEON<64, 32>;
594
595 // 64x64
596 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
597 Smooth16PlusxN_NEON<64, 64>;
598 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
599 SmoothVertical16PlusxN_NEON<64, 64>;
600 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
601 SmoothHorizontal16PlusxN_NEON<64, 64>;
602 }
603
604 } // namespace
605 } // namespace low_bitdepth
606
607 #if LIBGAV1_MAX_BITDEPTH >= 10
608 namespace high_bitdepth {
609 namespace {
610
611 // Note these constants are duplicated from intrapred.cc to allow the compiler
612 // to have visibility of the values. This helps reduce loads and in the
613 // creation of the inverse weights.
// Smooth-predictor blending weights, indexed by block dimension - 4. Widened
// to uint16_t for the high-bitdepth (10-bit) NEON paths.
constexpr uint16_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
617
// Computes 256 - v on 16-bit lanes that hold 8-bit weight values (high byte
// zero): negating the vector as bytes leaves the zero high byte untouched and
// puts 256 - v in the low byte, i.e. 256 - v = vneg_s8(v).
inline uint16x4_t NegateS8(const uint16x4_t v) {
  return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
}
622
// High-bitdepth smooth predictor for 4-wide blocks. |dest| holds uint16_t
// pixels; |stride| is in bytes.
template <int height>
void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                    const void* LIBGAV1_RESTRICT const top_row,
                    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint16_t*>(top_row);
  const auto* const left = static_cast<const uint16_t*>(left_column);
  const uint16_t top_right = top[3];
  const uint16_t* const weights_y = kSmoothWeights + height - 4;
  auto* dst = static_cast<uint8_t*>(dest);

  // Row-invariant values.
  const uint16x4_t top_v = vld1_u16(top);
  const uint16x4_t bottom_left_v = vdup_n_u16(left[height - 1]);
  const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
  // (256 - w_x) * top_right, shared by every row.
  const uint32x4_t weighted_tr =
      vmull_n_u16(NegateS8(weights_x_v), top_right);

  for (int y = 0; y < height; ++y) {
    // Running sum; each variable is named for the last accumulated term.
    const uint32x4_t weighted_top =
        vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
    const uint32x4_t weighted_left =
        vmlal_n_u16(weighted_top, weights_x_v, left[y]);
    const uint32x4_t weighted_bl =
        vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);

    // Both weight pairs sum to 256, so the total scale is 2 * 256 — hence the
    // extra shift by 1.
    vst1_u16(reinterpret_cast<uint16_t*>(dst),
             vrshrn_n_u32(weighted_bl, kSmoothWeightScale + 1));
    dst += stride;
  }
}
655
// Common code between 8xH and [16|32|64]xH: accumulates the row-dependent top
// and left terms onto the precomputed corner sums and stores 8 pixels at
// |dst|.
inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
                           const uint32x4_t weighted_corners_low,
                           const uint32x4_t weighted_corners_high,
                           const uint16x4x2_t top_vals,
                           const uint16x4x2_t weights_x, const uint16_t left_y,
                           const uint16_t weight_y) {
  // Low 4 pixels. Each variable is named for the last accumulated term.
  const uint32x4_t weighted_top_low =
      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
  const uint32x4_t weighted_edges_low =
      vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
  // The total weight scale is 2 * 256, hence the extra shift by 1.
  vst1_u16(dst, vrshrn_n_u32(weighted_edges_low, kSmoothWeightScale + 1));

  // High 4 pixels.
  const uint32x4_t weighted_top_high =
      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
  const uint32x4_t weighted_edges_high =
      vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
  vst1_u16(dst + 4,
           vrshrn_n_u32(weighted_edges_high, kSmoothWeightScale + 1));
}
683
// High-bitdepth smooth predictor for 8-wide blocks. |stride| is in bytes.
template <int height>
void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                    const void* LIBGAV1_RESTRICT const top_row,
                    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint16_t*>(top_row);
  const auto* const left = static_cast<const uint16_t*>(left_column);
  const uint16_t top_right = top[7];
  const uint16_t* const weights_y = kSmoothWeights + height - 4;

  auto* dst = static_cast<uint8_t*>(dest);

  // Row-invariant values.
  const uint16x4x2_t top_vals = {vld1_u16(top), vld1_u16(top + 4)};
  const uint16x4_t bottom_left_v = vdup_n_u16(left[height - 1]);
  // The width-8 x weights start at kSmoothWeights + 8 - 4.
  const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
                                  vld1_u16(kSmoothWeights + 8)};
  // (256 - w_x) * top_right, shared by every row.
  const uint32x4_t weighted_tr_low =
      vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
  const uint32x4_t weighted_tr_high =
      vmull_n_u16(NegateS8(weights_x.val[1]), top_right);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
    CalculatePred8(reinterpret_cast<uint16_t*>(dst),
                   vaddq_u32(weighted_bl, weighted_tr_low),
                   vaddq_u32(weighted_bl, weighted_tr_high), top_vals,
                   weights_x, left[y], weights_y[y]);
    dst += stride;
  }
}
718
// High-bitdepth smooth predictor for width 16 and above. Each row is
// processed as |width| / 8 chunks of 8 pixels. |stride| is in bytes.
template <int width, int height>
void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                    const void* LIBGAV1_RESTRICT const top_row,
                    const void* LIBGAV1_RESTRICT const left_column) {
  constexpr int num_chunks = width >> 3;
  const auto* const top = static_cast<const uint16_t*>(top_row);
  const auto* const left = static_cast<const uint16_t*>(left_column);
  const uint16_t top_right = top[width - 1];
  const uint16_t* const weights_y = kSmoothWeights + height - 4;

  auto* dst = static_cast<uint8_t*>(dest);

  // (256 - w_x) * top_right does not vary with |y|; precompute per chunk.
  uint32x4_t weighted_tr_low[num_chunks];
  uint32x4_t weighted_tr_high[num_chunks];
  for (int i = 0; i < num_chunks; ++i) {
    const int x = i << 3;
    weighted_tr_low[i] = vmull_n_u16(
        NegateS8(vld1_u16(kSmoothWeights + width - 4 + x)), top_right);
    weighted_tr_high[i] =
        vmull_n_u16(NegateS8(vld1_u16(kSmoothWeights + width + x)), top_right);
  }

  const uint16x4_t bottom_left_v = vdup_n_u16(left[height - 1]);
  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
    auto* dst_x = reinterpret_cast<uint16_t*>(dst);
    for (int i = 0; i < num_chunks; ++i) {
      const int x = i << 3;
      const uint16x4x2_t top_vals = {vld1_u16(top + x), vld1_u16(top + x + 4)};
      // Accumulate weighted edge values and store.
      const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + width - 4 + x),
                                      vld1_u16(kSmoothWeights + width + x)};
      CalculatePred8(dst_x, vaddq_u32(weighted_bl, weighted_tr_low[i]),
                     vaddq_u32(weighted_bl, weighted_tr_high[i]), top_vals,
                     weights_x, left[y], weights_y[y]);
      dst_x += 8;
    }
    dst += stride;
  }
}
765
// Smooth-vertical predictor, width 4: each output row is a rounded blend of
// the top row and the bottom-left sample, weighted by the row's y weight.
template <int height>
void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                            const void* LIBGAV1_RESTRICT const top_row,
                            const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const above = static_cast<const uint16_t*>(top_row);
  const auto* const left = static_cast<const uint16_t*>(left_column);
  // The bottom-left sample anchors the vertical interpolation.
  const uint16x4_t bl_vec = vdup_n_u16(left[height - 1]);
  const uint16x4_t top_vec = vld1_u16(above);
  // Weight table for this block height starts at offset height - 4.
  const uint16_t* const y_weights = kSmoothWeights + height - 4;

  auto* row = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    const uint16_t w = y_weights[y];
    // pred = round((w * top + (256 - w) * bottom_left) >> kSmoothWeightScale)
    const uint32x4_t acc =
        vmlal_n_u16(vmull_n_u16(bl_vec, 256 - w), top_vec, w);
    vst1_u16(reinterpret_cast<uint16_t*>(row),
             vrshrn_n_u32(acc, kSmoothWeightScale));
    row += stride;
  }
}
791
// Smooth-vertical predictor, width 8: blends the (split) top row against the
// bottom-left sample per output row.
template <int height>
void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest,
                            const ptrdiff_t stride,
                            const void* LIBGAV1_RESTRICT const top_row,
                            const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const above = static_cast<const uint16_t*>(top_row);
  const auto* const left = static_cast<const uint16_t*>(left_column);
  const uint16x4_t bl_vec = vdup_n_u16(left[height - 1]);
  // The 8-wide top row is processed as two 4-lane halves.
  const uint16x4_t top_lo = vld1_u16(above);
  const uint16x4_t top_hi = vld1_u16(above + 4);
  const uint16_t* const y_weights = kSmoothWeights + height - 4;

  auto* row = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    const uint16_t w = y_weights[y];
    // Shared bottom-left contribution for the whole row.
    const uint32x4_t base = vmull_n_u16(bl_vec, 256 - w);
    auto* out = reinterpret_cast<uint16_t*>(row);
    vst1_u16(out,
             vrshrn_n_u32(vmlal_n_u16(base, top_lo, w), kSmoothWeightScale));
    vst1_u16(out + 4,
             vrshrn_n_u32(vmlal_n_u16(base, top_hi, w), kSmoothWeightScale));
    row += stride;
  }
}
823
// For width 16 and above.
// Smooth-vertical predictor: the top row is loaded once up front, then each
// output row blends it with the bottom-left sample using the y weight.
template <int width, int height>
void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
                            const ptrdiff_t stride,
                            const void* LIBGAV1_RESTRICT const top_row,
                            const void* LIBGAV1_RESTRICT const left_column) {
  constexpr int kVecsPerRow = width >> 3;  // 8 pixels handled per iteration.
  const auto* const above = static_cast<const uint16_t*>(top_row);
  const auto* const left = static_cast<const uint16_t*>(left_column);
  const uint16_t* const y_weights = kSmoothWeights + height - 4;
  const uint16x4_t bl_vec = vdup_n_u16(left[height - 1]);

  // Preload the top row; it is invariant across output rows.
  uint16x4x2_t top[kVecsPerRow];
  for (int i = 0; i < kVecsPerRow; ++i) {
    const int x = i << 3;
    top[i] = {vld1_u16(above + x), vld1_u16(above + x + 4)};
  }

  auto* row = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    const uint16_t w = y_weights[y];
    const uint32x4_t base = vmull_n_u16(bl_vec, 256 - w);
    auto* out = reinterpret_cast<uint16_t*>(row);
    for (int i = 0; i < kVecsPerRow; ++i) {
      vst1_u16(out, vrshrn_n_u32(vmlal_n_u16(base, top[i].val[0], w),
                                 kSmoothWeightScale));
      vst1_u16(out + 4, vrshrn_n_u32(vmlal_n_u16(base, top[i].val[1], w),
                                     kSmoothWeightScale));
      out += 8;
    }
    row += stride;
  }
}
862
// Smooth-horizontal predictor, width 4: each output pixel blends the row's
// left sample with the top-right sample, weighted by the x weight.
template <int height>
void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest,
                              ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const top_row,
                              const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const above = static_cast<const uint16_t*>(top_row);
  const auto* const left = static_cast<const uint16_t*>(left_column);

  // x weights for width 4 are at the start of the packed table.
  const uint16x4_t x_weights = vld1_u16(kSmoothWeights);
  // (256 - weights_x) * top_right does not vary with |y|.
  const uint32x4_t tr_term = vmull_n_u16(NegateS8(x_weights), above[3]);

  auto* row = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    const uint32x4_t acc = vmlal_n_u16(tr_term, x_weights, left[y]);
    vst1_u16(reinterpret_cast<uint16_t*>(row),
             vrshrn_n_u32(acc, kSmoothWeightScale));
    row += stride;
  }
}
886
// Smooth-horizontal predictor, width 8: as the 4-wide version, processed as
// two 4-lane halves per row.
template <int height>
void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest,
                              ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const top_row,
                              const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const above = static_cast<const uint16_t*>(top_row);
  const auto* const left = static_cast<const uint16_t*>(left_column);
  const uint16_t tr = above[7];

  // x weights for width 8 live at offset 4 in the packed table.
  const uint16x4_t wx_lo = vld1_u16(kSmoothWeights + 4);
  const uint16x4_t wx_hi = vld1_u16(kSmoothWeights + 8);
  // Top-right contribution is invariant across rows.
  const uint32x4_t tr_lo = vmull_n_u16(NegateS8(wx_lo), tr);
  const uint32x4_t tr_hi = vmull_n_u16(NegateS8(wx_hi), tr);

  auto* row = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    const uint16_t l = left[y];
    auto* out = reinterpret_cast<uint16_t*>(row);
    vst1_u16(out,
             vrshrn_n_u32(vmlal_n_u16(tr_lo, wx_lo, l), kSmoothWeightScale));
    vst1_u16(out + 4,
             vrshrn_n_u32(vmlal_n_u16(tr_hi, wx_hi, l), kSmoothWeightScale));
    row += stride;
  }
}
919
// For width 16 and above.
// Smooth-horizontal predictor: x-weight loads and the top-right contribution
// are hoisted out of the y loop since they are row-invariant.
template <int width, int height>
void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
                              ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const top_row,
                              const void* LIBGAV1_RESTRICT const left_column) {
  constexpr int kVecsPerRow = width >> 3;  // 8 pixels handled per iteration.
  const auto* const above = static_cast<const uint16_t*>(top_row);
  const auto* const left = static_cast<const uint16_t*>(left_column);
  const uint16_t tr = above[width - 1];

  uint16x4_t wx_lo[kVecsPerRow];
  uint16x4_t wx_hi[kVecsPerRow];
  uint32x4_t tr_lo[kVecsPerRow];
  uint32x4_t tr_hi[kVecsPerRow];
  for (int i = 0; i < kVecsPerRow; ++i) {
    const int x = i << 3;
    wx_lo[i] = vld1_u16(kSmoothWeights + width - 4 + x);
    wx_hi[i] = vld1_u16(kSmoothWeights + width + x);
    tr_lo[i] = vmull_n_u16(NegateS8(wx_lo[i]), tr);
    tr_hi[i] = vmull_n_u16(NegateS8(wx_hi[i]), tr);
  }

  auto* row = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    const uint16_t l = left[y];
    auto* out = reinterpret_cast<uint16_t*>(row);
    for (int i = 0; i < kVecsPerRow; ++i) {
      vst1_u16(out, vrshrn_n_u32(vmlal_n_u16(tr_lo[i], wx_lo[i], l),
                                 kSmoothWeightScale));
      vst1_u16(out + 4, vrshrn_n_u32(vmlal_n_u16(tr_hi[i], wx_hi[i], l),
                                     kSmoothWeightScale));
      out += 8;
    }
    row += stride;
  }
}
960
// Registers the 10bpp NEON smooth/smooth-vertical/smooth-horizontal intra
// predictors in the writable dsp table, one trio per transform size.
// Width 4 and 8 use the dedicated 4xH/8xH kernels; 16 and above use the
// generic WxH kernels.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);
  // 4x4
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
      Smooth4xH_NEON<4>;
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
      SmoothVertical4xH_NEON<4>;
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal4xH_NEON<4>;

  // 4x8
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
      Smooth4xH_NEON<8>;
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
      SmoothVertical4xH_NEON<8>;
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal4xH_NEON<8>;

  // 4x16
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
      Smooth4xH_NEON<16>;
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
      SmoothVertical4xH_NEON<16>;
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal4xH_NEON<16>;

  // 8x4
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
      Smooth8xH_NEON<4>;
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
      SmoothVertical8xH_NEON<4>;
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8xH_NEON<4>;

  // 8x8
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
      Smooth8xH_NEON<8>;
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
      SmoothVertical8xH_NEON<8>;
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8xH_NEON<8>;

  // 8x16
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
      Smooth8xH_NEON<16>;
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
      SmoothVertical8xH_NEON<16>;
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8xH_NEON<16>;

  // 8x32
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
      Smooth8xH_NEON<32>;
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
      SmoothVertical8xH_NEON<32>;
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8xH_NEON<32>;

  // 16x4
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
      SmoothWxH_NEON<16, 4>;
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<16, 4>;
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<16, 4>;

  // 16x8
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
      SmoothWxH_NEON<16, 8>;
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<16, 8>;
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<16, 8>;

  // 16x16
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
      SmoothWxH_NEON<16, 16>;
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<16, 16>;
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<16, 16>;

  // 16x32
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
      SmoothWxH_NEON<16, 32>;
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<16, 32>;
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<16, 32>;

  // 16x64
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
      SmoothWxH_NEON<16, 64>;
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<16, 64>;
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<16, 64>;

  // 32x8
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
      SmoothWxH_NEON<32, 8>;
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<32, 8>;
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<32, 8>;

  // 32x16
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
      SmoothWxH_NEON<32, 16>;
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<32, 16>;
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<32, 16>;

  // 32x32
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
      SmoothWxH_NEON<32, 32>;
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<32, 32>;
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<32, 32>;

  // 32x64
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
      SmoothWxH_NEON<32, 64>;
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<32, 64>;
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<32, 64>;

  // 64x16
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
      SmoothWxH_NEON<64, 16>;
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<64, 16>;
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<64, 16>;

  // 64x32
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
      SmoothWxH_NEON<64, 32>;
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<64, 32>;
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<64, 32>;

  // 64x64
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
      SmoothWxH_NEON<64, 64>;
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
      SmoothVerticalWxH_NEON<64, 64>;
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontalWxH_NEON<64, 64>;
}
1116
1117 } // namespace
1118 } // namespace high_bitdepth
1119 #endif // LIBGAV1_MAX_BITDEPTH >= 10
1120
// Public entry point: installs the NEON smooth intra predictors for every
// bitdepth compiled into this build.
void IntraPredSmoothInit_NEON() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif
}
1127
1128 } // namespace dsp
1129 } // namespace libgav1
1130
1131 #else // !LIBGAV1_ENABLE_NEON
1132 namespace libgav1 {
1133 namespace dsp {
1134
// NEON is disabled for this build; keep an empty initializer so callers can
// invoke it unconditionally.
void IntraPredSmoothInit_NEON() {}
1136
1137 } // namespace dsp
1138 } // namespace libgav1
1139 #endif // LIBGAV1_ENABLE_NEON
1140