/*
 *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"

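// Average of a 4x4 block of high bit-depth pixels, rounded to nearest:
// (sum + 8) >> 4. load_unaligned_u16q() gathers two rows of four pixels per
// vector, so each lane of the vaddq_u16() result holds the sum of only two
// samples and cannot overflow 16 bits.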
uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) {
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
  const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p);
  const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p);
  return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4;
}

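// Average of an 8x8 block, rounded to nearest: (sum + 32) >> 6. Each lane of
// the uint16x8_t accumulator sums one column of eight pixels; even at 12-bit
// depth (samples <= 4095) a column sums to at most 32760, so the 16-bit lanes
// cannot overflow.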
uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) {
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
  uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;

  load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  sum = vaddq_u16(a0, a1);
  sum = vaddq_u16(sum, a2);
  sum = vaddq_u16(sum, a3);
  sum = vaddq_u16(sum, a4);
  sum = vaddq_u16(sum, a5);
  sum = vaddq_u16(sum, a6);
  sum = vaddq_u16(sum, a7);

  return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
}

// coeff: 32 bits, dynamic range [-2147483648, 2147483647].
// length: value range {16, 64, 256, 1024}.
// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024].
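// The 42-bit worst case exceeds 32 bits, so the absolute values are
// accumulated into 64-bit lanes via vpadalq_s32() before the final reduction.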
int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) {
  int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };

  do {
    int32x4_t abs0, abs1;
    const int32x4_t s0 = load_tran_low_to_s32q(coeff);
    const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4);

    abs0 = vabsq_s32(s0);
    sum_s64[0] = vpadalq_s32(sum_s64[0], abs0);
    abs1 = vabsq_s32(s1);
    sum_s64[1] = vpadalq_s32(sum_s64[1], abs1);

    length -= 8;
    coeff += 8;
  } while (length != 0);

  return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1]));
}

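// Computes the minimum and maximum of the per-pixel absolute differences
// between two 8x8 blocks of high bit-depth pixels.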
void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
                                int dp, int *min, int *max) {
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);

  const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
  const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
  const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
  const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
  const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
  const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
  const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
  const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);

  const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
  const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
  const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
  const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
  const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
  const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
  const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
  const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);

  const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
  const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
  const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
  const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
  const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
  const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
  const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
  const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);

  const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
  const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
  const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
  const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);

  const uint16x8_t max0123 = vmaxq_u16(max01, max23);
  const uint16x8_t max4567 = vmaxq_u16(max45, max67);
  const uint16x8_t max07 = vmaxq_u16(max0123, max4567);

  const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
  const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
  const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
  const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);

  const uint16x8_t min0123 = vminq_u16(min01, min23);
  const uint16x8_t min4567 = vminq_u16(min45, min67);
  const uint16x8_t min07 = vminq_u16(min0123, min4567);

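  // Reduce the eight-lane max/min vectors to a single scalar each.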
#if VPX_ARCH_AARCH64
  *min = *max = 0;  // Clear high bits
  *((uint16_t *)max) = vmaxvq_u16(max07);
  *((uint16_t *)min) = vminvq_u16(min07);
#else
  // Split into 64-bit vectors and execute pairwise min/max.
  uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
  uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));

  // Enough runs of vpmax/min propagate the max/min values to every position.
  ab_max = vpmax_u16(ab_max, ab_max);
  ab_min = vpmin_u16(ab_min, ab_min);

  ab_max = vpmax_u16(ab_max, ab_max);
  ab_min = vpmin_u16(ab_min, ab_min);

  ab_max = vpmax_u16(ab_max, ab_max);
  ab_min = vpmin_u16(ab_min, ab_min);

  *min = *max = 0;  // Clear high bits
  // Store directly to avoid costly neon->gpr transfer.
  vst1_lane_u16((uint16_t *)max, ab_max, 0);
  vst1_lane_u16((uint16_t *)min, ab_min, 0);
#endif
}