/*
 * Copyright 2024 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <arm_neon.h>
#include <cstring>

#include "ultrahdr/dsp/arm/mem_neon.h"
#include "ultrahdr/editorhelper.h"

namespace ultrahdr {

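// Reverse all lanes of a 128-bit vector: vrev64 reverses the elements within
// each 64-bit half, and the following vext swaps the two halves, yielding a
// fully reversed register.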
#define vrev128q_u8(src, dst) \
  dst = vrev64q_u8(src);      \
  dst = vextq_u8(dst, dst, 8);

#define vrev128q_u16(src, dst) \
  dst = vrev64q_u16(src);      \
  dst = vextq_u16(dst, dst, 4);

#define vrev128q_u32(src, dst) \
  dst = vrev64q_u32(src);      \
  dst = vextq_u32(dst, dst, 2);

#define vrev128q_u64(a) a = vextq_u64(a, a, 1)

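// Horizontal mirror: for every row, read blocks starting from the right edge,
// reverse the element order inside each block, and write the blocks out from
// the left edge of the destination row. Progressively smaller block sizes are
// tried, and any remaining tail pixels are copied with a scalar loop.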
static void mirror_buffer_horizontal_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                  int src_w, int src_h, int src_stride,
                                                  int dst_stride) {
  uint8_t* src_row = src_buffer;
  uint8_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint8_t* src_blk = src_row + src_w;
    uint8_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 64 <= src_w; src_blk -= 64, dst_blk += 64, j += 64) {
      uint8x16x4_t s0 = load_u8x16_x4(src_blk - 64);
      uint8x16x4_t d0;
      vrev128q_u8(s0.val[0], d0.val[3]);
      vrev128q_u8(s0.val[1], d0.val[2]);
      vrev128q_u8(s0.val[2], d0.val[1]);
      vrev128q_u8(s0.val[3], d0.val[0]);
      store_u8x16_x4(dst_blk, d0);
    }

    for (; j + 32 <= src_w; src_blk -= 32, dst_blk += 32, j += 32) {
      uint8x16x2_t s0 = load_u8x16_x2(src_blk - 32);
      uint8x16x2_t d0;
      vrev128q_u8(s0.val[0], d0.val[1]);
      vrev128q_u8(s0.val[1], d0.val[0]);
      store_u8x16_x2(dst_blk, d0);
    }

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint8x16_t s0 = vld1q_u8(src_blk - 16);
      vrev128q_u8(s0, s0);
      vst1q_u8(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint8x8_t s0 = vld1_u8(src_blk - 8);
      s0 = vrev64_u8(s0);
      vst1_u8(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint16_t* src_row = src_buffer;
  uint16_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint16_t* src_blk = src_row + src_w;
    uint16_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 32 <= src_w; src_blk -= 32, dst_blk += 32, j += 32) {
      uint16x8x4_t s0 = load_u16x8_x4(src_blk - 32);
      uint16x8x4_t d0;
      vrev128q_u16(s0.val[0], d0.val[3]);
      vrev128q_u16(s0.val[1], d0.val[2]);
      vrev128q_u16(s0.val[2], d0.val[1]);
      vrev128q_u16(s0.val[3], d0.val[0]);
      store_u16x8_x4(dst_blk, d0);
    }

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint16x8x2_t s0 = load_u16x8_x2(src_blk - 16);
      uint16x8x2_t d0;
      vrev128q_u16(s0.val[0], d0.val[1]);
      vrev128q_u16(s0.val[1], d0.val[0]);
      store_u16x8_x2(dst_blk, d0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint16x8_t s0 = vld1q_u16(src_blk - 8);
      vrev128q_u16(s0, s0);
      vst1q_u16(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint32_t* src_row = src_buffer;
  uint32_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint32_t* src_blk = src_row + src_w;
    uint32_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint32x4x4_t s0 = load_u32x4_x4(src_blk - 16);
      uint32x4x4_t d0;
      vrev128q_u32(s0.val[0], d0.val[3]);
      vrev128q_u32(s0.val[1], d0.val[2]);
      vrev128q_u32(s0.val[2], d0.val[1]);
      vrev128q_u32(s0.val[3], d0.val[0]);
      store_u32x4_x4(dst_blk, d0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint32x4x2_t s0 = load_u32x4_x2(src_blk - 8);
      uint32x4x2_t d0;
      vrev128q_u32(s0.val[0], d0.val[1]);
      vrev128q_u32(s0.val[1], d0.val[0]);
      store_u32x4_x2(dst_blk, d0);
    }

    for (; j + 4 <= src_w; src_blk -= 4, dst_blk += 4, j += 4) {
      uint32x4_t s0 = vld1q_u32(src_blk - 4);
      vrev128q_u32(s0, s0);
      vst1q_u32(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint64_t* src_row = src_buffer;
  uint64_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint64_t* src_blk = src_row + src_w;
    uint64_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 2 <= src_w; src_blk -= 2, dst_blk += 2, j += 2) {
      uint64x2_t s0 = vld1q_u64(src_blk - 2);
      vrev128q_u64(s0);
      vst1q_u64(dst_blk, s0);
    }
    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

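// Vertical mirror: rows keep their left-to-right order and are copied into the
// destination starting from its last row and walking upwards; wide NEON
// loads/stores handle the bulk of each row and memcpy handles the tail.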
static void mirror_buffer_vertical_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer, int src_w,
                                                int src_h, int src_stride, int dst_stride) {
  uint8_t* src_row = src_buffer;
  uint8_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint8_t* src_blk = src_row;
    uint8_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 64 <= src_w; src_blk += 64, dst_blk += 64, j += 64) {
      uint8x16x4_t s0 = load_u8x16_x4(src_blk);
      store_u8x16_x4(dst_blk, s0);
    }

    for (; j + 32 <= src_w; src_blk += 32, dst_blk += 32, j += 32) {
      uint8x16x2_t s0 = load_u8x16_x2(src_blk);
      store_u8x16_x2(dst_blk, s0);
    }

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint8x16_t s0 = vld1q_u8(src_blk);
      vst1q_u8(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint8x8_t s0 = vld1_u8(src_blk);
      vst1_u8(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, src_w - j);
  }
}

static void mirror_buffer_vertical_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint16_t* src_row = src_buffer;
  uint16_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint16_t* src_blk = src_row;
    uint16_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 32 <= src_w; src_blk += 32, dst_blk += 32, j += 32) {
      uint16x8x4_t s0 = load_u16x8_x4(src_blk);
      store_u16x8_x4(dst_blk, s0);
    }

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint16x8x2_t s0 = load_u16x8_x2(src_blk);
      store_u16x8_x2(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint16x8_t s0 = vld1q_u16(src_blk);
      vst1q_u16(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint16_t));
  }
}

static void mirror_buffer_vertical_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint32_t* src_row = src_buffer;
  uint32_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint32_t* src_blk = src_row;
    uint32_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint32x4x4_t s0 = load_u32x4_x4(src_blk);
      store_u32x4_x4(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint32x4x2_t s0 = load_u32x4_x2(src_blk);
      store_u32x4_x2(dst_blk, s0);
    }

    for (; j + 4 <= src_w; src_blk += 4, dst_blk += 4, j += 4) {
      uint32x4_t s0 = vld1q_u32(src_blk);
      vst1q_u32(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint32_t));
  }
}

static void mirror_buffer_vertical_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint64_t* src_row = src_buffer;
  uint64_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint64_t* src_blk = src_row;
    uint64_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 2 <= src_w; src_blk += 2, dst_blk += 2, j += 2) {
      uint64x2_t s0 = vld1q_u64(src_blk);
      vst1q_u64(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint64_t));
  }
}

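// The transpose helpers below operate on square register tiles (8x8 bytes,
// 8x8 16-bit words or 4x4 32-bit words). Together with the per-register
// reverses that follow, they form the building blocks of the tiled 90- and
// 270-degree rotations further down.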
static INLINE void transpose_u8_8x8(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3,
                                    uint8x8_t* a4, uint8x8_t* a5, uint8x8_t* a6, uint8x8_t* a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 =
      vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 =
      vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 =
      vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 =
      vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

static INLINE void reverse_uint8x8_regs(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3,
                                        uint8x8_t* a4, uint8x8_t* a5, uint8x8_t* a6,
                                        uint8x8_t* a7) {
  *a0 = vrev64_u8(*a0);
  *a1 = vrev64_u8(*a1);
  *a2 = vrev64_u8(*a2);
  *a3 = vrev64_u8(*a3);
  *a4 = vrev64_u8(*a4);
  *a5 = vrev64_u8(*a5);
  *a6 = vrev64_u8(*a6);
  *a7 = vrev64_u8(*a7);
}

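// vtrn1q_u64/vtrn2q_u64 are only available on AArch64, so the helpers below
// fall back to recombining the 64-bit halves with vcombine on 32-bit Arm
// builds; both paths produce the same interleaving of the two inputs.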
static INLINE uint16x8x2_t vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;

#if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
  b0.val[0] =
      vreinterpretq_u16_u64(vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] =
      vreinterpretq_u16_u64(vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] =
      vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

static INLINE void transpose_u16_8x8(uint16x8_t* a0, uint16x8_t* a1, uint16x8_t* a2, uint16x8_t* a3,
                                     uint16x8_t* a4, uint16x8_t* a5, uint16x8_t* a6,
                                     uint16x8_t* a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77
  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77
  const uint32x4x2_t c0 =
      vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 =
      vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 =
      vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 =
      vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const uint16x8x2_t d0 = vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void reverse_uint16x8_regs(uint16x8_t* a0, uint16x8_t* a1, uint16x8_t* a2,
                                         uint16x8_t* a3, uint16x8_t* a4, uint16x8_t* a5,
                                         uint16x8_t* a6, uint16x8_t* a7) {
  vrev128q_u16(*a0, *a0);
  vrev128q_u16(*a1, *a1);
  vrev128q_u16(*a2, *a2);
  vrev128q_u16(*a3, *a3);
  vrev128q_u16(*a4, *a4);
  vrev128q_u16(*a5, *a5);
  vrev128q_u16(*a6, *a6);
  vrev128q_u16(*a7, *a7);
}

static INLINE uint32x4x2_t vtrnq_u64_to_u32(uint32x4_t a0, uint32x4_t a1) {
  uint32x4x2_t b0;
#if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
  b0.val[0] =
      vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] =
      vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u32(vget_low_u32(a0), vget_low_u32(a1));
  b0.val[1] = vcombine_u32(vget_high_u32(a0), vget_high_u32(a1));
#endif
  return b0;
}

static INLINE void transpose_u32_4x4(uint32x4_t* a0, uint32x4_t* a1, uint32x4_t* a2,
                                     uint32x4_t* a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const uint32x4x2_t b0 = vtrnq_u32(*a0, *a1);
  const uint32x4x2_t b1 = vtrnq_u32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const uint32x4x2_t c0 = vtrnq_u64_to_u32(b0.val[0], b1.val[0]);
  const uint32x4x2_t c1 = vtrnq_u64_to_u32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}

static INLINE void reverse_uint32x4_regs(uint32x4_t* a0, uint32x4_t* a1, uint32x4_t* a2,
                                         uint32x4_t* a3) {
  vrev128q_u32(*a0, *a0);
  vrev128q_u32(*a1, *a1);
  vrev128q_u32(*a2, *a2);
  vrev128q_u32(*a3, *a3);
}

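// 2x2 rotation of 64-bit elements: for input rows [00 01] / [10 11], a
// clockwise turn produces [10 00] / [11 01] and a 270-degree (counter-
// clockwise) turn produces [01 11] / [00 10]; both are built by recombining
// the low and high halves of the two input registers.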
static INLINE void rotate90_u64_2x2(uint64x2_t* a0, uint64x2_t* a1) {
  uint64x2_t b0 = vcombine_u64(vget_low_u64(*a1), vget_low_u64(*a0));
  uint64x2_t b1 = vcombine_u64(vget_high_u64(*a1), vget_high_u64(*a0));
  *a0 = b0;
  *a1 = b1;
}

static INLINE void rotate270_u64_2x2(uint64x2_t* a0, uint64x2_t* a1) {
  uint64x2_t b0 = vcombine_u64(vget_low_u64(*a0), vget_low_u64(*a1));
  uint64x2_t b1 = vcombine_u64(vget_high_u64(*a0), vget_high_u64(*a1));
  *a0 = b1;
  *a1 = b0;
}

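// Strided tile loads and stores: read or write 8x8 (8- and 16-bit), 4x4
// (32-bit) or 2x2 (64-bit) blocks, one register per row, advancing by the
// given stride between rows.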
static INLINE void load_u8_8x8(const uint8_t* s, const int stride, uint8x8_t* s0, uint8x8_t* s1,
                               uint8x8_t* s2, uint8x8_t* s3, uint8x8_t* s4, uint8x8_t* s5,
                               uint8x8_t* s6, uint8x8_t* s7) {
  *s0 = vld1_u8(s);
  s += stride;
  *s1 = vld1_u8(s);
  s += stride;
  *s2 = vld1_u8(s);
  s += stride;
  *s3 = vld1_u8(s);
  s += stride;
  *s4 = vld1_u8(s);
  s += stride;
  *s5 = vld1_u8(s);
  s += stride;
  *s6 = vld1_u8(s);
  s += stride;
  *s7 = vld1_u8(s);
}

static INLINE void load_u16_8x8(const uint16_t* s, const int stride, uint16x8_t* s0, uint16x8_t* s1,
                                uint16x8_t* s2, uint16x8_t* s3, uint16x8_t* s4, uint16x8_t* s5,
                                uint16x8_t* s6, uint16x8_t* s7) {
  *s0 = vld1q_u16(s);
  s += stride;
  *s1 = vld1q_u16(s);
  s += stride;
  *s2 = vld1q_u16(s);
  s += stride;
  *s3 = vld1q_u16(s);
  s += stride;
  *s4 = vld1q_u16(s);
  s += stride;
  *s5 = vld1q_u16(s);
  s += stride;
  *s6 = vld1q_u16(s);
  s += stride;
  *s7 = vld1q_u16(s);
}

static INLINE void load_u32_4x4(const uint32_t* s, const int stride, uint32x4_t* s1, uint32x4_t* s2,
                                uint32x4_t* s3, uint32x4_t* s4) {
  *s1 = vld1q_u32(s);
  s += stride;
  *s2 = vld1q_u32(s);
  s += stride;
  *s3 = vld1q_u32(s);
  s += stride;
  *s4 = vld1q_u32(s);
}

static INLINE void load_u64_2x2(const uint64_t* s, const int stride, uint64x2_t* s1,
                                uint64x2_t* s2) {
  *s1 = vld1q_u64(s);
  s += stride;
  *s2 = vld1q_u64(s);
}

static INLINE void store_u8_8x8(uint8_t* s, int stride, uint8x8_t s0, uint8x8_t s1, uint8x8_t s2,
                                uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6,
                                uint8x8_t s7) {
  vst1_u8(s, s0);
  s += stride;
  vst1_u8(s, s1);
  s += stride;
  vst1_u8(s, s2);
  s += stride;
  vst1_u8(s, s3);
  s += stride;
  vst1_u8(s, s4);
  s += stride;
  vst1_u8(s, s5);
  s += stride;
  vst1_u8(s, s6);
  s += stride;
  vst1_u8(s, s7);
}

static INLINE void store_u16_8x8(uint16_t* s, int stride, uint16x8_t s0, uint16x8_t s1,
                                 uint16x8_t s2, uint16x8_t s3, uint16x8_t s4, uint16x8_t s5,
                                 uint16x8_t s6, uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += stride;
  vst1q_u16(s, s1);
  s += stride;
  vst1q_u16(s, s2);
  s += stride;
  vst1q_u16(s, s3);
  s += stride;
  vst1q_u16(s, s4);
  s += stride;
  vst1q_u16(s, s5);
  s += stride;
  vst1q_u16(s, s6);
  s += stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u32_4x4(uint32_t* s, int stride, uint32x4_t s1, uint32x4_t s2,
                                 uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += stride;
  vst1q_u32(s, s2);
  s += stride;
  vst1q_u32(s, s3);
  s += stride;
  vst1q_u32(s, s4);
}

static INLINE void store_u64_2x2(uint64_t* s, int stride, uint64x2_t s1, uint64x2_t s2) {
  vst1q_u64(s, s1);
  s += stride;
  vst1q_u64(s, s2);
}

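// Clockwise 90-degree rotation, processed in square tiles: each tile is
// transposed in registers and then every row register is reversed, which
// places the tile in its rotated orientation before it is stored at the
// mirrored column position of the destination. Images narrower or shorter
// than one tile fall back to the scalar rotate_buffer_clockwise(). Columns
// left over on the right edge are rotated with a scalar loop, and when the
// height is not a multiple of the tile size the last pass re-anchors at
// i = src_h - blk_wd, re-rotating a few already-written rows harmlessly.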
static void rotate_buffer_clockwise_90_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                    int src_w, int src_h, int src_stride,
                                                    int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint8x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint8_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint8_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u8_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      reverse_uint8x8_regs(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u8_8x8(dst_blk, dst_stride, s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint16x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint16_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint16_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u16_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u16_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      reverse_uint16x8_regs(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u16_8x8(dst_blk, dst_stride, s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 4;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint32x4_t s[blk_wd];
  int i = 0;

  while (1) {
    uint32_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint32_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u32_4x4(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3]);
      transpose_u32_4x4(&s[0], &s[1], &s[2], &s[3]);
      reverse_uint32x4_regs(&s[0], &s[1], &s[2], &s[3]);
      store_u32_4x4(dst_blk, dst_stride, s[0], s[1], s[2], s[3]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 2;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint64x2_t s[blk_wd];
  int i = 0;

  while (1) {
    uint64_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint64_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u64_2x2(src_blk, src_stride, &s[0], &s[1]);
      rotate90_u64_2x2(&s[0], &s[1]);
      store_u64_2x2(dst_blk, dst_stride, s[0], s[1]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

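// Clockwise 270-degree rotation, also tiled: each tile is transposed and its
// row registers are then stored in reverse order into a destination column
// that starts at the bottom of the output and moves upwards. The scalar
// fallback and edge handling mirror the 90-degree variants above.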
static void rotate_buffer_clockwise_270_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint8x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint8_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint8_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u8_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u8_8x8(dst_blk, dst_stride, s[7], s[6], s[5], s[4], s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint16x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint16_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint16_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u16_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u16_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u16_8x8(dst_blk, dst_stride, s[7], s[6], s[5], s[4], s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 4;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint32x4_t s[blk_wd];
  int i = 0;

  while (1) {
    uint32_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint32_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u32_4x4(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3]);
      transpose_u32_4x4(&s[0], &s[1], &s[2], &s[3]);
      store_u32_4x4(dst_blk, dst_stride, s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 2;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint64x2_t s[blk_wd];
  int i = 0;

  while (1) {
    uint64_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint64_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u64_2x2(src_blk, src_stride, &s[0], &s[1]);
      rotate270_u64_2x2(&s[0], &s[1]);
      store_u64_2x2(dst_blk, dst_stride, s[0], s[1]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

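// Dispatch helpers: mirror_buffer_neon selects the fixed-width kernel above
// that matches sizeof(T), so 1-, 2-, 4- and 8-byte pixel words each take the
// widest NEON path available. For example, a 32-bit per-pixel plane would be
// mirrored with
//   mirror_buffer_neon<uint32_t>(src, dst, w, h, src_stride, dst_stride,
//                                UHDR_MIRROR_VERTICAL);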
template <typename T>
void mirror_buffer_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h, int src_stride,
                        int dst_stride, uhdr_mirror_direction_t direction) {
  if (direction == UHDR_MIRROR_VERTICAL) {
    if constexpr (sizeof(T) == 1) {
      mirror_buffer_vertical_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                          dst_stride);
    } else if constexpr (sizeof(T) == 2) {
      mirror_buffer_vertical_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    } else if constexpr (sizeof(T) == 4) {
      mirror_buffer_vertical_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    } else if constexpr (sizeof(T) == 8) {
      mirror_buffer_vertical_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    }

  } else if (direction == UHDR_MIRROR_HORIZONTAL) {
    if constexpr (sizeof(T) == 1) {
      mirror_buffer_horizontal_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                            dst_stride);
    } else if constexpr (sizeof(T) == 2) {
      mirror_buffer_horizontal_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    } else if constexpr (sizeof(T) == 4) {
      mirror_buffer_horizontal_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    } else if constexpr (sizeof(T) == 8) {
      mirror_buffer_horizontal_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    }
  }
}

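// A 180-degree rotation is a horizontal mirror with the rows visited bottom to
// top, so it reuses the horizontal-mirror kernels with the source anchored at
// its last row and a negated source stride.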
template <typename T>
void rotate_buffer_clockwise_180_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                      int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    mirror_buffer_horizontal_neon_uint8_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                          src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    mirror_buffer_horizontal_neon_uint16_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    mirror_buffer_horizontal_neon_uint32_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    mirror_buffer_horizontal_neon_uint64_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_90_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                     int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    rotate_buffer_clockwise_90_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                            dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    rotate_buffer_clockwise_90_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    rotate_buffer_clockwise_90_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    rotate_buffer_clockwise_90_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_270_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                      int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    rotate_buffer_clockwise_270_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    rotate_buffer_clockwise_270_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    rotate_buffer_clockwise_270_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    rotate_buffer_clockwise_270_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                  int src_stride, int dst_stride, int degrees) {
  if (degrees == 90) {
    rotate_buffer_clockwise_90_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  } else if (degrees == 180) {
    rotate_buffer_clockwise_180_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  } else if (degrees == 270) {
    rotate_buffer_clockwise_270_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  }
}

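// Explicit instantiations for the pixel word sizes handled above.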
template void mirror_buffer_neon<uint8_t>(uint8_t*, uint8_t*, int, int, int, int,
                                          uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint16_t>(uint16_t*, uint16_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint32_t>(uint32_t*, uint32_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint64_t>(uint64_t*, uint64_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);

template void rotate_buffer_clockwise_neon<uint8_t>(uint8_t*, uint8_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint16_t>(uint16_t*, uint16_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint32_t>(uint32_t*, uint32_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint64_t>(uint64_t*, uint64_t*, int, int, int, int, int);

}  // namespace ultrahdr