/*
 * Copyright 2024 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <arm_neon.h>
#include <cstring>

#include "ultrahdr/dsp/arm/mem_neon.h"
#include "ultrahdr/editorhelper.h"

namespace ultrahdr {

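// Reverse the lanes of a full 128-bit vector: vrev64q_* reverses the elements
// within each 64-bit half, and the vextq_* swap of the two halves completes the
// reversal. For 64-bit lanes, swapping the halves alone is sufficient.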
#define vrev128q_u8(src, dst) \
  dst = vrev64q_u8(src);      \
  dst = vextq_u8(dst, dst, 8);

#define vrev128q_u16(src, dst) \
  dst = vrev64q_u16(src);      \
  dst = vextq_u16(dst, dst, 4);

#define vrev128q_u32(src, dst) \
  dst = vrev64q_u32(src);      \
  dst = vextq_u32(dst, dst, 2);

#define vrev128q_u64(a) a = vextq_u64(a, a, 1)

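// Horizontal mirror: for every row, vectors are loaded starting from the right
// edge of the source row, lane-reversed, and stored left to right into the
// destination row, stepping down through progressively smaller block sizes; any
// remaining pixels are mirrored by the trailing scalar loop.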
static void mirror_buffer_horizontal_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                  int src_w, int src_h, int src_stride,
                                                  int dst_stride) {
  uint8_t* src_row = src_buffer;
  uint8_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint8_t* src_blk = src_row + src_w;
    uint8_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 64 <= src_w; src_blk -= 64, dst_blk += 64, j += 64) {
      uint8x16x4_t s0 = load_u8x16_x4(src_blk - 64);
      uint8x16x4_t d0;
      vrev128q_u8(s0.val[0], d0.val[3]);
      vrev128q_u8(s0.val[1], d0.val[2]);
      vrev128q_u8(s0.val[2], d0.val[1]);
      vrev128q_u8(s0.val[3], d0.val[0]);
      store_u8x16_x4(dst_blk, d0);
    }

    for (; j + 32 <= src_w; src_blk -= 32, dst_blk += 32, j += 32) {
      uint8x16x2_t s0 = load_u8x16_x2(src_blk - 32);
      uint8x16x2_t d0;
      vrev128q_u8(s0.val[0], d0.val[1]);
      vrev128q_u8(s0.val[1], d0.val[0]);
      store_u8x16_x2(dst_blk, d0);
    }

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint8x16_t s0 = vld1q_u8(src_blk - 16);
      vrev128q_u8(s0, s0);
      vst1q_u8(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint8x8_t s0 = vld1_u8(src_blk - 8);
      s0 = vrev64_u8(s0);
      vst1_u8(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint16_t* src_row = src_buffer;
  uint16_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint16_t* src_blk = src_row + src_w;
    uint16_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 32 <= src_w; src_blk -= 32, dst_blk += 32, j += 32) {
      uint16x8x4_t s0 = load_u16x8_x4(src_blk - 32);
      uint16x8x4_t d0;
      vrev128q_u16(s0.val[0], d0.val[3]);
      vrev128q_u16(s0.val[1], d0.val[2]);
      vrev128q_u16(s0.val[2], d0.val[1]);
      vrev128q_u16(s0.val[3], d0.val[0]);
      store_u16x8_x4(dst_blk, d0);
    }

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint16x8x2_t s0 = load_u16x8_x2(src_blk - 16);
      uint16x8x2_t d0;
      vrev128q_u16(s0.val[0], d0.val[1]);
      vrev128q_u16(s0.val[1], d0.val[0]);
      store_u16x8_x2(dst_blk, d0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint16x8_t s0 = vld1q_u16(src_blk - 8);
      vrev128q_u16(s0, s0);
      vst1q_u16(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint32_t* src_row = src_buffer;
  uint32_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint32_t* src_blk = src_row + src_w;
    uint32_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint32x4x4_t s0 = load_u32x4_x4(src_blk - 16);
      uint32x4x4_t d0;
      vrev128q_u32(s0.val[0], d0.val[3]);
      vrev128q_u32(s0.val[1], d0.val[2]);
      vrev128q_u32(s0.val[2], d0.val[1]);
      vrev128q_u32(s0.val[3], d0.val[0]);
      store_u32x4_x4(dst_blk, d0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint32x4x2_t s0 = load_u32x4_x2(src_blk - 8);
      uint32x4x2_t d0;
      vrev128q_u32(s0.val[0], d0.val[1]);
      vrev128q_u32(s0.val[1], d0.val[0]);
      store_u32x4_x2(dst_blk, d0);
    }

    for (; j + 4 <= src_w; src_blk -= 4, dst_blk += 4, j += 4) {
      uint32x4_t s0 = vld1q_u32(src_blk - 4);
      vrev128q_u32(s0, s0);
      vst1q_u32(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint64_t* src_row = src_buffer;
  uint64_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint64_t* src_blk = src_row + src_w;
    uint64_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 2 <= src_w; src_blk -= 2, dst_blk += 2, j += 2) {
      uint64x2_t s0 = vld1q_u64(src_blk - 2);
      vrev128q_u64(s0);
      vst1q_u64(dst_blk, s0);
    }
    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

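// Vertical mirror: rows are copied unchanged but in reverse row order, writing
// the destination from its last row upwards; sub-vector tails are copied with
// memcpy.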
static void mirror_buffer_vertical_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer, int src_w,
                                                int src_h, int src_stride, int dst_stride) {
  uint8_t* src_row = src_buffer;
  uint8_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint8_t* src_blk = src_row;
    uint8_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 64 <= src_w; src_blk += 64, dst_blk += 64, j += 64) {
      uint8x16x4_t s0 = load_u8x16_x4(src_blk);
      store_u8x16_x4(dst_blk, s0);
    }

    for (; j + 32 <= src_w; src_blk += 32, dst_blk += 32, j += 32) {
      uint8x16x2_t s0 = load_u8x16_x2(src_blk);
      store_u8x16_x2(dst_blk, s0);
    }

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint8x16_t s0 = vld1q_u8(src_blk);
      vst1q_u8(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint8x8_t s0 = vld1_u8(src_blk);
      vst1_u8(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, src_w - j);
  }
}

static void mirror_buffer_vertical_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint16_t* src_row = src_buffer;
  uint16_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint16_t* src_blk = src_row;
    uint16_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 32 <= src_w; src_blk += 32, dst_blk += 32, j += 32) {
      uint16x8x4_t s0 = load_u16x8_x4(src_blk);
      store_u16x8_x4(dst_blk, s0);
    }

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint16x8x2_t s0 = load_u16x8_x2(src_blk);
      store_u16x8_x2(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint16x8_t s0 = vld1q_u16(src_blk);
      vst1q_u16(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint16_t));
  }
}

static void mirror_buffer_vertical_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint32_t* src_row = src_buffer;
  uint32_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint32_t* src_blk = src_row;
    uint32_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint32x4x4_t s0 = load_u32x4_x4(src_blk);
      store_u32x4_x4(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint32x4x2_t s0 = load_u32x4_x2(src_blk);
      store_u32x4_x2(dst_blk, s0);
    }

    for (; j + 4 <= src_w; src_blk += 4, dst_blk += 4, j += 4) {
      uint32x4_t s0 = vld1q_u32(src_blk);
      vst1q_u32(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint32_t));
  }
}

static void mirror_buffer_vertical_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint64_t* src_row = src_buffer;
  uint64_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint64_t* src_blk = src_row;
    uint64_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 2 <= src_w; src_blk += 2, dst_blk += 2, j += 2) {
      uint64x2_t s0 = vld1q_u64(src_blk);
      vst1q_u64(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint64_t));
  }
}

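// The helpers below operate on small square register tiles (8x8 bytes, 8x8
// halfwords, 4x4 words, 2x2 doublewords). They are the building blocks of the
// blockwise rotations: rotating a tile 90 degrees clockwise is a transpose
// followed by reversing the lanes of each output row, and rotating it 270
// degrees is a transpose followed by storing the rows in reverse order.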
static INLINE void transpose_u8_8x8(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3,
                                    uint8x8_t* a4, uint8x8_t* a5, uint8x8_t* a6, uint8x8_t* a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 =
      vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 =
      vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 =
      vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 =
      vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

static INLINE void reverse_uint8x8_regs(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3,
                                        uint8x8_t* a4, uint8x8_t* a5, uint8x8_t* a6,
                                        uint8x8_t* a7) {
  *a0 = vrev64_u8(*a0);
  *a1 = vrev64_u8(*a1);
  *a2 = vrev64_u8(*a2);
  *a3 = vrev64_u8(*a3);
  *a4 = vrev64_u8(*a4);
  *a5 = vrev64_u8(*a5);
  *a6 = vrev64_u8(*a6);
  *a7 = vrev64_u8(*a7);
}

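// vtrn1q_u64/vtrn2q_u64 exist only on AArch64, so the 32-bit Arm path emulates
// the 64-bit interleave by recombining the low and high halves with vcombine.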
static INLINE uint16x8x2_t vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;

#if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
  b0.val[0] =
      vreinterpretq_u16_u64(vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] =
      vreinterpretq_u16_u64(vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] =
      vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

static INLINE void transpose_u16_8x8(uint16x8_t* a0, uint16x8_t* a1, uint16x8_t* a2, uint16x8_t* a3,
                                     uint16x8_t* a4, uint16x8_t* a5, uint16x8_t* a6,
                                     uint16x8_t* a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77
  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77
  const uint32x4x2_t c0 =
      vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 =
      vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 =
      vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 =
      vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const uint16x8x2_t d0 = vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void reverse_uint16x8_regs(uint16x8_t* a0, uint16x8_t* a1, uint16x8_t* a2,
                                         uint16x8_t* a3, uint16x8_t* a4, uint16x8_t* a5,
                                         uint16x8_t* a6, uint16x8_t* a7) {
  vrev128q_u16(*a0, *a0);
  vrev128q_u16(*a1, *a1);
  vrev128q_u16(*a2, *a2);
  vrev128q_u16(*a3, *a3);
  vrev128q_u16(*a4, *a4);
  vrev128q_u16(*a5, *a5);
  vrev128q_u16(*a6, *a6);
  vrev128q_u16(*a7, *a7);
}

static INLINE uint32x4x2_t vtrnq_u64_to_u32(uint32x4_t a0, uint32x4_t a1) {
  uint32x4x2_t b0;
#if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
  b0.val[0] =
      vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] =
      vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u32(vget_low_u32(a0), vget_low_u32(a1));
  b0.val[1] = vcombine_u32(vget_high_u32(a0), vget_high_u32(a1));
#endif
  return b0;
}

static INLINE void transpose_u32_4x4(uint32x4_t* a0, uint32x4_t* a1, uint32x4_t* a2,
                                     uint32x4_t* a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const uint32x4x2_t b0 = vtrnq_u32(*a0, *a1);
  const uint32x4x2_t b1 = vtrnq_u32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const uint32x4x2_t c0 = vtrnq_u64_to_u32(b0.val[0], b1.val[0]);
  const uint32x4x2_t c1 = vtrnq_u64_to_u32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}

static INLINE void reverse_uint32x4_regs(uint32x4_t* a0, uint32x4_t* a1, uint32x4_t* a2,
                                         uint32x4_t* a3) {
  vrev128q_u32(*a0, *a0);
  vrev128q_u32(*a1, *a1);
  vrev128q_u32(*a2, *a2);
  vrev128q_u32(*a3, *a3);
}

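// For 64-bit elements a tile is only 2x2, so rotation reduces to recombining the
// four 64-bit halves directly instead of a transpose plus lane reversal.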
static INLINE void rotate90_u64_2x2(uint64x2_t* a0, uint64x2_t* a1) {
  uint64x2_t b0 = vcombine_u64(vget_low_u64(*a1), vget_low_u64(*a0));
  uint64x2_t b1 = vcombine_u64(vget_high_u64(*a1), vget_high_u64(*a0));
  *a0 = b0;
  *a1 = b1;
}

static INLINE void rotate270_u64_2x2(uint64x2_t* a0, uint64x2_t* a1) {
  uint64x2_t b0 = vcombine_u64(vget_low_u64(*a0), vget_low_u64(*a1));
  uint64x2_t b1 = vcombine_u64(vget_high_u64(*a0), vget_high_u64(*a1));
  *a0 = b1;
  *a1 = b0;
}

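// Tile load/store helpers: read or write a fixed-size block of consecutive rows
// separated by `stride` elements.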
static INLINE void load_u8_8x8(const uint8_t* s, const int stride, uint8x8_t* s0, uint8x8_t* s1,
                               uint8x8_t* s2, uint8x8_t* s3, uint8x8_t* s4, uint8x8_t* s5,
                               uint8x8_t* s6, uint8x8_t* s7) {
  *s0 = vld1_u8(s);
  s += stride;
  *s1 = vld1_u8(s);
  s += stride;
  *s2 = vld1_u8(s);
  s += stride;
  *s3 = vld1_u8(s);
  s += stride;
  *s4 = vld1_u8(s);
  s += stride;
  *s5 = vld1_u8(s);
  s += stride;
  *s6 = vld1_u8(s);
  s += stride;
  *s7 = vld1_u8(s);
}

static INLINE void load_u16_8x8(const uint16_t* s, const int stride, uint16x8_t* s0, uint16x8_t* s1,
                                uint16x8_t* s2, uint16x8_t* s3, uint16x8_t* s4, uint16x8_t* s5,
                                uint16x8_t* s6, uint16x8_t* s7) {
  *s0 = vld1q_u16(s);
  s += stride;
  *s1 = vld1q_u16(s);
  s += stride;
  *s2 = vld1q_u16(s);
  s += stride;
  *s3 = vld1q_u16(s);
  s += stride;
  *s4 = vld1q_u16(s);
  s += stride;
  *s5 = vld1q_u16(s);
  s += stride;
  *s6 = vld1q_u16(s);
  s += stride;
  *s7 = vld1q_u16(s);
}

static INLINE void load_u32_4x4(const uint32_t* s, const int stride, uint32x4_t* s1, uint32x4_t* s2,
                                uint32x4_t* s3, uint32x4_t* s4) {
  *s1 = vld1q_u32(s);
  s += stride;
  *s2 = vld1q_u32(s);
  s += stride;
  *s3 = vld1q_u32(s);
  s += stride;
  *s4 = vld1q_u32(s);
}

static INLINE void load_u64_2x2(const uint64_t* s, const int stride, uint64x2_t* s1,
                                uint64x2_t* s2) {
  *s1 = vld1q_u64(s);
  s += stride;
  *s2 = vld1q_u64(s);
}

static INLINE void store_u8_8x8(uint8_t* s, int stride, uint8x8_t s0, uint8x8_t s1, uint8x8_t s2,
                                uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6,
                                uint8x8_t s7) {
  vst1_u8(s, s0);
  s += stride;
  vst1_u8(s, s1);
  s += stride;
  vst1_u8(s, s2);
  s += stride;
  vst1_u8(s, s3);
  s += stride;
  vst1_u8(s, s4);
  s += stride;
  vst1_u8(s, s5);
  s += stride;
  vst1_u8(s, s6);
  s += stride;
  vst1_u8(s, s7);
}

static INLINE void store_u16_8x8(uint16_t* s, int stride, uint16x8_t s0, uint16x8_t s1,
                                 uint16x8_t s2, uint16x8_t s3, uint16x8_t s4, uint16x8_t s5,
                                 uint16x8_t s6, uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += stride;
  vst1q_u16(s, s1);
  s += stride;
  vst1q_u16(s, s2);
  s += stride;
  vst1q_u16(s, s3);
  s += stride;
  vst1q_u16(s, s4);
  s += stride;
  vst1q_u16(s, s5);
  s += stride;
  vst1q_u16(s, s6);
  s += stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u32_4x4(uint32_t* s, int stride, uint32x4_t s1, uint32x4_t s2,
                                 uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += stride;
  vst1q_u32(s, s2);
  s += stride;
  vst1q_u32(s, s3);
  s += stride;
  vst1q_u32(s, s4);
}

static INLINE void store_u64_2x2(uint64_t* s, int stride, uint64x2_t s1, uint64x2_t s2) {
  vst1q_u64(s, s1);
  s += stride;
  vst1q_u64(s, s2);
}

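// Blockwise 90-degree clockwise rotation. Full tiles are loaded, transposed,
// lane-reversed and stored at their rotated positions. Columns to the right of
// the last full tile are rotated by the scalar loop, and when src_h is not a
// multiple of the tile size the final pass restarts at src_h - blk_wd,
// overlapping the previous band (harmless, since the overlapped outputs are
// rewritten with identical values). Images narrower or shorter than one tile
// fall back to the generic rotate_buffer_clockwise().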
static void rotate_buffer_clockwise_90_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                    int src_w, int src_h, int src_stride,
                                                    int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint8x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint8_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint8_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u8_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      reverse_uint8x8_regs(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u8_8x8(dst_blk, dst_stride, s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint16x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint16_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint16_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u16_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u16_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      reverse_uint16x8_regs(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u16_8x8(dst_blk, dst_stride, s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 4;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint32x4_t s[blk_wd];
  int i = 0;

  while (1) {
    uint32_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint32_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u32_4x4(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3]);
      transpose_u32_4x4(&s[0], &s[1], &s[2], &s[3]);
      reverse_uint32x4_regs(&s[0], &s[1], &s[2], &s[3]);
      store_u32_4x4(dst_blk, dst_stride, s[0], s[1], s[2], s[3]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 2;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint64x2_t s[blk_wd];
  int i = 0;

  while (1) {
    uint64_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint64_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u64_2x2(src_blk, src_stride, &s[0], &s[1]);
      rotate90_u64_2x2(&s[0], &s[1]);
      store_u64_2x2(dst_blk, dst_stride, s[0], s[1]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

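// Blockwise 270-degree clockwise rotation: same tiling as the 90-degree path,
// but each transposed tile is stored with its register rows in reverse order,
// and the destination tile position moves toward the first output rows as j
// advances.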
static void rotate_buffer_clockwise_270_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint8x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint8_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint8_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u8_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u8_8x8(dst_blk, dst_stride, s[7], s[6], s[5], s[4], s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint16x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint16_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint16_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u16_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u16_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u16_8x8(dst_blk, dst_stride, s[7], s[6], s[5], s[4], s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 4;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint32x4_t s[blk_wd];
  int i = 0;

  while (1) {
    uint32_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint32_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u32_4x4(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3]);
      transpose_u32_4x4(&s[0], &s[1], &s[2], &s[3]);
      store_u32_4x4(dst_blk, dst_stride, s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 2;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint64x2_t s[blk_wd];
  int i = 0;

  while (1) {
    uint64_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint64_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u64_2x2(src_blk, src_stride, &s[0], &s[1]);
      rotate270_u64_2x2(&s[0], &s[1]);
      store_u64_2x2(dst_blk, dst_stride, s[0], s[1]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

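// Type-generic entry points: sizeof(T) selects the matching fixed-width kernel
// at compile time.
//
// Illustrative usage (hypothetical caller, not part of this file), mirroring an
// 8-bit plane horizontally:
//   mirror_buffer_neon<uint8_t>(src, dst, width, height, src_stride, dst_stride,
//                               UHDR_MIRROR_HORIZONTAL);
// Strides are expressed in elements of T, and dst must be able to hold
// width x height elements.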
template <typename T>
void mirror_buffer_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h, int src_stride,
                        int dst_stride, uhdr_mirror_direction_t direction) {
  if (direction == UHDR_MIRROR_VERTICAL) {
    if constexpr (sizeof(T) == 1) {
      mirror_buffer_vertical_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                          dst_stride);
    } else if constexpr (sizeof(T) == 2) {
      mirror_buffer_vertical_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    } else if constexpr (sizeof(T) == 4) {
      mirror_buffer_vertical_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    } else if constexpr (sizeof(T) == 8) {
      mirror_buffer_vertical_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    }

  } else if (direction == UHDR_MIRROR_HORIZONTAL) {
    if constexpr (sizeof(T) == 1) {
      mirror_buffer_horizontal_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                            dst_stride);
    } else if constexpr (sizeof(T) == 2) {
      mirror_buffer_horizontal_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    } else if constexpr (sizeof(T) == 4) {
      mirror_buffer_horizontal_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    } else if constexpr (sizeof(T) == 8) {
      mirror_buffer_horizontal_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    }
  }
}

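// A 180-degree rotation is a horizontal mirror that walks the source rows bottom
// to top, implemented here by starting at the last row and passing a negative
// source stride.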
template <typename T>
void rotate_buffer_clockwise_180_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                      int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    mirror_buffer_horizontal_neon_uint8_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                          src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    mirror_buffer_horizontal_neon_uint16_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    mirror_buffer_horizontal_neon_uint32_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    mirror_buffer_horizontal_neon_uint64_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_90_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                     int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    rotate_buffer_clockwise_90_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                            dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    rotate_buffer_clockwise_90_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    rotate_buffer_clockwise_90_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    rotate_buffer_clockwise_90_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_270_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                      int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    rotate_buffer_clockwise_270_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    rotate_buffer_clockwise_270_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    rotate_buffer_clockwise_270_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    rotate_buffer_clockwise_270_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                  int src_stride, int dst_stride, int degrees) {
  if (degrees == 90) {
    rotate_buffer_clockwise_90_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  } else if (degrees == 180) {
    rotate_buffer_clockwise_180_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  } else if (degrees == 270) {
    rotate_buffer_clockwise_270_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  }
}

template void mirror_buffer_neon<uint8_t>(uint8_t*, uint8_t*, int, int, int, int,
                                          uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint16_t>(uint16_t*, uint16_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint32_t>(uint32_t*, uint32_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint64_t>(uint64_t*, uint64_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);

template void rotate_buffer_clockwise_neon<uint8_t>(uint8_t*, uint8_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint16_t>(uint16_t*, uint16_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint32_t>(uint32_t*, uint32_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint64_t>(uint64_t*, uint64_t*, int, int, int, int, int);

}  // namespace ultrahdr