xref: /aosp_15_r20/external/libdav1d/src/arm/64/util.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1/******************************************************************************
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2015 Martin Storsjo
4 * Copyright © 2015 Janne Grunau
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 *    list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
28
29#ifndef DAV1D_SRC_ARM_64_UTIL_S
30#define DAV1D_SRC_ARM_64_UTIL_S
31
32#include "config.h"
33#include "src/arm/asm.S"
34
// Fallback for compilers without the __has_feature() extension: every
// query evaluates to 0, so the "#elif __has_feature(...)" test used by
// movrel in this file stays a valid preprocessor expression.
#ifndef __has_feature
#define __has_feature(x) 0
#endif
38
// movrel rd, val, offset=0
//
// Load the address of symbol val plus a constant offset into rd.
//   rd     - destination general purpose register; the only register written
//   val    - symbol name
//   offset - assemble-time constant added to the address (default 0)
//
// The emitted sequence is selected by the preprocessor:
//   __APPLE__                adrp + add using @PAGE / @PAGEOFF
//   PIC and _WIN32           adrp + add using :lo12:
//   hwaddress_sanitizer      adrp + movk + add (see note in that branch)
//   PIC                      adrp + add using :lo12:
//   otherwise                ldr rd, =expression
.macro  movrel rd, val, offset=0
#if defined(__APPLE__)
  .if \offset < 0
        // A negative offset is not folded into the relocation expression:
        // the address of val is formed first, then the offset is applied
        // with a separate sub. NOTE(review): presumably the object format
        // cannot encode a negative addend here - confirm.
        adrp            \rd, \val@PAGE
        add             \rd, \rd, \val@PAGEOFF
        sub             \rd, \rd, -(\offset)
  .else
        adrp            \rd, \val+(\offset)@PAGE
        add             \rd, \rd, \val+(\offset)@PAGEOFF
  .endif
#elif defined(PIC) && defined(_WIN32)
  .if \offset < 0
        // Same handling of negative offsets as in the __APPLE__ branch.
        adrp            \rd, \val
        add             \rd, \rd, :lo12:\val
        sub             \rd, \rd, -(\offset)
  .else
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
  .endif
#elif __has_feature(hwaddress_sanitizer)
        // NOTE(review): this looks like the HWASan tagged-global addressing
        // sequence, where the movk with :prel_g3: supplies the top 16 bits
        // of the address - confirm against the HWASan/ELF documentation.
        // The offset is applied in the adrp and add only, not in the movk.
        adrp            \rd, :pg_hi21_nc:\val+(\offset)
        movk            \rd, #:prel_g3:\val+0x100000000
        add             \rd, \rd, :lo12:\val+(\offset)
#elif defined(PIC)
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
#else
        // Non-PIC build: let the assembler materialise the absolute address.
        // The offset is parenthesised like in the other branches so that an
        // expression argument is always added as a single term.
        ldr             \rd, =\val+(\offset)
#endif
.endm
69
// sub_sp space
//
// Lower sp by the assemble-time constant space (in bytes).
//
// _WIN32 builds:
//   space <= 4096          a single sub on sp
//   4096 < space <= 8192   sp is lowered in two steps with a load from the
//                          intermediate address in between; x16 is clobbered
//   space > 8192           rejected with an assembler error
// Other builds:
//   the amount is split into a multiple of 4096 and a remainder below 4096,
//   one sub each, and a part that is zero emits no instruction. Each part
//   then fits the A64 add/sub immediate form (12 bits, optionally shifted
//   left by 12), which bounds the multiple-of-4096 part accordingly.
.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
        // Here, we'd need to touch two (or more) pages while decrementing
        // the stack pointer.
        .error          "sub_sp doesn't support values over 8K at the moment"
.elseif \space > 4096
        // Step down 4096 bytes into x16 and load from that address (result
        // discarded into xzr) so the intermediate page is touched before sp
        // is moved past it.
        sub             x16, sp,  #4096
        ldr             xzr, [x16]
        sub             sp,  x16, #((\space) - 4096)
.else
        sub             sp,  sp,  #(\space)
.endif
#else
.if \space >= 4096
        // Whole multiples of 4096 first.
        sub             sp,  sp,  #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
        // Then the remainder below 4096, if any.
        sub             sp,  sp,  #(\space)%4096
.endif
#endif
.endm
92
// transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
//
// Transpose an 8x8 block of bytes and widen every element to 16 bits.
//   r0-r7 - vector register names without arrangement suffix. On entry the
//           low 8 bytes of each hold one input row (rows a-h in the lane
//           comments below); zip1 reads only those low halves. On exit
//           r0-r7 hold transposed rows 0-7 as eight halfwords each.
//   xtl   - mnemonic of the widening instruction, applied as xtl to the low
//           half and as xtl2 to the high half (presumably uxtl or sxtl).
// No scratch registers are used.
.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
        // Stage 1: interleave the bytes of neighbouring rows.
        // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
        zip1            \r0\().16b, \r0\().16b, \r1\().16b
        // c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7
        zip1            \r2\().16b, \r2\().16b, \r3\().16b
        // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
        zip1            \r4\().16b, \r4\().16b, \r5\().16b
        // g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7
        zip1            \r6\().16b, \r6\().16b, \r7\().16b

        // Stage 2: transpose byte pairs (16 bit lanes).
        // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
        trn1            \r1\().8h,  \r0\().8h,  \r2\().8h
        // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
        trn2            \r3\().8h,  \r0\().8h,  \r2\().8h
        // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
        trn1            \r5\().8h,  \r4\().8h,  \r6\().8h
        // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
        trn2            \r7\().8h,  \r4\().8h,  \r6\().8h

        // Stage 3: transpose byte quads (32 bit lanes). Each register now
        // holds two complete output rows, one per 64 bit half.
        // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
        trn1            \r0\().4s,  \r1\().4s,  \r5\().4s
        // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
        trn2            \r2\().4s,  \r1\().4s,  \r5\().4s
        // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
        trn1            \r1\().4s,  \r3\().4s,  \r7\().4s
        // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
        trn2            \r3\().4s,  \r3\().4s,  \r7\().4s

        // Stage 4: widen. The high half (rows 4-7) is extracted first,
        // because the low-half widening overwrites its own source register.
        \xtl\()2        \r4\().8h,  \r0\().16b
        \xtl            \r0\().8h,  \r0\().8b
        \xtl\()2        \r6\().8h,  \r2\().16b
        \xtl            \r2\().8h,  \r2\().8b
        \xtl\()2        \r5\().8h,  \r1\().16b
        \xtl            \r1\().8h,  \r1\().8b
        \xtl\()2        \r7\().8h,  \r3\().16b
        \xtl            \r3\().8h,  \r3\().8b
.endm
130
// transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
//
// In-place transpose of an 8x8 block of 16 bit elements.
//   r0-r7 - vector register names without arrangement suffix; rows 0-7 on
//           entry, transposed rows 0-7 on exit
//   t8,t9 - scratch vector registers, clobbered
// The instruction order matters: several registers are overwritten while
// their old contents have already been consumed by an earlier instruction.
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
        // Stage 1: transpose 16 bit lanes of row pairs (0,1) (2,3) (4,5) (6,7).
        trn1            \t8\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t9\().8h,  \r0\().8h,  \r1\().8h
        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h

        // Stage 2: transpose 32 bit lanes of the stage 1 results.
        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
        trn1            \r5\().4s,  \t9\().4s,  \r3\().4s
        trn2            \t9\().4s,  \t9\().4s,  \r3\().4s
        trn1            \r3\().4s,  \t8\().4s,  \r1\().4s
        trn2            \t8\().4s,  \t8\().4s,  \r1\().4s

        // Stage 3: combine 64 bit halves into the final rows.
        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d
        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d
        trn2            \r6\().2d,  \t8\().2d,  \r2\().2d
        trn1            \r2\().2d,  \t8\().2d,  \r2\().2d
        trn1            \r3\().2d,  \t9\().2d,  \r7\().2d
        trn2            \r7\().2d,  \t9\().2d,  \r7\().2d
.endm
159
// transpose_8x8h_mov r0-r7, t8, t9, o0-o7
//
// Transpose an 8x8 block of 16 bit elements into a separate set of
// output registers.
//   r0-r7 - vector registers holding rows 0-7 on entry; all of them are
//           overwritten with intermediate values
//   t8,t9 - scratch vector registers, clobbered
//   o0-o7 - vector registers receiving transposed rows 0-7
// The first two stages are the same as in transpose_8x8h; only the last
// stage writes to o0-o7 instead of r0-r7. The outputs are written one at a
// time in that stage, so an output register must not alias an r/t register
// that a later instruction of the stage still reads.
.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7
        // Stage 1: transpose 16 bit lanes of row pairs (0,1) (2,3) (4,5) (6,7).
        trn1            \t8\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t9\().8h,  \r0\().8h,  \r1\().8h
        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h

        // Stage 2: transpose 32 bit lanes of the stage 1 results.
        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
        trn1            \r5\().4s,  \t9\().4s,  \r3\().4s
        trn2            \t9\().4s,  \t9\().4s,  \r3\().4s
        trn1            \r3\().4s,  \t8\().4s,  \r1\().4s
        trn2            \t8\().4s,  \t8\().4s,  \r1\().4s

        // Stage 3: combine 64 bit halves into the output registers.
        trn1            \o0\().2d,  \r3\().2d,  \r4\().2d
        trn2            \o4\().2d,  \r3\().2d,  \r4\().2d
        trn1            \o1\().2d,  \r5\().2d,  \r6\().2d
        trn2            \o5\().2d,  \r5\().2d,  \r6\().2d
        trn2            \o6\().2d,  \t8\().2d,  \r2\().2d
        trn1            \o2\().2d,  \t8\().2d,  \r2\().2d
        trn1            \o3\().2d,  \t9\().2d,  \r7\().2d
        trn2            \o7\().2d,  \t9\().2d,  \r7\().2d
.endm
188
// transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
//
// In-place transpose of byte elements held in eight 16 byte registers.
// There is no 64 bit stage, so the two 8 byte halves are transposed
// independently: on exit register rN holds column N of the input in its
// low 8 bytes and column N+8 in its high 8 bytes.
//   r0-r7 - vector register names without arrangement suffix; rows 0-7 on
//           entry, results as described above on exit
//   t8,t9 - scratch vector registers, clobbered
// The instruction order matters: several registers are overwritten while
// their old contents have already been consumed by an earlier instruction.
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
        // Stage 1: transpose bytes of row pairs (0,1) (2,3) (4,5) (6,7).
        trn1            \t8\().16b, \r0\().16b, \r1\().16b
        trn2            \t9\().16b, \r0\().16b, \r1\().16b
        trn1            \r1\().16b, \r2\().16b, \r3\().16b
        trn2            \r3\().16b, \r2\().16b, \r3\().16b
        trn1            \r0\().16b, \r4\().16b, \r5\().16b
        trn2            \r5\().16b, \r4\().16b, \r5\().16b
        trn1            \r2\().16b, \r6\().16b, \r7\().16b
        trn2            \r7\().16b, \r6\().16b, \r7\().16b

        // Stage 2: transpose 16 bit lanes of the stage 1 results.
        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
        trn1            \r5\().8h,  \t9\().8h,  \r3\().8h
        trn2            \t9\().8h,  \t9\().8h,  \r3\().8h
        trn1            \r3\().8h,  \t8\().8h,  \r1\().8h
        trn2            \t8\().8h,  \t8\().8h,  \r1\().8h

        // Stage 3: transpose 32 bit lanes into the final layout.
        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s
        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s
        trn2            \r6\().4s,  \t8\().4s,  \r2\().4s
        trn1            \r2\().4s,  \t8\().4s,  \r2\().4s
        trn1            \r3\().4s,  \t9\().4s,  \r7\().4s
        trn2            \r7\().4s,  \t9\().4s,  \r7\().4s
.endm
217
// transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose byte elements held in four 16 byte registers. Each group of
// four byte columns is transposed independently: on exit 32 bit lane k of
// register rN holds input column N + 4*k (one byte from each of the four
// rows).
//   r0-r3 - vector register names without arrangement suffix; rows 0-3 on
//           entry, results on exit
//   t4-t7 - scratch vector registers, clobbered
.macro  transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
        // Stage 1: transpose bytes of row pairs (0,1) and (2,3).
        trn1            \t4\().16b, \r0\().16b, \r1\().16b
        trn2            \t5\().16b, \r0\().16b, \r1\().16b
        trn1            \t6\().16b, \r2\().16b, \r3\().16b
        trn2            \t7\().16b, \r2\().16b, \r3\().16b

        // Stage 2: transpose 16 bit lanes of the two pair results.
        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
.endm
229
// transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose a 4x4 block of 16 bit elements held in the low 64 bits of
// four vector registers (all accesses use the .4h / .2s arrangements).
//   r0-r3 - vector register names without arrangement suffix; rows 0-3 on
//           entry, transposed rows 0-3 on exit
//   t4-t7 - scratch vector registers, clobbered
.macro  transpose_4x4h  r0, r1, r2, r3, t4, t5, t6, t7
        // Stage 1: transpose 16 bit lanes of row pairs (0,1) and (2,3).
        trn1            \t4\().4h,  \r0\().4h,  \r1\().4h
        trn2            \t5\().4h,  \r0\().4h,  \r1\().4h
        trn1            \t6\().4h,  \r2\().4h,  \r3\().4h
        trn2            \t7\().4h,  \r2\().4h,  \r3\().4h

        // Stage 2: transpose 32 bit lanes of the two pair results.
        trn1            \r0\().2s,  \t4\().2s,  \t6\().2s
        trn2            \r2\().2s,  \t4\().2s,  \t6\().2s
        trn1            \r1\().2s,  \t5\().2s,  \t7\().2s
        trn2            \r3\().2s,  \t5\().2s,  \t7\().2s
.endm
241
// transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose a 4x4 block of 32 bit elements held in four 128 bit vector
// registers.
//   r0-r3 - vector register names without arrangement suffix; rows 0-3 on
//           entry, transposed rows 0-3 on exit
//   t4-t7 - scratch vector registers, clobbered
.macro  transpose_4x4s  r0, r1, r2, r3, t4, t5, t6, t7
        // Stage 1: transpose 32 bit lanes of row pairs (0,1) and (2,3).
        trn1            \t4\().4s,  \r0\().4s,  \r1\().4s
        trn2            \t5\().4s,  \r0\().4s,  \r1\().4s
        trn1            \t6\().4s,  \r2\().4s,  \r3\().4s
        trn2            \t7\().4s,  \r2\().4s,  \r3\().4s

        // Stage 2: combine 64 bit halves into the final rows.
        trn1            \r0\().2d,  \t4\().2d,  \t6\().2d
        trn2            \r2\().2d,  \t4\().2d,  \t6\().2d
        trn1            \r1\().2d,  \t5\().2d,  \t7\().2d
        trn2            \r3\().2d,  \t5\().2d,  \t7\().2d
.endm
253
// transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose 16 bit elements held in four 128 bit registers. The two
// groups of four columns are transposed independently: on exit register
// rN holds input column N in its low 64 bits and column N+4 in its high
// 64 bits.
//   r0-r3 - vector register names without arrangement suffix; rows 0-3 on
//           entry, results on exit
//   t4-t7 - scratch vector registers, clobbered
.macro  transpose_4x8h  r0, r1, r2, r3, t4, t5, t6, t7
        // Stage 1: transpose 16 bit lanes of row pairs (0,1) and (2,3).
        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h

        // Stage 2: transpose 32 bit lanes of the two pair results.
        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
.endm
265
// transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3
//
// Same operation as transpose_4x8h, but the result is written to a
// separate set of registers: on exit register oN holds input column N in
// its low 64 bits and column N+4 in its high 64 bits.
//   r0-r3 - vector registers holding rows 0-3; only read by this macro
//   t4-t7 - scratch vector registers, clobbered
//   o0-o3 - vector registers receiving the result
.macro  transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3
        // Stage 1: transpose 16 bit lanes of row pairs (0,1) and (2,3).
        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h

        // Stage 2: transpose 32 bit lanes into the output registers.
        trn1            \o0\().4s,  \t4\().4s,  \t6\().4s
        trn2            \o2\().4s,  \t4\().4s,  \t6\().4s
        trn1            \o1\().4s,  \t5\().4s,  \t7\().4s
        trn2            \o3\().4s,  \t5\().4s,  \t7\().4s
.endm
277
278#endif /* DAV1D_SRC_ARM_64_UTIL_S */
279