/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2015 Martin Storsjo
 * Copyright © 2015 Janne Grunau
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#ifndef DAV1D_SRC_ARM_64_UTIL_S
#define DAV1D_SRC_ARM_64_UTIL_S

#include "config.h"
#include "src/arm/asm.S"

// Compilers without __has_feature() see every feature query as "not present".
#ifndef __has_feature
#define __has_feature(x) 0
#endif

// movrel rd, val, offset=0
//
// Load the address (\val + \offset) into the general purpose register \rd.
//   rd      destination register
//   val     symbol whose address is wanted
//   offset  constant byte offset added to the symbol (default 0)
//
// Apple, PIC and Windows-PIC builds use a PC-relative adrp/add pair; all
// other builds load the absolute address from the literal pool. No register
// other than \rd is written.
.macro  movrel rd, val, offset=0
#if defined(__APPLE__)
  .if (\offset) < 0
        // A negative offset is kept out of the relocation expression and
        // subtracted afterwards.
        // NOTE(review): presumably the object format cannot express a
        // negative addend here - confirm.
        adrp            \rd, \val@PAGE
        add             \rd, \rd, \val@PAGEOFF
        sub             \rd, \rd, -(\offset)
  .else
        adrp            \rd, \val+(\offset)@PAGE
        add             \rd, \rd, \val+(\offset)@PAGEOFF
  .endif
#elif defined(PIC) && defined(_WIN32)
  .if (\offset) < 0
        // Same split as in the Apple branch: symbol first, offset afterwards.
        adrp            \rd, \val
        add             \rd, \rd, :lo12:\val
        sub             \rd, \rd, -(\offset)
  .else
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
  .endif
#elif __has_feature(hwaddress_sanitizer)
        adrp            \rd, :pg_hi21_nc:\val+(\offset)
        // NOTE(review): the :prel_g3: movk patches the topmost 16 bits of
        // \rd, presumably to carry the HWASan pointer tag - confirm. \offset
        // is intentionally not part of this operand in the original code.
        movk            \rd, #:prel_g3:\val+0x100000000
        add             \rd, \rd, :lo12:\val+(\offset)
#elif defined(PIC)
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
#else
        // Non-PIC: absolute address via the literal pool. \offset is
        // parenthesized like in every other branch so that an expression
        // argument cannot re-associate with the '+'.
        ldr             \rd, =\val+(\offset)
#endif
.endm

// sub_sp space
//
// Decrement sp by the assemble-time constant \space (in bytes).
//   space   number of bytes to reserve; on Windows at most 8192
//
// Windows: reservations above 4096 bytes first step down 4096 bytes and load
// from that address before applying the remainder, so the intermediate page
// is touched; x16 is used as scratch in that case.
// NOTE(review): presumably required by Windows stack guard-page probing -
// confirm against the platform ABI.
//
// Elsewhere: the amount is split into a multiple of 4096 and a remainder
// below 4096, each applied with its own sub; a zero part emits nothing.
.macro sub_sp space
#ifdef _WIN32
.if (\space) > 8192
        // Here, we'd need to touch two (or more) pages while decrementing
        // the stack pointer.
        .error          "sub_sp doesn't support values over 8K at the moment"
.elseif (\space) > 4096
        sub             x16, sp,  #4096
        ldr             xzr, [x16]              // touch the intermediate page
        sub             sp,  x16, #(\space - 4096)
.else
        sub             sp,  sp,  #\space
.endif
#else
.if (\space) >= 4096
        sub             sp,  sp,  #(\space)/4096*4096
.endif
.if ((\space) % 4096) != 0
        sub             sp,  sp,  #(\space)%4096
.endif
#endif
.endm

// transpose_8x8b_xtl r0-r7, xtl
//
// Transpose an 8x8 byte matrix and widen the result to 16-bit lanes.
//   r0-r7   in:  one row of 8 bytes in the low 64 bits of each register
//                (the upper 64 bits are not used)
//           out: transposed row N in rN, as 8 halfwords
//   xtl     widening instruction, e.g. uxtl or sxtl; the macro appends "2"
//           to obtain the upper-half form
//
// No registers other than r0-r7 are written. In the lane comments the input
// rows r0..r7 are called a..h.
.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
        // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
        zip1            \r0\().16b, \r0\().16b, \r1\().16b
        // c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7
        zip1            \r2\().16b, \r2\().16b, \r3\().16b
        // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
        zip1            \r4\().16b, \r4\().16b, \r5\().16b
        // g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7
        zip1            \r6\().16b, \r6\().16b, \r7\().16b

        // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
        trn1            \r1\().8h,  \r0\().8h,  \r2\().8h
        // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
        trn2            \r3\().8h,  \r0\().8h,  \r2\().8h
        // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
        trn1            \r5\().8h,  \r4\().8h,  \r6\().8h
        // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
        trn2            \r7\().8h,  \r4\().8h,  \r6\().8h

        // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
        trn1            \r0\().4s,  \r1\().4s,  \r5\().4s
        // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
        trn2            \r2\().4s,  \r1\().4s,  \r5\().4s
        // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
        trn1            \r1\().4s,  \r3\().4s,  \r7\().4s
        // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
        trn2            \r3\().4s,  \r3\().4s,  \r7\().4s

        // Each of r0-r3 now holds two transposed rows (N in the low half,
        // N+4 in the high half). Widen the high half first, because the
        // low-half widening overwrites its own source register.
        \xtl\()2        \r4\().8h,  \r0\().16b
        \xtl            \r0\().8h,  \r0\().8b
        \xtl\()2        \r6\().8h,  \r2\().16b
        \xtl            \r2\().8h,  \r2\().8b
        \xtl\()2        \r5\().8h,  \r1\().16b
        \xtl            \r1\().8h,  \r1\().8b
        \xtl\()2        \r7\().8h,  \r3\().16b
        \xtl            \r3\().8h,  \r3\().8b
.endm

// transpose_8x8h r0-r7, t8, t9
//
// In-place transpose of an 8x8 matrix of 16-bit elements.
//   r0-r7   in:  rows 0-7, eight halfwords each
//           out: rN holds what was column N of the input
//   t8, t9  temporaries, overwritten
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
        // Stage 1: interleave 16-bit lanes of neighbouring row pairs.
        trn1            \t8\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t9\().8h,  \r0\().8h,  \r1\().8h
        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h

        // Stage 2: interleave 32-bit lanes.
        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
        trn1            \r5\().4s,  \t9\().4s,  \r3\().4s
        trn2            \t9\().4s,  \t9\().4s,  \r3\().4s
        trn1            \r3\().4s,  \t8\().4s,  \r1\().4s
        trn2            \t8\().4s,  \t8\().4s,  \r1\().4s

        // Stage 3: interleave 64-bit lanes. The order matters: r2 is read
        // by the r6 write before it is overwritten itself.
        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d
        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d
        trn2            \r6\().2d,  \t8\().2d,  \r2\().2d
        trn1            \r2\().2d,  \t8\().2d,  \r2\().2d
        trn1            \r3\().2d,  \t9\().2d,  \r7\().2d
        trn2            \r7\().2d,  \t9\().2d,  \r7\().2d
.endm

// transpose_8x8h_mov r0-r7, t8, t9, o0-o7
//
// Same transpose as transpose_8x8h, but the result is written to o0-o7.
//   r0-r7   in: rows 0-7, eight halfwords each; overwritten with
//           intermediate values
//   t8, t9  temporaries, overwritten
//   o0-o7   out: oN holds what was column N of the input
.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7
        // Stage 1: interleave 16-bit lanes of neighbouring row pairs.
        trn1            \t8\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t9\().8h,  \r0\().8h,  \r1\().8h
        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h

        // Stage 2: interleave 32-bit lanes.
        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
        trn1            \r5\().4s,  \t9\().4s,  \r3\().4s
        trn2            \t9\().4s,  \t9\().4s,  \r3\().4s
        trn1            \r3\().4s,  \t8\().4s,  \r1\().4s
        trn2            \t8\().4s,  \t8\().4s,  \r1\().4s

        // Stage 3: interleave 64-bit lanes straight into the outputs.
        trn1            \o0\().2d,  \r3\().2d,  \r4\().2d
        trn2            \o4\().2d,  \r3\().2d,  \r4\().2d
        trn1            \o1\().2d,  \r5\().2d,  \r6\().2d
        trn2            \o5\().2d,  \r5\().2d,  \r6\().2d
        trn2            \o6\().2d,  \t8\().2d,  \r2\().2d
        trn1            \o2\().2d,  \t8\().2d,  \r2\().2d
        trn1            \o3\().2d,  \t9\().2d,  \r7\().2d
        trn2            \o7\().2d,  \t9\().2d,  \r7\().2d
.endm

// transpose_8x16b r0-r7, t8, t9
//
// In-place transpose of two 8x8 byte matrices held side by side in eight
// 16-byte registers.
//   r0-r7   in:  rows 0-7, sixteen bytes each
//           out: rN holds input column N in its low 8 bytes and input
//                column N+8 in its high 8 bytes
//   t8, t9  temporaries, overwritten
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
        // Stage 1: interleave bytes of neighbouring row pairs.
        trn1            \t8\().16b, \r0\().16b, \r1\().16b
        trn2            \t9\().16b, \r0\().16b, \r1\().16b
        trn1            \r1\().16b, \r2\().16b, \r3\().16b
        trn2            \r3\().16b, \r2\().16b, \r3\().16b
        trn1            \r0\().16b, \r4\().16b, \r5\().16b
        trn2            \r5\().16b, \r4\().16b, \r5\().16b
        trn1            \r2\().16b, \r6\().16b, \r7\().16b
        trn2            \r7\().16b, \r6\().16b, \r7\().16b

        // Stage 2: interleave 16-bit lanes.
        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
        trn1            \r5\().8h,  \t9\().8h,  \r3\().8h
        trn2            \t9\().8h,  \t9\().8h,  \r3\().8h
        trn1            \r3\().8h,  \t8\().8h,  \r1\().8h
        trn2            \t8\().8h,  \t8\().8h,  \r1\().8h

        // Stage 3: interleave 32-bit lanes. The order matters: r2 is read
        // by the r6 write before it is overwritten itself.
        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s
        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s
        trn2            \r6\().4s,  \t8\().4s,  \r2\().4s
        trn1            \r2\().4s,  \t8\().4s,  \r2\().4s
        trn1            \r3\().4s,  \t9\().4s,  \r7\().4s
        trn2            \r7\().4s,  \t9\().4s,  \r7\().4s
.endm

// transpose_4x16b r0-r3, t4-t7
//
// In-place transpose of four 4x4 byte matrices, one per 32-bit lane group.
//   r0-r3   in:  rows 0-3, sixteen bytes each
//           out: rN holds, for each k in {0, 4, 8, 12}, the four bytes that
//                were at position N+k of rows 0-3
//   t4-t7   temporaries, overwritten
.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().16b, \r0\().16b, \r1\().16b
        trn2            \t5\().16b, \r0\().16b, \r1\().16b
        trn1            \t6\().16b, \r2\().16b, \r3\().16b
        trn2            \t7\().16b, \r2\().16b, \r3\().16b

        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
.endm

// transpose_4x4h r0-r3, t4-t7
//
// In-place transpose of a 4x4 matrix of 16-bit elements; only the 64-bit
// (.4h / .2s) register views are used.
//   r0-r3   in:  rows 0-3, four halfwords each
//           out: rN holds what was column N of the input
//   t4-t7   temporaries, overwritten
.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().4h,  \r0\().4h,  \r1\().4h
        trn2            \t5\().4h,  \r0\().4h,  \r1\().4h
        trn1            \t6\().4h,  \r2\().4h,  \r3\().4h
        trn2            \t7\().4h,  \r2\().4h,  \r3\().4h

        trn1            \r0\().2s,  \t4\().2s,  \t6\().2s
        trn2            \r2\().2s,  \t4\().2s,  \t6\().2s
        trn1            \r1\().2s,  \t5\().2s,  \t7\().2s
        trn2            \r3\().2s,  \t5\().2s,  \t7\().2s
.endm

// transpose_4x4s r0-r3, t4-t7
//
// In-place transpose of a 4x4 matrix of 32-bit elements.
//   r0-r3   in:  rows 0-3, four words each
//           out: rN holds what was column N of the input
//   t4-t7   temporaries, overwritten
.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().4s,  \r0\().4s,  \r1\().4s
        trn2            \t5\().4s,  \r0\().4s,  \r1\().4s
        trn1            \t6\().4s,  \r2\().4s,  \r3\().4s
        trn2            \t7\().4s,  \r2\().4s,  \r3\().4s

        trn1            \r0\().2d,  \t4\().2d,  \t6\().2d
        trn2            \r2\().2d,  \t4\().2d,  \t6\().2d
        trn1            \r1\().2d,  \t5\().2d,  \t7\().2d
        trn2            \r3\().2d,  \t5\().2d,  \t7\().2d
.endm

// transpose_4x8h r0-r3, t4-t7
//
// In-place transpose of two 4x4 matrices of 16-bit elements, one per 64-bit
// register half.
//   r0-r3   in:  rows 0-3, eight halfwords each
//           out: rN holds elements N of rows 0-3 in its low half and
//                elements N+4 of rows 0-3 in its high half
//   t4-t7   temporaries, overwritten
.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h

        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
.endm

// transpose_4x8h_mov r0-r3, t4-t7, o0-o3
//
// Same transpose as transpose_4x8h, but the result is written to o0-o3; the
// macro itself only reads r0-r3.
//   r0-r3   in: rows 0-3, eight halfwords each
//   t4-t7   temporaries, overwritten
//   o0-o3   out: oN holds elements N of rows 0-3 in its low half and
//           elements N+4 of rows 0-3 in its high half
.macro transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3
        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h

        trn1            \o0\().4s,  \t4\().4s,  \t6\().4s
        trn2            \o2\().4s,  \t4\().4s,  \t6\().4s
        trn1            \o1\().4s,  \t5\().4s,  \t7\().4s
        trn2            \o3\().4s,  \t5\().4s,  \t7\().4s
.endm

#endif /* DAV1D_SRC_ARM_64_UTIL_S */