/aosp_15_r20/external/libdav1d/src/arm/64/ |
H A D | mc16.S | 29 #include "src/arm/asm.S" 30 #include "util.S" 35 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 36 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 37 sqadd \t0\().8h, \t0\().8h, \t2\().8h 38 sqadd \t1\().8h, \t1\().8h, \t3\().8h 39 smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits 40 smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits 41 sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits 42 sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits [all …]
|
H A D | looprestoration_tmpl.S | 28 #include "src/arm/asm.S" 40 stp d8, d9, [sp, #-0x40]! 56 movi v30.8h, #3 57 movi v31.4s, #3 59 ld1 {v0.8h, v1.8h}, [x10], #32 60 ld1 {v2.8h, v3.8h}, [x11], #32 61 ld1 {v4.8h, v5.8h}, [x12], #32 62 ld1 {v6.8h, v7.8h}, [x4], #32 63 ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 64 ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48 [all …]
|
H A D | ipred16.S | 28 #include "src/arm/asm.S" 29 #include "util.S" 42 dup v0.8h, w8 46 urshr v0.8h, v0.8h, #1 60 8: 61 st1 {v0.8h}, [x0], x1 62 st1 {v0.8h}, [x6], x1 64 st1 {v0.8h}, [x0], x1 65 st1 {v0.8h}, [x6], x1 66 b.gt 8b [all …]
|
H A D | itx16.S | 28 #include "src/arm/asm.S" 29 #include "util.S" 36 // x0-x3 external parameters 42 // x9-x12 scratch variables for helper functions 48 // v0-v1 multiplication coefficients 49 // v2-v7 scratch registers 50 // v8-v15 unused 51 // v16-v31 inputs/outputs of transforms 55 .int 2896, 2896*8*(1<<16), 1567, 3784 69 .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) [all …]
|
H A D | looprestoration16.S | 28 #include "src/arm/asm.S" 29 #include "util.S" 50 // const int16_t filter[2][8], 56 stp x29, x30, [sp, #-32]! 59 ld1 {v0.8h, v1.8h}, [x6] 63 dup v28.8h, w8 // bitdepth_max 65 movi v30.4s, #1 66 sub w10, w8, #38 // -(bitdepth + 6) 68 sub w8, w8, #25 // -round_bits_h 70 neg w11, w11 // -round_bits_v [all …]
|
/aosp_15_r20/external/tensorflow/tensorflow/lite/kernels/internal/optimized/ |
H A D | depthwiseconv_uint8_3x3_filter.h | 7 http://www.apache.org/licenses/LICENSE-2.0 46 #define STR(s) STR_UNEXPANDED(s) argument 47 #define STR_UNEXPANDED(s) #s argument 50 // Jetson TX-2. This compiler does not support the offsetof() macro. 55 // 4 8-bit lanes together. So these are treated much like 32-bit loads and 56 // 32-bit stores. Stores require 32-bit alignment. 67 // reinterpret_cast. Sanitizers may fail silently on lane-loading, with an 68 // obscure bug or mis-feature probably in unhygienic macro expansion. 83 #define OFFSET_INPUT_ROW_SIZE 8 154 // Dot product ops hard-coded [all …]
|
/aosp_15_r20/external/renderscript-intrinsics-replacement-toolkit/renderscript-toolkit/src/main/cpp/ |
H A D | ColorMatrix_advsimd.S | 8 * http://www.apache.org/licenses/LICENSE-2.0 18 #define END(f) .size f, .-f; 23 .if (\i) & (\mask - 1) 33 .if (\i) & (\mask - 1) 43 .if (\i) & (\mask - 1 + 16) 53 .if (\i) & (\mask - 1 + 16) 72 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 77 dup v6.4s, v4.s[0] 78 dup v7.4s, v4.s[0] 80 vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0] [all …]
|
/aosp_15_r20/frameworks/rs/toolkit/ |
H A D | ColorMatrix_advsimd.S | 8 * http://www.apache.org/licenses/LICENSE-2.0 18 #define END(f) .size f, .-f; 23 .if (\i) & (\mask - 1) 33 .if (\i) & (\mask - 1) 43 .if (\i) & (\mask - 1 + 16) 53 .if (\i) & (\mask - 1 + 16) 72 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 77 dup v6.4s, v4.s[0] 78 dup v7.4s, v4.s[0] 80 vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0] [all …]
|
/aosp_15_r20/frameworks/rs/cpu_ref/ |
H A D | rsCpuIntrinsics_advsimd_ColorMatrix.S | 8 * http://www.apache.org/licenses/LICENSE-2.0 18 #define END(f) .size f, .-f; 23 .if (\i) & (\mask - 1) 33 .if (\i) & (\mask - 1) 43 .if (\i) & (\mask - 1 + 16) 53 .if (\i) & (\mask - 1 + 16) 72 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 77 dup v6.4s, v4.s[0] 78 dup v7.4s, v4.s[0] 80 vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0] [all …]
|
/aosp_15_r20/external/tensorflow/tensorflow/lite/kernels/internal/optimized/integer_ops/ |
H A D | depthwise_conv_3x3_filter.h | 7 http://www.apache.org/licenses/LICENSE-2.0 29 #define STR(s) STR_UNEXPANDED(s) argument 30 #define STR_UNEXPANDED(s) #s argument 33 // Jetson TX-2. This compiler does not support the offsetof() macro. 41 #define OFFSET_INPUT_ROW_SIZE 8 109 struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1, 121 const int64_t output_height_increment = 2 * params_ptr->output_row_size; 122 TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0); 131 #define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8" 138 // |output_window_height| and |output_window_width|. The inner-most loop [all …]
|
H A D | depthwise_conv_hybrid_3x3_filter.h | 7 http://www.apache.org/licenses/LICENSE-2.0 29 #define STR(s) STR_UNEXPANDED(s) argument 30 #define STR_UNEXPANDED(s) #s argument 33 // Jetson TX-2. This compiler does not support the offsetof() macro. 41 #define OFFSET_INPUT_ROW_SIZE 8 127 8, 1, 1> { 139 const int64_t output_height_increment = 2 * 4 * params_ptr->output_row_size; 140 TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0); 149 #define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8" 156 // |output_window_height| and |output_window_width|. The inner-most loop [all …]
|
/aosp_15_r20/external/llvm/test/MC/AArch64/ |
H A D | neon-3vdiff.s | 1 // RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+crypto -mattr=+neon -show-encoding < %s | Fi… 5 //------------------------------------------------------------------------------ 7 //------------------------------------------------------------------------------ 9 //------------------------------------------------------------------------------ 11 //------------------------------------------------------------------------------ 13 //------------------------------------------------------------------------------ 14 // Long - Variant 1 15 //------------------------------------------------------------------------------ 17 saddl v0.8h, v1.8b, v2.8b 18 saddl v0.4s, v1.4h, v2.4h [all …]
|
/aosp_15_r20/prebuilts/vndk/v33/x86_64/arch-x86-x86_64/shared/vndk-core/ |
D | libpcre2.so | ... . - - - - - - - - - - - 8 8 8 8 8 8 8 8 8 |
/aosp_15_r20/external/ruy/ruy/ |
H A D | kernel_arm64.cc | 7 http://www.apache.org/licenses/LICENSE-2.0 36 #define RUY_OFFSET_LHS_SUMS 8 93 // Fast-int8-trick kernel, similar to this production gemmlowp kernel: 95 …gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L2… 97 // Relevant target CPUs for this kernel include ARM Cortex-A73 and Cortex-A75, 98 // since these are 64-bit, out-of-order and without dotprod support. 115 // v16 -- v31 are int32 accumulators. in Kernel8bitNeon() 116 // During accumulation, v0 -- v3 are used to load int8 data from LHS and in Kernel8bitNeon() 117 // v4 -- v7 from RHS: in Kernel8bitNeon() 120 // /-----------------------------------------| in Kernel8bitNeon() [all …]
|
/aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/ |
H A D | 8x8-aarch64-neon.S | 5 * This source code is licensed under the BSD-style license found in the 10 #include <requantization/runtime-assembly.h> 12 # Args passed via 8 registers (64 bytes) 25 # |-----------| 27 # |params | 8 28 # |-----------| 43 … # https://developer.arm.com/docs/ihi0055/d/procedure-call-standard-for-the-arm-64-bit-architecture 44 # Callee need to save 8-15 vector registers and only the lower 64 bits of each. 49 STP d15, d14, [sp, -16] 52 # And go to the a_zero_point with post-index [all …]
|
H A D | 8x8-dq-aarch64-neon.S | 5 * This source code is licensed under the BSD-style license found in the 10 #include <requantization/runtime-assembly.h> 17 # |-----------| 19 # |out ch indx| 8 21 # |-----------| 37 STP d15, d14, [sp, -16] 38 STP d13, d12, [sp, -32] 39 STP d11, d10, [sp, -48] 40 STP d9, d8, [sp, -64] 48 LDR x10, [sp, 8] [all …]
|
/aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/ |
H A D | 8x8-aarch64-neon.S | 5 * This source code is licensed under the BSD-style license found in the 10 #include <requantization/runtime-assembly.h> 13 # Args passed via 8 registers (64 bytes) 26 # |-----------| 28 # |params | 8 29 # |-----------| 49 STP d15, d14, [sp, -16] 50 STP d13, d12, [sp, -32] 51 STP d11, d10, [sp, -48] 52 STP d9, d8, [sp, -64] [all …]
|
/aosp_15_r20/external/XNNPACK/src/qs8-gemm/ |
H A D | 2x8c8-aarch64-neon-mlal-cortex-a53.S.in | 3 // This source code is licensed under the BSD-style license found in the 22 # size_t cn_stride, [sp] -> x10 23 # const union ${PARAMS_UNION} params) [sp + 8] -> x11 25 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS. 42 STP d8, d9, [sp, -80]! 57 SUBS x0, x2, 16 // k = kc - 16 58 LDP s16, s18, [x5], 8 61 LDP s20, s22, [x5], 8 64 LDP s24, s26, [x5], 8 67 LDP s28, s30, [x5], 8 [all …]
|
/aosp_15_r20/external/XNNPACK/src/qs8-igemm/ |
H A D | 2x8c8-aarch64-neon-mlal-cortex-a53.S.in | 3 // This source code is licensed under the BSD-style license found in the 23 # size_t cn_stride, [sp] -> x10 24 # size_t a_offset, [sp + 8] -> x8 25 # const int8_t* zero, [sp + 16] -> x12 26 # const union ${PARAMS_UNION} params [sp + 24] -> x11 28 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS. 48 STP d8, d9, [sp, -80]! 60 LDP s16, s18, [x5], 8 63 LDP s20, s22, [x5], 8 66 LDP s24, s26, [x5], 8 [all …]
|
/aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/ |
H A D | 4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S | 1 // Auto-generated file. Do not edit! 2 // Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in 7 // This source code is licensed under the BSD-style license found in the 22 # size_t cn_stride, [sp] -> x10 23 # size_t a_offset, [sp + 8] -> x8 24 # const int8_t* zero, [sp + 16] -> x12 25 # const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11) 27 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS. 40 # x11, x21 temp for Cortex-A53 loads 56 STP x20, x21, [sp, -16]! // Save x20-x21 on stack [all …]
|
H A D | 4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S | 1 // Auto-generated file. Do not edit! 2 // Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in 7 // This source code is licensed under the BSD-style license found in the 22 # size_t cn_stride, [sp] -> x10 23 # size_t a_offset, [sp + 8] -> x8 24 # const int8_t* zero, [sp + 16] -> x12 25 # const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11) 27 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS. 40 # x11, x21 temp for Cortex-A53 loads 56 STP x20, x21, [sp, -16]! // Save x20-x21 on stack [all …]
|
H A D | 4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S | 1 // Auto-generated file. Do not edit! 2 // Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in 7 // This source code is licensed under the BSD-style license found in the 22 # size_t cn_stride, [sp] -> x10 23 # size_t a_offset, [sp + 8] -> x8 24 # const int8_t* zero, [sp + 16] -> x12 25 # const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11) 27 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS. 40 # x11, x21 temp for Cortex-A53 loads 56 STP x20, x21, [sp, -16]! // Save x20-x21 on stack [all …]
|
/aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/ |
H A D | 2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S | 1 // Auto-generated file. Do not edit! 2 // Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in 7 // This source code is licensed under the BSD-style license found in the 22 # size_t cn_stride, [sp] -> x10 23 # const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11 25 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS. 42 STP d8, d9, [sp, -80]! 57 SUBS x0, x2, 16 // k = kc - 16 58 LDP s16, s18, [x5], 8 61 LDP s20, s22, [x5], 8 [all …]
|
H A D | 2x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S | 1 // Auto-generated file. Do not edit! 2 // Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in 7 // This source code is licensed under the BSD-style license found in the 22 # size_t cn_stride, [sp] -> x10 23 # const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11 25 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS. 42 STP d8, d9, [sp, -80]! 57 SUBS x0, x2, 16 // k = kc - 16 58 LDP s16, s18, [x5], 8 61 LDP s20, s22, [x5], 8 [all …]
|
/aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/ |
H A D | 4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S | 1 // Auto-generated file. Do not edit! 2 // Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in 7 // This source code is licensed under the BSD-style license found in the 22 # size_t cn_stride, [sp] -> x10 23 # size_t a_offset, [sp + 8] -> x8 24 # const int8_t* zero, [sp + 16] -> x12 25 # const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11) 27 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS. 40 # x11, x21 temp for Cortex-A53 loads 56 STP x20, x21, [sp, -16]! // Save x20-x21 on stack [all …]
|