Home
last modified time | relevance | path

Searched +full:- +full:8 +full:s (Results 1 – 25 of 1690) sorted by relevance

12345678910>>...68

/aosp_15_r20/external/libdav1d/src/arm/64/
H A Dmc16.S29 #include "src/arm/asm.S"
30 #include "util.S"
35 ld1 {\t0\().8h,\t1\().8h}, [x2], 32
36 ld1 {\t2\().8h,\t3\().8h}, [x3], 32
37 sqadd \t0\().8h, \t0\().8h, \t2\().8h
38 sqadd \t1\().8h, \t1\().8h, \t3\().8h
39 smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
40 smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
41 sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
42 sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
[all …]
H A Dlooprestoration_tmpl.S28 #include "src/arm/asm.S"
40 stp d8, d9, [sp, #-0x40]!
56 movi v30.8h, #3
57 movi v31.4s, #3
59 ld1 {v0.8h, v1.8h}, [x10], #32
60 ld1 {v2.8h, v3.8h}, [x11], #32
61 ld1 {v4.8h, v5.8h}, [x12], #32
62 ld1 {v6.8h, v7.8h}, [x4], #32
63 ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
64 ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
[all …]
H A Dipred16.S28 #include "src/arm/asm.S"
29 #include "util.S"
42 dup v0.8h, w8
46 urshr v0.8h, v0.8h, #1
60 8:
61 st1 {v0.8h}, [x0], x1
62 st1 {v0.8h}, [x6], x1
64 st1 {v0.8h}, [x0], x1
65 st1 {v0.8h}, [x6], x1
66 b.gt 8b
[all …]
H A Ditx16.S28 #include "src/arm/asm.S"
29 #include "util.S"
36 // x0-x3 external parameters
42 // x9-x12 scratch variables for helper functions
48 // v0-v1 multiplication coefficients
49 // v2-v7 scratch registers
50 // v8-v15 unused
51 // v16-v31 inputs/outputs of transforms
55 .int 2896, 2896*8*(1<<16), 1567, 3784
69 .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
[all …]
H A Dlooprestoration16.S28 #include "src/arm/asm.S"
29 #include "util.S"
50 // const int16_t filter[2][8],
56 stp x29, x30, [sp, #-32]!
59 ld1 {v0.8h, v1.8h}, [x6]
63 dup v28.8h, w8 // bitdepth_max
65 movi v30.4s, #1
66 sub w10, w8, #38 // -(bitdepth + 6)
68 sub w8, w8, #25 // -round_bits_h
70 neg w11, w11 // -round_bits_v
[all …]
/aosp_15_r20/external/tensorflow/tensorflow/lite/kernels/internal/optimized/
H A Ddepthwiseconv_uint8_3x3_filter.h7 http://www.apache.org/licenses/LICENSE-2.0
46 #define STR(s) STR_UNEXPANDED(s) argument
47 #define STR_UNEXPANDED(s) #s argument
50 // Jetson TX-2. This compiler does not support the offsetof() macro.
55 // 4 8-bit lanes together. So these are treated much like 32-bit loads and
56 // 32-bit stores. Stores require 32-bit alignment.
67 // reinterpret_cast. Sanitizers may fail silently on lane-loading, with an
68 // obscure bug or mis-feature probably in unhygienic macro expansion.
83 #define OFFSET_INPUT_ROW_SIZE 8
154 // Dot product ops hard-coded
[all …]
/aosp_15_r20/external/renderscript-intrinsics-replacement-toolkit/renderscript-toolkit/src/main/cpp/
H A DColorMatrix_advsimd.S8 * http://www.apache.org/licenses/LICENSE-2.0
18 #define END(f) .size f, .-f;
23 .if (\i) & (\mask - 1)
33 .if (\i) & (\mask - 1)
43 .if (\i) & (\mask - 1 + 16)
53 .if (\i) & (\mask - 1 + 16)
72 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
77 dup v6.4s, v4.s[0]
78 dup v7.4s, v4.s[0]
80 vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0]
[all …]
/aosp_15_r20/frameworks/rs/toolkit/
H A DColorMatrix_advsimd.S8 * http://www.apache.org/licenses/LICENSE-2.0
18 #define END(f) .size f, .-f;
23 .if (\i) & (\mask - 1)
33 .if (\i) & (\mask - 1)
43 .if (\i) & (\mask - 1 + 16)
53 .if (\i) & (\mask - 1 + 16)
72 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
77 dup v6.4s, v4.s[0]
78 dup v7.4s, v4.s[0]
80 vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0]
[all …]
/aosp_15_r20/frameworks/rs/cpu_ref/
H A DrsCpuIntrinsics_advsimd_ColorMatrix.S8 * http://www.apache.org/licenses/LICENSE-2.0
18 #define END(f) .size f, .-f;
23 .if (\i) & (\mask - 1)
33 .if (\i) & (\mask - 1)
43 .if (\i) & (\mask - 1 + 16)
53 .if (\i) & (\mask - 1 + 16)
72 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
77 dup v6.4s, v4.s[0]
78 dup v7.4s, v4.s[0]
80 vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0]
[all …]
/aosp_15_r20/external/tensorflow/tensorflow/lite/kernels/internal/optimized/integer_ops/
H A Ddepthwise_conv_3x3_filter.h7 http://www.apache.org/licenses/LICENSE-2.0
29 #define STR(s) STR_UNEXPANDED(s) argument
30 #define STR_UNEXPANDED(s) #s argument
33 // Jetson TX-2. This compiler does not support the offsetof() macro.
41 #define OFFSET_INPUT_ROW_SIZE 8
109 struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
121 const int64_t output_height_increment = 2 * params_ptr->output_row_size;
122 TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
131 #define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8"
138 // |output_window_height| and |output_window_width|. The inner-most loop
[all …]
H A Ddepthwise_conv_hybrid_3x3_filter.h7 http://www.apache.org/licenses/LICENSE-2.0
29 #define STR(s) STR_UNEXPANDED(s) argument
30 #define STR_UNEXPANDED(s) #s argument
33 // Jetson TX-2. This compiler does not support the offsetof() macro.
41 #define OFFSET_INPUT_ROW_SIZE 8
127 8, 1, 1> {
139 const int64_t output_height_increment = 2 * 4 * params_ptr->output_row_size;
140 TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
149 #define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8"
156 // |output_window_height| and |output_window_width|. The inner-most loop
[all …]
/aosp_15_r20/external/llvm/test/MC/AArch64/
H A Dneon-3vdiff.s1 // RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+crypto -mattr=+neon -show-encoding < %s | Fi…
5 //------------------------------------------------------------------------------
7 //------------------------------------------------------------------------------
9 //------------------------------------------------------------------------------
11 //------------------------------------------------------------------------------
13 //------------------------------------------------------------------------------
14 // Long - Variant 1
15 //------------------------------------------------------------------------------
17 saddl v0.8h, v1.8b, v2.8b
18 saddl v0.4s, v1.4h, v2.4h
[all …]
/aosp_15_r20/prebuilts/vndk/v33/x86_64/arch-x86-x86_64/shared/vndk-core/
Dlibpcre2.so ... . - - - - - - - - - - - 8 8 8 8 8 8 8 8 8
/aosp_15_r20/external/ruy/ruy/
H A Dkernel_arm64.cc7 http://www.apache.org/licenses/LICENSE-2.0
36 #define RUY_OFFSET_LHS_SUMS 8
93 // Fast-int8-trick kernel, similar to this production gemmlowp kernel:
95 …gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L2…
97 // Relevant target CPUs for this kernel include ARM Cortex-A73 and Cortex-A75,
98 // since these are 64-bit, out-of-order and without dotprod support.
115 // v16 -- v31 are int32 accumulators. in Kernel8bitNeon()
116 // During accumulation, v0 -- v3 are used to load int8 data from LHS and in Kernel8bitNeon()
117 // v4 -- v7 from RHS: in Kernel8bitNeon()
120 // /-----------------------------------------| in Kernel8bitNeon()
[all …]
/aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/
H A D8x8-aarch64-neon.S5 * This source code is licensed under the BSD-style license found in the
10 #include <requantization/runtime-assembly.h>
12 # Args passed via 8 registers (64 bytes)
25 # |-----------|
27 # |params | 8
28 # |-----------|
43 … # https://developer.arm.com/docs/ihi0055/d/procedure-call-standard-for-the-arm-64-bit-architecture
44 # Callee need to save 8-15 vector registers and only the lower 64 bits of each.
49 STP d15, d14, [sp, -16]
52 # And go to the a_zero_point with post-index
[all …]
H A D8x8-dq-aarch64-neon.S5 * This source code is licensed under the BSD-style license found in the
10 #include <requantization/runtime-assembly.h>
17 # |-----------|
19 # |out ch indx| 8
21 # |-----------|
37 STP d15, d14, [sp, -16]
38 STP d13, d12, [sp, -32]
39 STP d11, d10, [sp, -48]
40 STP d9, d8, [sp, -64]
48 LDR x10, [sp, 8]
[all …]
/aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/
H A D8x8-aarch64-neon.S5 * This source code is licensed under the BSD-style license found in the
10 #include <requantization/runtime-assembly.h>
13 # Args passed via 8 registers (64 bytes)
26 # |-----------|
28 # |params | 8
29 # |-----------|
49 STP d15, d14, [sp, -16]
50 STP d13, d12, [sp, -32]
51 STP d11, d10, [sp, -48]
52 STP d9, d8, [sp, -64]
[all …]
/aosp_15_r20/external/XNNPACK/src/qs8-gemm/
H A D2x8c8-aarch64-neon-mlal-cortex-a53.S.in3 // This source code is licensed under the BSD-style license found in the
22 # size_t cn_stride, [sp] -> x10
23 # const union ${PARAMS_UNION} params) [sp + 8] -> x11
25 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
42 STP d8, d9, [sp, -80]!
57 SUBS x0, x2, 16 // k = kc - 16
58 LDP s16, s18, [x5], 8
61 LDP s20, s22, [x5], 8
64 LDP s24, s26, [x5], 8
67 LDP s28, s30, [x5], 8
[all …]
/aosp_15_r20/external/XNNPACK/src/qs8-igemm/
H A D2x8c8-aarch64-neon-mlal-cortex-a53.S.in3 // This source code is licensed under the BSD-style license found in the
23 # size_t cn_stride, [sp] -> x10
24 # size_t a_offset, [sp + 8] -> x8
25 # const int8_t* zero, [sp + 16] -> x12
26 # const union ${PARAMS_UNION} params [sp + 24] -> x11
28 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
48 STP d8, d9, [sp, -80]!
60 LDP s16, s18, [x5], 8
63 LDP s20, s22, [x5], 8
66 LDP s24, s26, [x5], 8
[all …]
/aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/
H A D4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
7 // This source code is licensed under the BSD-style license found in the
22 # size_t cn_stride, [sp] -> x10
23 # size_t a_offset, [sp + 8] -> x8
24 # const int8_t* zero, [sp + 16] -> x12
25 # const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
27 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
40 # x11, x21 temp for Cortex-A53 loads
56 STP x20, x21, [sp, -16]! // Save x20-x21 on stack
[all …]
H A D4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
7 // This source code is licensed under the BSD-style license found in the
22 # size_t cn_stride, [sp] -> x10
23 # size_t a_offset, [sp + 8] -> x8
24 # const int8_t* zero, [sp + 16] -> x12
25 # const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
27 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
40 # x11, x21 temp for Cortex-A53 loads
56 STP x20, x21, [sp, -16]! // Save x20-x21 on stack
[all …]
H A D4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
7 // This source code is licensed under the BSD-style license found in the
22 # size_t cn_stride, [sp] -> x10
23 # size_t a_offset, [sp + 8] -> x8
24 # const int8_t* zero, [sp + 16] -> x12
25 # const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
27 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
40 # x11, x21 temp for Cortex-A53 loads
56 STP x20, x21, [sp, -16]! // Save x20-x21 on stack
[all …]
/aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/
H A D2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
7 // This source code is licensed under the BSD-style license found in the
22 # size_t cn_stride, [sp] -> x10
23 # const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
25 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
42 STP d8, d9, [sp, -80]!
57 SUBS x0, x2, 16 // k = kc - 16
58 LDP s16, s18, [x5], 8
61 LDP s20, s22, [x5], 8
[all …]
H A D2x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gemm/2x8c8-aarch64-neon-mlal-cortex-a53.S.in
7 // This source code is licensed under the BSD-style license found in the
22 # size_t cn_stride, [sp] -> x10
23 # const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
25 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
42 STP d8, d9, [sp, -80]!
57 SUBS x0, x2, 16 // k = kc - 16
58 LDP s16, s18, [x5], 8
61 LDP s20, s22, [x5], 8
[all …]
/aosp_15_r20/external/XNNPACK/src/qc8-igemm/gen/
H A D4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
7 // This source code is licensed under the BSD-style license found in the
22 # size_t cn_stride, [sp] -> x10
23 # size_t a_offset, [sp + 8] -> x8
24 # const int8_t* zero, [sp + 16] -> x12
25 # const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
27 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
40 # x11, x21 temp for Cortex-A53 loads
56 STP x20, x21, [sp, -16]! // Save x20-x21 on stack
[all …]

12345678910>>...68