xref: /aosp_15_r20/external/XNNPACK/src/f32-vrelu/wasm_shr_x4.S (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1# Copyright 2020 Google LLC
2#
3# This source code is licensed under the BSD-style license found in the
4# LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
# void xnn_f32_vrelu_ukernel__wasm32_shr_x4(
#     size_t n,             0  byte count; consumed 16, then 4, bytes at a time
#     const float* x,       1  source pointer
#     float* y,             2  destination pointer
#     const union params)   3  unused
#
# Every 32-bit element is handled as raw integer bits v and replaced by
#     result = ((v >> 31) - 1) & v        (>> is a logical shift)
# The shift isolates the sign bit (0 or 1). Subtracting 1 turns that into an
# all-ones mask when the sign bit is clear and an all-zero mask when it is set.
# Elements with the sign bit set are therefore stored as all-zero bits, and
# every other element is stored unchanged.
#
# The byte count is compared as an unsigned value, matching its size_t type.
# Trailing bytes beyond the last whole 4-byte element are left untouched.

# locals (i32, each holding the raw bits of one element)
#     value0                4
#     value1                5
#     value2                6
#     value3                7

BEGIN_FUNCTION  xnn_f32_vrelu_ukernel__wasm32_shr_x4
    .functype   xnn_f32_vrelu_ukernel__wasm32_shr_x4 (i32, i32, i32, i32) -> ()
    .local      i32, i32, i32, i32

    # Main loop: four elements (16 bytes) per iteration.
    local.get    0
    i32.const    16
    i32.ge_u              # count >= 16, unsigned because count is a size_t
    if
      loop
        # All four loads are issued before any store.
        local.get    1
        i32.load     0        # value0 = src[0]
        local.set    4
        local.get    1
        i32.load     4        # value1 = src[1]
        local.set    5
        local.get    1
        i32.load     8        # value2 = src[2]
        local.set    6
        local.get    1
        i32.load     12       # value3 = src[3]
        local.set    7

        local.get    4        # value0 = ((value0 >> 31) - 1) & value0
        i32.const    31
        i32.shr_u             # sign bit as 0 or 1
        i32.const    -1
        i32.add               # 0 -> all ones, 1 -> all zeros
        local.get    4
        i32.and
        local.set    4

        local.get    5        # value1 = ((value1 >> 31) - 1) & value1
        i32.const    31
        i32.shr_u
        i32.const    -1
        i32.add
        local.get    5
        i32.and
        local.set    5

        local.get    6        # value2 = ((value2 >> 31) - 1) & value2
        i32.const    31
        i32.shr_u
        i32.const    -1
        i32.add
        local.get    6
        i32.and
        local.set    6

        local.get    7        # value3 = ((value3 >> 31) - 1) & value3
        i32.const    31
        i32.shr_u
        i32.const    -1
        i32.add
        local.get    7
        i32.and
        local.set    7

        local.get    2
        local.get    4
        i32.store    0        # dst[0] = value0
        local.get    2
        local.get    5
        i32.store    4        # dst[1] = value1
        local.get    2
        local.get    6
        i32.store    8        # dst[2] = value2
        local.get    2
        local.get    7
        i32.store    12       # dst[3] = value3

        local.get    2        # dst += 16
        i32.const    16
        i32.add
        local.set    2

        local.get    1        # src += 16
        i32.const    16
        i32.add
        local.set    1

        local.get    0        # count -= 16 (cannot wrap: count >= 16 here)
        i32.const    -16
        i32.add
        local.set    0

        local.get    0
        i32.const    16
        i32.ge_u              # count >= 16, unsigned
        br_if        0        # repeat main loop
      end_loop
    end_if

    # Remainder loop: one element (4 bytes) per iteration.
    local.get    0
    i32.const    4
    i32.ge_u              # count >= 4, unsigned
    if
      loop
        local.get    1
        i32.load     0        # value0 = src[0]
        local.set    4

        local.get    1        # src += 4
        i32.const    4
        i32.add
        local.set    1

        local.get    2        # store address is pushed first, then the value
        local.get    4        # ((value0 >> 31) - 1) & value0
        i32.const    31
        i32.shr_u             # sign bit as 0 or 1
        i32.const    -1
        i32.add               # 0 -> all ones, 1 -> all zeros
        local.get    4
        i32.and
        i32.store    0        # dst[0] = masked value0

        local.get    2        # dst += 4
        i32.const    4
        i32.add
        local.set    2

        local.get    0        # count -= 4 (cannot wrap: count >= 4 here)
        i32.const    -4
        i32.add
        local.set    0

        local.get    0
        i32.const    4
        i32.ge_u              # count >= 4, unsigned
        br_if        0        # repeat remainder loop
      end_loop
    end_if
END_FUNCTION xnn_f32_vrelu_ukernel__wasm32_shr_x4
182