xref: /aosp_15_r20/external/XNNPACK/src/f32-spmm/gen/8x1-minmax-scalar.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-spmm/scalar.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <xnnpack/math.h>
13 #include <xnnpack/spmm.h>
14 
15 
xnn_f32_spmm_minmax_ukernel_8x1__scalar(size_t mc,size_t nc,const float * restrict input,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_f32_spmm_minmax_ukernel_8x1__scalar(
17     size_t mc,
18     size_t nc,
19     const float*restrict input,
20     const float*restrict weights,
21     const int32_t*restrict widx_dmap,
22     const uint32_t*restrict nidx_nnzmap,
23     float*restrict output,
24     size_t output_stride,
25     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26 {
27   assert(mc != 0);
28   assert(mc % sizeof(float) == 0);
29   assert(nc != 0);
30 
31   const float vmin = params->scalar.min;
32   const float vmax = params->scalar.max;
33   size_t output_decrement = output_stride * nc - 8 * sizeof(float);
34   while (mc >= 8 * sizeof(float)) {
35     const float*restrict w = weights;
36     const int32_t* dmap = widx_dmap;
37     const uint32_t* nnzmap = nidx_nnzmap;
38     size_t n = nc;
39     while (n >= 1) {
40       uint32_t nnz = *nnzmap++;
41       float vacc0x0 = *w++;
42       float vacc1x0 = vacc0x0;
43       float vacc2x0 = vacc0x0;
44       float vacc3x0 = vacc0x0;
45       float vacc4x0 = vacc0x0;
46       float vacc5x0 = vacc0x0;
47       float vacc6x0 = vacc0x0;
48       float vacc7x0 = vacc0x0;
49       if XNN_LIKELY(nnz != 0) {
50         do {
51           const intptr_t diff = *dmap++;
52           const float vi0 = input[0];
53           const float vi1 = input[1];
54           const float vi2 = input[2];
55           const float vi3 = input[3];
56           const float vi4 = input[4];
57           const float vi5 = input[5];
58           const float vi6 = input[6];
59           const float vi7 = input[7];
60           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
61           const float vw0 = *w++;
62           vacc0x0 += vi0 * vw0;
63           vacc1x0 += vi1 * vw0;
64           vacc2x0 += vi2 * vw0;
65           vacc3x0 += vi3 * vw0;
66           vacc4x0 += vi4 * vw0;
67           vacc5x0 += vi5 * vw0;
68           vacc6x0 += vi6 * vw0;
69           vacc7x0 += vi7 * vw0;
70         } while (--nnz != 0);
71       }
72       float vout0x0 = math_min_f32(vacc0x0, vmax);
73       float vout1x0 = math_min_f32(vacc1x0, vmax);
74       float vout2x0 = math_min_f32(vacc2x0, vmax);
75       float vout3x0 = math_min_f32(vacc3x0, vmax);
76       float vout4x0 = math_min_f32(vacc4x0, vmax);
77       float vout5x0 = math_min_f32(vacc5x0, vmax);
78       float vout6x0 = math_min_f32(vacc6x0, vmax);
79       float vout7x0 = math_min_f32(vacc7x0, vmax);
80       vout0x0 = math_max_f32(vout0x0, vmin);
81       vout1x0 = math_max_f32(vout1x0, vmin);
82       vout2x0 = math_max_f32(vout2x0, vmin);
83       vout3x0 = math_max_f32(vout3x0, vmin);
84       vout4x0 = math_max_f32(vout4x0, vmin);
85       vout5x0 = math_max_f32(vout5x0, vmin);
86       vout6x0 = math_max_f32(vout6x0, vmin);
87       vout7x0 = math_max_f32(vout7x0, vmin);
88       output[0] = vout0x0;
89       output[1] = vout1x0;
90       output[2] = vout2x0;
91       output[3] = vout3x0;
92       output[4] = vout4x0;
93       output[5] = vout5x0;
94       output[6] = vout6x0;
95       output[7] = vout7x0;
96       output[0] = vout0x0;
97       output[1] = vout1x0;
98       output[2] = vout2x0;
99       output[3] = vout3x0;
100       output[4] = vout4x0;
101       output[5] = vout5x0;
102       output[6] = vout6x0;
103       output[7] = vout7x0;
104       output = (float*restrict) ((uintptr_t) output + output_stride);
105       n -= 1;
106     }
107     if XNN_UNLIKELY(n != 0) {
108       do {
109         uint32_t nnz = *nnzmap++;
110         float vacc0 = *w++;
111         float vacc1 = vacc0;
112         float vacc2 = vacc0;
113         float vacc3 = vacc0;
114         float vacc4 = vacc0;
115         float vacc5 = vacc0;
116         float vacc6 = vacc0;
117         float vacc7 = vacc0;
118         if XNN_LIKELY(nnz != 0) {
119           do {
120             const intptr_t diff = *dmap++;
121             const float vi0 = input[0];
122             const float vi1 = input[1];
123             const float vi2 = input[2];
124             const float vi3 = input[3];
125             const float vi4 = input[4];
126             const float vi5 = input[5];
127             const float vi6 = input[6];
128             const float vi7 = input[7];
129             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
130             const float vw = *w++;
131             vacc0 += vi0 * vw;
132             vacc1 += vi1 * vw;
133             vacc2 += vi2 * vw;
134             vacc3 += vi3 * vw;
135             vacc4 += vi4 * vw;
136             vacc5 += vi5 * vw;
137             vacc6 += vi6 * vw;
138             vacc7 += vi7 * vw;
139           } while (--nnz != 0);
140         }
141         float vout0 = math_min_f32(vacc0, vmax);
142         float vout1 = math_min_f32(vacc1, vmax);
143         float vout2 = math_min_f32(vacc2, vmax);
144         float vout3 = math_min_f32(vacc3, vmax);
145         float vout4 = math_min_f32(vacc4, vmax);
146         float vout5 = math_min_f32(vacc5, vmax);
147         float vout6 = math_min_f32(vacc6, vmax);
148         float vout7 = math_min_f32(vacc7, vmax);
149         vout0 = math_max_f32(vout0, vmin);
150         vout1 = math_max_f32(vout1, vmin);
151         vout2 = math_max_f32(vout2, vmin);
152         vout3 = math_max_f32(vout3, vmin);
153         vout4 = math_max_f32(vout4, vmin);
154         vout5 = math_max_f32(vout5, vmin);
155         vout6 = math_max_f32(vout6, vmin);
156         vout7 = math_max_f32(vout7, vmin);
157         output[0] = vout0;
158         output[1] = vout1;
159         output[2] = vout2;
160         output[3] = vout3;
161         output[4] = vout4;
162         output[5] = vout5;
163         output[6] = vout6;
164         output[7] = vout7;
165         output = (float*restrict) ((uintptr_t) output + output_stride);
166         n -= 1;
167       } while (n != 0);
168     }
169     output = (float*restrict) ((uintptr_t) output - output_decrement);
170     input += 8;
171     mc -= 8 * sizeof(float);
172   }
173   if XNN_UNLIKELY(mc != 0) {
174     output_decrement += 4 * sizeof(float);
175     if (mc & (4 * sizeof(float))) {
176       const float*restrict w = weights;
177       const int32_t* dmap = widx_dmap;
178       const uint32_t* nnzmap = nidx_nnzmap;
179       size_t n = nc;
180       while (n >= 1) {
181         uint32_t nnz = *nnzmap++;
182         float vacc0x0 = *w++;
183         float vacc1x0 = vacc0x0;
184         float vacc2x0 = vacc0x0;
185         float vacc3x0 = vacc0x0;
186         if XNN_LIKELY(nnz != 0) {
187           do {
188             const intptr_t diff = *dmap++;
189             const float vi0 = input[0];
190             const float vi1 = input[1];
191             const float vi2 = input[2];
192             const float vi3 = input[3];
193             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
194             const float vw0 = *w++;
195             vacc0x0 += vi0 * vw0;
196             vacc1x0 += vi1 * vw0;
197             vacc2x0 += vi2 * vw0;
198             vacc3x0 += vi3 * vw0;
199           } while (--nnz != 0);
200         }
201         float vout0x0 = math_min_f32(vacc0x0, vmax);
202         float vout1x0 = math_min_f32(vacc1x0, vmax);
203         float vout2x0 = math_min_f32(vacc2x0, vmax);
204         float vout3x0 = math_min_f32(vacc3x0, vmax);
205         vout0x0 = math_max_f32(vout0x0, vmin);
206         vout1x0 = math_max_f32(vout1x0, vmin);
207         vout2x0 = math_max_f32(vout2x0, vmin);
208         vout3x0 = math_max_f32(vout3x0, vmin);
209         output[0] = vout0x0;
210         output[1] = vout1x0;
211         output[2] = vout2x0;
212         output[3] = vout3x0;
213         output = (float*restrict) ((uintptr_t) output + output_stride);
214         n -= 1;
215       }
216       if XNN_UNLIKELY(n != 0) {
217         do {
218           uint32_t nnz = *nnzmap++;
219           float vacc0 = *w++;
220           float vacc1 = vacc0;
221           float vacc2 = vacc0;
222           float vacc3 = vacc0;
223           if XNN_LIKELY(nnz != 0) {
224             do {
225               const intptr_t diff = *dmap++;
226               const float vi0 = input[0];
227               const float vi1 = input[1];
228               const float vi2 = input[2];
229               const float vi3 = input[3];
230               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
231               const float vw = *w++;
232               vacc0 += vi0 * vw;
233               vacc1 += vi1 * vw;
234               vacc2 += vi2 * vw;
235               vacc3 += vi3 * vw;
236             } while (--nnz != 0);
237           }
238           float vout0 = math_min_f32(vacc0, vmax);
239           float vout1 = math_min_f32(vacc1, vmax);
240           float vout2 = math_min_f32(vacc2, vmax);
241           float vout3 = math_min_f32(vacc3, vmax);
242           vout0 = math_max_f32(vout0, vmin);
243           vout1 = math_max_f32(vout1, vmin);
244           vout2 = math_max_f32(vout2, vmin);
245           vout3 = math_max_f32(vout3, vmin);
246           output[0] = vout0;
247           output[1] = vout1;
248           output[2] = vout2;
249           output[3] = vout3;
250           output = (float*restrict) ((uintptr_t) output + output_stride);
251           n -= 1;
252         } while (n != 0);
253       }
254       output = (float*restrict) ((uintptr_t) output - output_decrement);
255       input += 4;
256     }
257     output_decrement += 2 * sizeof(float);
258     if (mc & (2 * sizeof(float))) {
259       const float*restrict w = weights;
260       const int32_t* dmap = widx_dmap;
261       const uint32_t* nnzmap = nidx_nnzmap;
262       size_t n = nc;
263       while (n >= 1) {
264         uint32_t nnz = *nnzmap++;
265         float vacc0x0 = *w++;
266         float vacc1x0 = vacc0x0;
267         if XNN_LIKELY(nnz != 0) {
268           do {
269             const intptr_t diff = *dmap++;
270             const float vi0 = input[0];
271             const float vi1 = input[1];
272             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
273             const float vw0 = *w++;
274             vacc0x0 += vi0 * vw0;
275             vacc1x0 += vi1 * vw0;
276           } while (--nnz != 0);
277         }
278         float vout0x0 = math_min_f32(vacc0x0, vmax);
279         float vout1x0 = math_min_f32(vacc1x0, vmax);
280         vout0x0 = math_max_f32(vout0x0, vmin);
281         vout1x0 = math_max_f32(vout1x0, vmin);
282         output[0] = vout0x0;
283         output[1] = vout1x0;
284         output = (float*restrict) ((uintptr_t) output + output_stride);
285         n -= 1;
286       }
287       if XNN_UNLIKELY(n != 0) {
288         do {
289           uint32_t nnz = *nnzmap++;
290           float vacc0 = *w++;
291           float vacc1 = vacc0;
292           if XNN_LIKELY(nnz != 0) {
293             do {
294               const intptr_t diff = *dmap++;
295               const float vi0 = input[0];
296               const float vi1 = input[1];
297               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
298               const float vw = *w++;
299               vacc0 += vi0 * vw;
300               vacc1 += vi1 * vw;
301             } while (--nnz != 0);
302           }
303           float vout0 = math_min_f32(vacc0, vmax);
304           float vout1 = math_min_f32(vacc1, vmax);
305           vout0 = math_max_f32(vout0, vmin);
306           vout1 = math_max_f32(vout1, vmin);
307           output[0] = vout0;
308           output[1] = vout1;
309           output = (float*restrict) ((uintptr_t) output + output_stride);
310           n -= 1;
311         } while (n != 0);
312       }
313       output = (float*restrict) ((uintptr_t) output - output_decrement);
314       input += 2;
315     }
316     output_decrement += 1 * sizeof(float);
317     if (mc & (1 * sizeof(float))) {
318       const float*restrict w = weights;
319       const int32_t* dmap = widx_dmap;
320       const uint32_t* nnzmap = nidx_nnzmap;
321       size_t n = nc;
322       while (n >= 1) {
323         uint32_t nnz = *nnzmap++;
324         float vacc0x0 = *w++;
325         if XNN_LIKELY(nnz != 0) {
326           do {
327             const intptr_t diff = *dmap++;
328             const float vi0 = input[0];
329             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
330             const float vw0 = *w++;
331             vacc0x0 += vi0 * vw0;
332           } while (--nnz != 0);
333         }
334         float vout0x0 = math_min_f32(vacc0x0, vmax);
335         vout0x0 = math_max_f32(vout0x0, vmin);
336         output[0] = vout0x0;
337         output = (float*restrict) ((uintptr_t) output + output_stride);
338         n -= 1;
339       }
340       if XNN_UNLIKELY(n != 0) {
341         do {
342           uint32_t nnz = *nnzmap++;
343           float vacc0 = *w++;
344           if XNN_LIKELY(nnz != 0) {
345             do {
346               const intptr_t diff = *dmap++;
347               const float vi0 = input[0];
348               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
349               const float vw = *w++;
350               vacc0 += vi0 * vw;
351             } while (--nnz != 0);
352           }
353           float vout0 = math_min_f32(vacc0, vmax);
354           vout0 = math_max_f32(vout0, vmin);
355           output[0] = vout0;
356           output = (float*restrict) ((uintptr_t) output + output_stride);
357           n -= 1;
358         } while (n != 0);
359       }
360       output = (float*restrict) ((uintptr_t) output - output_decrement);
361       input += 1;
362     }
363   }
364 }
365