xref: /aosp_15_r20/external/XNNPACK/src/packing.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8*4bdc9457SAndroid Build Coastguard Worker 
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <fp16.h>

#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/unaligned.h>
19*4bdc9457SAndroid Build Coastguard Worker 
20*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,size_t extra_bytes,const void * params)21*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_gemm_goi_w(
22*4bdc9457SAndroid Build Coastguard Worker   size_t g,
23*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
24*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
25*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
26*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
27*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
28*4bdc9457SAndroid Build Coastguard Worker   const float* k,
29*4bdc9457SAndroid Build Coastguard Worker   const float* b,
30*4bdc9457SAndroid Build Coastguard Worker   float* packed_w,
31*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
32*4bdc9457SAndroid Build Coastguard Worker   const void* params)
33*4bdc9457SAndroid Build Coastguard Worker {
34*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
35*4bdc9457SAndroid Build Coastguard Worker 
36*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
37*4bdc9457SAndroid Build Coastguard Worker   do {
38*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
39*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
40*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
41*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
42*4bdc9457SAndroid Build Coastguard Worker           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
43*4bdc9457SAndroid Build Coastguard Worker         }
44*4bdc9457SAndroid Build Coastguard Worker       }
45*4bdc9457SAndroid Build Coastguard Worker       packed_w += nr;
46*4bdc9457SAndroid Build Coastguard Worker 
47*4bdc9457SAndroid Build Coastguard Worker       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
48*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
49*4bdc9457SAndroid Build Coastguard Worker           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
50*4bdc9457SAndroid Build Coastguard Worker             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
51*4bdc9457SAndroid Build Coastguard Worker             if (kc_idx < kc) {
52*4bdc9457SAndroid Build Coastguard Worker               packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
53*4bdc9457SAndroid Build Coastguard Worker             }
54*4bdc9457SAndroid Build Coastguard Worker           }
55*4bdc9457SAndroid Build Coastguard Worker           packed_w += kr;
56*4bdc9457SAndroid Build Coastguard Worker         }
57*4bdc9457SAndroid Build Coastguard Worker         packed_w += (nr - nr_block_size) * kr;
58*4bdc9457SAndroid Build Coastguard Worker       }
59*4bdc9457SAndroid Build Coastguard Worker       packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
60*4bdc9457SAndroid Build Coastguard Worker     }
61*4bdc9457SAndroid Build Coastguard Worker     k += nc * kc;
62*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
63*4bdc9457SAndroid Build Coastguard Worker       b += nc;
64*4bdc9457SAndroid Build Coastguard Worker     }
65*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
66*4bdc9457SAndroid Build Coastguard Worker }
67*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f16_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,size_t extra_bytes,const void * params)68*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f16_gemm_goi_w(
69*4bdc9457SAndroid Build Coastguard Worker   size_t g,
70*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
71*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
72*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
73*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
74*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
75*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* k,
76*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* b,
77*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
78*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
79*4bdc9457SAndroid Build Coastguard Worker   const void* params)
80*4bdc9457SAndroid Build Coastguard Worker {
81*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
82*4bdc9457SAndroid Build Coastguard Worker 
83*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
84*4bdc9457SAndroid Build Coastguard Worker   do {
85*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
86*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
87*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
88*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
89*4bdc9457SAndroid Build Coastguard Worker           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
90*4bdc9457SAndroid Build Coastguard Worker         }
91*4bdc9457SAndroid Build Coastguard Worker       }
92*4bdc9457SAndroid Build Coastguard Worker       packed_w += nr;
93*4bdc9457SAndroid Build Coastguard Worker 
94*4bdc9457SAndroid Build Coastguard Worker       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
95*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
96*4bdc9457SAndroid Build Coastguard Worker           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
97*4bdc9457SAndroid Build Coastguard Worker             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
98*4bdc9457SAndroid Build Coastguard Worker             if (kc_idx < kc) {
99*4bdc9457SAndroid Build Coastguard Worker               packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
100*4bdc9457SAndroid Build Coastguard Worker             }
101*4bdc9457SAndroid Build Coastguard Worker           }
102*4bdc9457SAndroid Build Coastguard Worker           packed_w += kr;
103*4bdc9457SAndroid Build Coastguard Worker         }
104*4bdc9457SAndroid Build Coastguard Worker         packed_w += (nr - nr_block_size) * kr;
105*4bdc9457SAndroid Build Coastguard Worker       }
106*4bdc9457SAndroid Build Coastguard Worker       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
107*4bdc9457SAndroid Build Coastguard Worker     }
108*4bdc9457SAndroid Build Coastguard Worker     k += nc * kc;
109*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
110*4bdc9457SAndroid Build Coastguard Worker       b += nc;
111*4bdc9457SAndroid Build Coastguard Worker     }
112*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
113*4bdc9457SAndroid Build Coastguard Worker }
114*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_to_f16_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,uint16_t * packed_w,size_t extra_bytes,const void * params)115*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_to_f16_gemm_goi_w(
116*4bdc9457SAndroid Build Coastguard Worker   size_t g,
117*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
118*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
119*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
120*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
121*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
122*4bdc9457SAndroid Build Coastguard Worker   const float* k,
123*4bdc9457SAndroid Build Coastguard Worker   const float* b,
124*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
125*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
126*4bdc9457SAndroid Build Coastguard Worker   const void* params)
127*4bdc9457SAndroid Build Coastguard Worker {
128*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
129*4bdc9457SAndroid Build Coastguard Worker 
130*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
131*4bdc9457SAndroid Build Coastguard Worker   do {
132*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
133*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
134*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
135*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
136*4bdc9457SAndroid Build Coastguard Worker           packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
137*4bdc9457SAndroid Build Coastguard Worker         }
138*4bdc9457SAndroid Build Coastguard Worker       }
139*4bdc9457SAndroid Build Coastguard Worker       packed_w += nr;
140*4bdc9457SAndroid Build Coastguard Worker 
141*4bdc9457SAndroid Build Coastguard Worker       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
142*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
143*4bdc9457SAndroid Build Coastguard Worker           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
144*4bdc9457SAndroid Build Coastguard Worker             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
145*4bdc9457SAndroid Build Coastguard Worker             if (kc_idx < kc) {
146*4bdc9457SAndroid Build Coastguard Worker               packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[(nr_block_start + nr_block_offset) * kc + kc_idx]);
147*4bdc9457SAndroid Build Coastguard Worker             }
148*4bdc9457SAndroid Build Coastguard Worker           }
149*4bdc9457SAndroid Build Coastguard Worker           packed_w += kr;
150*4bdc9457SAndroid Build Coastguard Worker         }
151*4bdc9457SAndroid Build Coastguard Worker         packed_w += (nr - nr_block_size) * kr;
152*4bdc9457SAndroid Build Coastguard Worker       }
153*4bdc9457SAndroid Build Coastguard Worker       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
154*4bdc9457SAndroid Build Coastguard Worker     }
155*4bdc9457SAndroid Build Coastguard Worker     k += nc * kc;
156*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
157*4bdc9457SAndroid Build Coastguard Worker       b += nc;
158*4bdc9457SAndroid Build Coastguard Worker     }
159*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
160*4bdc9457SAndroid Build Coastguard Worker }
161*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qu8_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qu8_packing_params * params)162*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qu8_gemm_goi_w(
163*4bdc9457SAndroid Build Coastguard Worker   size_t g,
164*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
165*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
166*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
167*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
168*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
169*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* k,
170*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
171*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
172*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
173*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qu8_packing_params* params)
174*4bdc9457SAndroid Build Coastguard Worker {
175*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
176*4bdc9457SAndroid Build Coastguard Worker 
177*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
178*4bdc9457SAndroid Build Coastguard Worker   const int32_t izp = (int32_t) params->input_zero_point;
179*4bdc9457SAndroid Build Coastguard Worker   const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
180*4bdc9457SAndroid Build Coastguard Worker   do {
181*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
182*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
183*4bdc9457SAndroid Build Coastguard Worker       int32_t* packed_b = (int32_t*) packed_w;
184*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
185*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
186*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
187*4bdc9457SAndroid Build Coastguard Worker           packed_w = (int32_t*) packed_w + 1;
188*4bdc9457SAndroid Build Coastguard Worker         }
189*4bdc9457SAndroid Build Coastguard Worker       } else {
190*4bdc9457SAndroid Build Coastguard Worker         size_t n = nr_block_size;
191*4bdc9457SAndroid Build Coastguard Worker         do {
192*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, bzp);
193*4bdc9457SAndroid Build Coastguard Worker           packed_w = (int32_t*) packed_w + 1;
194*4bdc9457SAndroid Build Coastguard Worker         } while (--n != 0);
195*4bdc9457SAndroid Build Coastguard Worker       }
196*4bdc9457SAndroid Build Coastguard Worker       packed_w = (int32_t*) packed_w + (nr - nr_block_size);
197*4bdc9457SAndroid Build Coastguard Worker 
198*4bdc9457SAndroid Build Coastguard Worker       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
199*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
200*4bdc9457SAndroid Build Coastguard Worker           int32_t ksum = 0;
201*4bdc9457SAndroid Build Coastguard Worker           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
202*4bdc9457SAndroid Build Coastguard Worker             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
203*4bdc9457SAndroid Build Coastguard Worker             if (kc_idx < kc) {
204*4bdc9457SAndroid Build Coastguard Worker               const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
205*4bdc9457SAndroid Build Coastguard Worker               ksum += (int32_t) kv;
206*4bdc9457SAndroid Build Coastguard Worker               ((uint8_t*) packed_w)[kr_block_offset] = kv;
207*4bdc9457SAndroid Build Coastguard Worker             }
208*4bdc9457SAndroid Build Coastguard Worker           }
209*4bdc9457SAndroid Build Coastguard Worker           unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
210*4bdc9457SAndroid Build Coastguard Worker           packed_w = (uint8_t*) packed_w + kr;
211*4bdc9457SAndroid Build Coastguard Worker         }
212*4bdc9457SAndroid Build Coastguard Worker         packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
213*4bdc9457SAndroid Build Coastguard Worker       }
214*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
215*4bdc9457SAndroid Build Coastguard Worker     }
216*4bdc9457SAndroid Build Coastguard Worker     k += nc * kc;
217*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
218*4bdc9457SAndroid Build Coastguard Worker       b += nc;
219*4bdc9457SAndroid Build Coastguard Worker     }
220*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
221*4bdc9457SAndroid Build Coastguard Worker }
222*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qs8_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qs8_packing_params * params)223*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qs8_gemm_goi_w(
224*4bdc9457SAndroid Build Coastguard Worker   size_t g,
225*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
226*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
227*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
228*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
229*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
230*4bdc9457SAndroid Build Coastguard Worker   const int8_t* k,
231*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
232*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
233*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
234*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qs8_packing_params* params)
235*4bdc9457SAndroid Build Coastguard Worker {
236*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
237*4bdc9457SAndroid Build Coastguard Worker 
238*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
239*4bdc9457SAndroid Build Coastguard Worker   const uint32_t izp = (uint32_t) params->input_zero_point;
240*4bdc9457SAndroid Build Coastguard Worker   do {
241*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
242*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
243*4bdc9457SAndroid Build Coastguard Worker       int32_t* packed_b = (int32_t*) packed_w;
244*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
245*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
246*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
247*4bdc9457SAndroid Build Coastguard Worker           packed_w = (int32_t*) packed_w + 1;
248*4bdc9457SAndroid Build Coastguard Worker         }
249*4bdc9457SAndroid Build Coastguard Worker       } else {
250*4bdc9457SAndroid Build Coastguard Worker         size_t n = nr_block_size;
251*4bdc9457SAndroid Build Coastguard Worker         do {
252*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, 0);
253*4bdc9457SAndroid Build Coastguard Worker           packed_w = (int32_t*) packed_w + 1;
254*4bdc9457SAndroid Build Coastguard Worker         } while (--n != 0);
255*4bdc9457SAndroid Build Coastguard Worker       }
256*4bdc9457SAndroid Build Coastguard Worker       packed_w = (int32_t*) packed_w + (nr - nr_block_size);
257*4bdc9457SAndroid Build Coastguard Worker 
258*4bdc9457SAndroid Build Coastguard Worker       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
259*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
260*4bdc9457SAndroid Build Coastguard Worker           uint32_t ksum = 0;
261*4bdc9457SAndroid Build Coastguard Worker           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
262*4bdc9457SAndroid Build Coastguard Worker             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
263*4bdc9457SAndroid Build Coastguard Worker             if (kc_idx < kc) {
264*4bdc9457SAndroid Build Coastguard Worker               const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
265*4bdc9457SAndroid Build Coastguard Worker               ksum += (uint32_t) kv;
266*4bdc9457SAndroid Build Coastguard Worker               ((int8_t*) packed_w)[kr_block_offset] = kv;
267*4bdc9457SAndroid Build Coastguard Worker             }
268*4bdc9457SAndroid Build Coastguard Worker           }
269*4bdc9457SAndroid Build Coastguard Worker           unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
270*4bdc9457SAndroid Build Coastguard Worker           packed_w = (int8_t*) packed_w + kr;
271*4bdc9457SAndroid Build Coastguard Worker         }
272*4bdc9457SAndroid Build Coastguard Worker         packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
273*4bdc9457SAndroid Build Coastguard Worker       }
274*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
275*4bdc9457SAndroid Build Coastguard Worker     }
276*4bdc9457SAndroid Build Coastguard Worker     k += nc * kc;
277*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
278*4bdc9457SAndroid Build Coastguard Worker       b += nc;
279*4bdc9457SAndroid Build Coastguard Worker     }
280*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
281*4bdc9457SAndroid Build Coastguard Worker }
282*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qs8_gemm_xw_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qs8_packing_params * params)283*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qs8_gemm_xw_goi_w(
284*4bdc9457SAndroid Build Coastguard Worker   size_t g,
285*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
286*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
287*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
288*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
289*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
290*4bdc9457SAndroid Build Coastguard Worker   const int8_t* k,
291*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
292*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
293*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
294*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qs8_packing_params* params)
295*4bdc9457SAndroid Build Coastguard Worker {
296*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
297*4bdc9457SAndroid Build Coastguard Worker 
298*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
299*4bdc9457SAndroid Build Coastguard Worker   const uint32_t izp = (uint32_t) params->input_zero_point;
300*4bdc9457SAndroid Build Coastguard Worker   do {
301*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
302*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
303*4bdc9457SAndroid Build Coastguard Worker       int32_t* packed_b = (int32_t*) packed_w;
304*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
305*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
306*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
307*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
308*4bdc9457SAndroid Build Coastguard Worker         }
309*4bdc9457SAndroid Build Coastguard Worker       } else {
310*4bdc9457SAndroid Build Coastguard Worker         size_t n = nr_block_size;
311*4bdc9457SAndroid Build Coastguard Worker         do {
312*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, 0);
313*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
314*4bdc9457SAndroid Build Coastguard Worker         } while (--n != 0);
315*4bdc9457SAndroid Build Coastguard Worker       }
316*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
317*4bdc9457SAndroid Build Coastguard Worker 
318*4bdc9457SAndroid Build Coastguard Worker       for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
319*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
320*4bdc9457SAndroid Build Coastguard Worker           uint32_t ksum = 0;
321*4bdc9457SAndroid Build Coastguard Worker           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
322*4bdc9457SAndroid Build Coastguard Worker             const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
323*4bdc9457SAndroid Build Coastguard Worker             if (kc_idx < kc) {
324*4bdc9457SAndroid Build Coastguard Worker               const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
325*4bdc9457SAndroid Build Coastguard Worker               ksum += (uint32_t) kv;
326*4bdc9457SAndroid Build Coastguard Worker               ((int16_t*) packed_w)[kr_block_offset] = (int16_t) kv;
327*4bdc9457SAndroid Build Coastguard Worker             }
328*4bdc9457SAndroid Build Coastguard Worker           }
329*4bdc9457SAndroid Build Coastguard Worker           unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
330*4bdc9457SAndroid Build Coastguard Worker           packed_w = (int16_t*) packed_w + kr;
331*4bdc9457SAndroid Build Coastguard Worker         }
332*4bdc9457SAndroid Build Coastguard Worker         packed_w = (int16_t*) packed_w + (nr - nr_block_size) * kr;
333*4bdc9457SAndroid Build Coastguard Worker       }
334*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
335*4bdc9457SAndroid Build Coastguard Worker     }
336*4bdc9457SAndroid Build Coastguard Worker     k += nc * kc;
337*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
338*4bdc9457SAndroid Build Coastguard Worker       b += nc;
339*4bdc9457SAndroid Build Coastguard Worker     }
340*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
341*4bdc9457SAndroid Build Coastguard Worker }
342*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,const void * params)343*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_gemm_io_w(
344*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
345*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
346*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
347*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
348*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
349*4bdc9457SAndroid Build Coastguard Worker   const float* k,
350*4bdc9457SAndroid Build Coastguard Worker   const float* b,
351*4bdc9457SAndroid Build Coastguard Worker   float* packed_w,
352*4bdc9457SAndroid Build Coastguard Worker   const void* params)
353*4bdc9457SAndroid Build Coastguard Worker {
354*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
355*4bdc9457SAndroid Build Coastguard Worker 
356*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
357*4bdc9457SAndroid Build Coastguard Worker   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
358*4bdc9457SAndroid Build Coastguard Worker     const size_t nr_block_size = min(nc - nr_block_start, nr);
359*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
360*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
361*4bdc9457SAndroid Build Coastguard Worker         packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
362*4bdc9457SAndroid Build Coastguard Worker       }
363*4bdc9457SAndroid Build Coastguard Worker     }
364*4bdc9457SAndroid Build Coastguard Worker     packed_w += nr;
365*4bdc9457SAndroid Build Coastguard Worker 
366*4bdc9457SAndroid Build Coastguard Worker     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
367*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
368*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
369*4bdc9457SAndroid Build Coastguard Worker           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
370*4bdc9457SAndroid Build Coastguard Worker           if (kc_idx < kc) {
371*4bdc9457SAndroid Build Coastguard Worker             packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
372*4bdc9457SAndroid Build Coastguard Worker           }
373*4bdc9457SAndroid Build Coastguard Worker         }
374*4bdc9457SAndroid Build Coastguard Worker         packed_w += kr;
375*4bdc9457SAndroid Build Coastguard Worker       }
376*4bdc9457SAndroid Build Coastguard Worker       packed_w += (nr - nr_block_size) * kr;
377*4bdc9457SAndroid Build Coastguard Worker     }
378*4bdc9457SAndroid Build Coastguard Worker   }
379*4bdc9457SAndroid Build Coastguard Worker }
380*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f16_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,const void * params)381*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f16_gemm_io_w(
382*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
383*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
384*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
385*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
386*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
387*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* k,
388*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* b,
389*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
390*4bdc9457SAndroid Build Coastguard Worker   const void* params)
391*4bdc9457SAndroid Build Coastguard Worker {
392*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
393*4bdc9457SAndroid Build Coastguard Worker 
394*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
395*4bdc9457SAndroid Build Coastguard Worker   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
396*4bdc9457SAndroid Build Coastguard Worker     const size_t nr_block_size = min(nc - nr_block_start, nr);
397*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
398*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
399*4bdc9457SAndroid Build Coastguard Worker         packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
400*4bdc9457SAndroid Build Coastguard Worker       }
401*4bdc9457SAndroid Build Coastguard Worker     }
402*4bdc9457SAndroid Build Coastguard Worker     packed_w += nr;
403*4bdc9457SAndroid Build Coastguard Worker 
404*4bdc9457SAndroid Build Coastguard Worker     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
405*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
406*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
407*4bdc9457SAndroid Build Coastguard Worker           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
408*4bdc9457SAndroid Build Coastguard Worker           if (kc_idx < kc) {
409*4bdc9457SAndroid Build Coastguard Worker             packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
410*4bdc9457SAndroid Build Coastguard Worker           }
411*4bdc9457SAndroid Build Coastguard Worker         }
412*4bdc9457SAndroid Build Coastguard Worker         packed_w += kr;
413*4bdc9457SAndroid Build Coastguard Worker       }
414*4bdc9457SAndroid Build Coastguard Worker       packed_w += (nr - nr_block_size) * kr;
415*4bdc9457SAndroid Build Coastguard Worker     }
416*4bdc9457SAndroid Build Coastguard Worker   }
417*4bdc9457SAndroid Build Coastguard Worker }
418*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_to_f16_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,uint16_t * packed_w,const void * params)419*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_to_f16_gemm_io_w(
420*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
421*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
422*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
423*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
424*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
425*4bdc9457SAndroid Build Coastguard Worker   const float* k,
426*4bdc9457SAndroid Build Coastguard Worker   const float* b,
427*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
428*4bdc9457SAndroid Build Coastguard Worker   const void* params)
429*4bdc9457SAndroid Build Coastguard Worker {
430*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
431*4bdc9457SAndroid Build Coastguard Worker 
432*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
433*4bdc9457SAndroid Build Coastguard Worker   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
434*4bdc9457SAndroid Build Coastguard Worker     const size_t nr_block_size = min(nc - nr_block_start, nr);
435*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
436*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
437*4bdc9457SAndroid Build Coastguard Worker         packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
438*4bdc9457SAndroid Build Coastguard Worker       }
439*4bdc9457SAndroid Build Coastguard Worker     }
440*4bdc9457SAndroid Build Coastguard Worker     packed_w += nr;
441*4bdc9457SAndroid Build Coastguard Worker 
442*4bdc9457SAndroid Build Coastguard Worker     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
443*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
444*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
445*4bdc9457SAndroid Build Coastguard Worker           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
446*4bdc9457SAndroid Build Coastguard Worker           if (kc_idx < kc) {
447*4bdc9457SAndroid Build Coastguard Worker             packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[kc_idx * nc + nr_block_start + nr_block_offset]);
448*4bdc9457SAndroid Build Coastguard Worker           }
449*4bdc9457SAndroid Build Coastguard Worker         }
450*4bdc9457SAndroid Build Coastguard Worker         packed_w += kr;
451*4bdc9457SAndroid Build Coastguard Worker       }
452*4bdc9457SAndroid Build Coastguard Worker       packed_w += (nr - nr_block_size) * kr;
453*4bdc9457SAndroid Build Coastguard Worker     }
454*4bdc9457SAndroid Build Coastguard Worker   }
455*4bdc9457SAndroid Build Coastguard Worker }
456*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qu8_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,const struct xnn_qu8_packing_params * params)457*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qu8_gemm_io_w(
458*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
459*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
460*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
461*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
462*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
463*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* k,
464*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
465*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
466*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qu8_packing_params* params)
467*4bdc9457SAndroid Build Coastguard Worker {
468*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
469*4bdc9457SAndroid Build Coastguard Worker 
470*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
471*4bdc9457SAndroid Build Coastguard Worker   const int32_t izp = (int32_t) params->input_zero_point;
472*4bdc9457SAndroid Build Coastguard Worker   const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
473*4bdc9457SAndroid Build Coastguard Worker   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
474*4bdc9457SAndroid Build Coastguard Worker     const size_t nr_block_size = min(nc - nr_block_start, nr);
475*4bdc9457SAndroid Build Coastguard Worker     int32_t* packed_b = (int32_t*) packed_w;
476*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
477*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
478*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
479*4bdc9457SAndroid Build Coastguard Worker         packed_w = (int32_t*) packed_w + 1;
480*4bdc9457SAndroid Build Coastguard Worker       }
481*4bdc9457SAndroid Build Coastguard Worker     } else {
482*4bdc9457SAndroid Build Coastguard Worker       size_t n = nr_block_size;
483*4bdc9457SAndroid Build Coastguard Worker       do {
484*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, bzp);
485*4bdc9457SAndroid Build Coastguard Worker         packed_w = (int32_t*) packed_w + 1;
486*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
487*4bdc9457SAndroid Build Coastguard Worker     }
488*4bdc9457SAndroid Build Coastguard Worker     packed_w = (int32_t*) packed_w + (nr - nr_block_size);
489*4bdc9457SAndroid Build Coastguard Worker 
490*4bdc9457SAndroid Build Coastguard Worker     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
491*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
492*4bdc9457SAndroid Build Coastguard Worker         int32_t ksum = 0;
493*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
494*4bdc9457SAndroid Build Coastguard Worker           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
495*4bdc9457SAndroid Build Coastguard Worker           if (kc_idx < kc) {
496*4bdc9457SAndroid Build Coastguard Worker             const uint8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
497*4bdc9457SAndroid Build Coastguard Worker             ksum += (int32_t) kv;
498*4bdc9457SAndroid Build Coastguard Worker             ((uint8_t*) packed_w)[kr_block_offset] = kv;
499*4bdc9457SAndroid Build Coastguard Worker           }
500*4bdc9457SAndroid Build Coastguard Worker         }
501*4bdc9457SAndroid Build Coastguard Worker         unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
502*4bdc9457SAndroid Build Coastguard Worker         packed_w = (uint8_t*) packed_w + kr;
503*4bdc9457SAndroid Build Coastguard Worker       }
504*4bdc9457SAndroid Build Coastguard Worker       packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
505*4bdc9457SAndroid Build Coastguard Worker     }
506*4bdc9457SAndroid Build Coastguard Worker   }
507*4bdc9457SAndroid Build Coastguard Worker }
508*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qs8_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,const struct xnn_qs8_packing_params * params)509*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qs8_gemm_io_w(
510*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
511*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
512*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
513*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
514*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
515*4bdc9457SAndroid Build Coastguard Worker   const int8_t* k,
516*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
517*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
518*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qs8_packing_params* params)
519*4bdc9457SAndroid Build Coastguard Worker {
520*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
521*4bdc9457SAndroid Build Coastguard Worker 
522*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
523*4bdc9457SAndroid Build Coastguard Worker   const uint32_t izp = (uint32_t) params->input_zero_point;
524*4bdc9457SAndroid Build Coastguard Worker   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
525*4bdc9457SAndroid Build Coastguard Worker     const size_t nr_block_size = min(nc - nr_block_start, nr);
526*4bdc9457SAndroid Build Coastguard Worker     int32_t* packed_b = (int32_t*) packed_w;
527*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
528*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
529*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
530*4bdc9457SAndroid Build Coastguard Worker         packed_w = (int32_t*) packed_w + 1;
531*4bdc9457SAndroid Build Coastguard Worker       }
532*4bdc9457SAndroid Build Coastguard Worker     } else {
533*4bdc9457SAndroid Build Coastguard Worker       size_t n = nr_block_size;
534*4bdc9457SAndroid Build Coastguard Worker       do {
535*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, 0);
536*4bdc9457SAndroid Build Coastguard Worker         packed_w = (int32_t*) packed_w + 1;
537*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
538*4bdc9457SAndroid Build Coastguard Worker     }
539*4bdc9457SAndroid Build Coastguard Worker     packed_w = (uint32_t*) packed_w + (nr - nr_block_size);
540*4bdc9457SAndroid Build Coastguard Worker 
541*4bdc9457SAndroid Build Coastguard Worker     for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
542*4bdc9457SAndroid Build Coastguard Worker       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
543*4bdc9457SAndroid Build Coastguard Worker         uint32_t ksum = 0;
544*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
545*4bdc9457SAndroid Build Coastguard Worker           const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
546*4bdc9457SAndroid Build Coastguard Worker           if (kc_idx < kc) {
547*4bdc9457SAndroid Build Coastguard Worker             const int8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
548*4bdc9457SAndroid Build Coastguard Worker             ksum += (uint32_t) kv;
549*4bdc9457SAndroid Build Coastguard Worker             ((int8_t*) packed_w)[kr_block_offset] = kv;
550*4bdc9457SAndroid Build Coastguard Worker           }
551*4bdc9457SAndroid Build Coastguard Worker         }
552*4bdc9457SAndroid Build Coastguard Worker         unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
553*4bdc9457SAndroid Build Coastguard Worker         packed_w = (int8_t*) packed_w + kr;
554*4bdc9457SAndroid Build Coastguard Worker       }
555*4bdc9457SAndroid Build Coastguard Worker       packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
556*4bdc9457SAndroid Build Coastguard Worker     }
557*4bdc9457SAndroid Build Coastguard Worker   }
558*4bdc9457SAndroid Build Coastguard Worker }
559*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,size_t extra_bytes,const void * params)560*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_conv_goki_w(
561*4bdc9457SAndroid Build Coastguard Worker   size_t g,
562*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
563*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
564*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
565*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
566*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
567*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
568*4bdc9457SAndroid Build Coastguard Worker   const float* k,
569*4bdc9457SAndroid Build Coastguard Worker   const float* b,
570*4bdc9457SAndroid Build Coastguard Worker   float* packed_w,
571*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
572*4bdc9457SAndroid Build Coastguard Worker   const void* params)
573*4bdc9457SAndroid Build Coastguard Worker {
574*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
575*4bdc9457SAndroid Build Coastguard Worker 
576*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
577*4bdc9457SAndroid Build Coastguard Worker   do {
578*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
579*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
580*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
581*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
582*4bdc9457SAndroid Build Coastguard Worker           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
583*4bdc9457SAndroid Build Coastguard Worker         }
584*4bdc9457SAndroid Build Coastguard Worker       }
585*4bdc9457SAndroid Build Coastguard Worker       packed_w += nr;
586*4bdc9457SAndroid Build Coastguard Worker 
587*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
588*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
589*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
590*4bdc9457SAndroid Build Coastguard Worker             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
591*4bdc9457SAndroid Build Coastguard Worker               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
592*4bdc9457SAndroid Build Coastguard Worker               if (kc_idx < kc) {
593*4bdc9457SAndroid Build Coastguard Worker                 packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
594*4bdc9457SAndroid Build Coastguard Worker               }
595*4bdc9457SAndroid Build Coastguard Worker             }
596*4bdc9457SAndroid Build Coastguard Worker             packed_w += kr;
597*4bdc9457SAndroid Build Coastguard Worker           }
598*4bdc9457SAndroid Build Coastguard Worker           packed_w += (nr - nr_block_size) * kr;
599*4bdc9457SAndroid Build Coastguard Worker         }
600*4bdc9457SAndroid Build Coastguard Worker       }
601*4bdc9457SAndroid Build Coastguard Worker       packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
602*4bdc9457SAndroid Build Coastguard Worker     }
603*4bdc9457SAndroid Build Coastguard Worker     k += ks * kc * nc;
604*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
605*4bdc9457SAndroid Build Coastguard Worker       b += nc;
606*4bdc9457SAndroid Build Coastguard Worker     }
607*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
608*4bdc9457SAndroid Build Coastguard Worker }
609*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f16_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,size_t extra_bytes,const void * params)610*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f16_conv_goki_w(
611*4bdc9457SAndroid Build Coastguard Worker   size_t g,
612*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
613*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
614*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
615*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
616*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
617*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
618*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* k,
619*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* b,
620*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
621*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
622*4bdc9457SAndroid Build Coastguard Worker   const void* params)
623*4bdc9457SAndroid Build Coastguard Worker {
624*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
625*4bdc9457SAndroid Build Coastguard Worker 
626*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
627*4bdc9457SAndroid Build Coastguard Worker   do {
628*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
629*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
630*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
631*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
632*4bdc9457SAndroid Build Coastguard Worker           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
633*4bdc9457SAndroid Build Coastguard Worker         }
634*4bdc9457SAndroid Build Coastguard Worker       }
635*4bdc9457SAndroid Build Coastguard Worker       packed_w += nr;
636*4bdc9457SAndroid Build Coastguard Worker 
637*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
638*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
639*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
640*4bdc9457SAndroid Build Coastguard Worker             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
641*4bdc9457SAndroid Build Coastguard Worker               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
642*4bdc9457SAndroid Build Coastguard Worker               if (kc_idx < kc) {
643*4bdc9457SAndroid Build Coastguard Worker                 packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
644*4bdc9457SAndroid Build Coastguard Worker               }
645*4bdc9457SAndroid Build Coastguard Worker             }
646*4bdc9457SAndroid Build Coastguard Worker             packed_w += kr;
647*4bdc9457SAndroid Build Coastguard Worker           }
648*4bdc9457SAndroid Build Coastguard Worker           packed_w += (nr - nr_block_size) * kr;
649*4bdc9457SAndroid Build Coastguard Worker         }
650*4bdc9457SAndroid Build Coastguard Worker       }
651*4bdc9457SAndroid Build Coastguard Worker       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
652*4bdc9457SAndroid Build Coastguard Worker     }
653*4bdc9457SAndroid Build Coastguard Worker     k += ks * kc * nc;
654*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
655*4bdc9457SAndroid Build Coastguard Worker       b += nc;
656*4bdc9457SAndroid Build Coastguard Worker     }
657*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
658*4bdc9457SAndroid Build Coastguard Worker }
659*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_to_f16_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,uint16_t * packed_w,size_t extra_bytes,const void * params)660*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_to_f16_conv_goki_w(
661*4bdc9457SAndroid Build Coastguard Worker   size_t g,
662*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
663*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
664*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
665*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
666*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
667*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
668*4bdc9457SAndroid Build Coastguard Worker   const float* k,
669*4bdc9457SAndroid Build Coastguard Worker   const float* b,
670*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
671*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
672*4bdc9457SAndroid Build Coastguard Worker   const void* params)
673*4bdc9457SAndroid Build Coastguard Worker {
674*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
675*4bdc9457SAndroid Build Coastguard Worker 
676*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
677*4bdc9457SAndroid Build Coastguard Worker   do {
678*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
679*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
680*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
681*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
682*4bdc9457SAndroid Build Coastguard Worker           packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
683*4bdc9457SAndroid Build Coastguard Worker         }
684*4bdc9457SAndroid Build Coastguard Worker       }
685*4bdc9457SAndroid Build Coastguard Worker       packed_w += nr;
686*4bdc9457SAndroid Build Coastguard Worker 
687*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
688*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
689*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
690*4bdc9457SAndroid Build Coastguard Worker             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
691*4bdc9457SAndroid Build Coastguard Worker               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
692*4bdc9457SAndroid Build Coastguard Worker               if (kc_idx < kc) {
693*4bdc9457SAndroid Build Coastguard Worker                 packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx]);
694*4bdc9457SAndroid Build Coastguard Worker               }
695*4bdc9457SAndroid Build Coastguard Worker             }
696*4bdc9457SAndroid Build Coastguard Worker             packed_w += kr;
697*4bdc9457SAndroid Build Coastguard Worker           }
698*4bdc9457SAndroid Build Coastguard Worker           packed_w += (nr - nr_block_size) * kr;
699*4bdc9457SAndroid Build Coastguard Worker         }
700*4bdc9457SAndroid Build Coastguard Worker       }
701*4bdc9457SAndroid Build Coastguard Worker       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
702*4bdc9457SAndroid Build Coastguard Worker     }
703*4bdc9457SAndroid Build Coastguard Worker     k += ks * kc * nc;
704*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
705*4bdc9457SAndroid Build Coastguard Worker       b += nc;
706*4bdc9457SAndroid Build Coastguard Worker     }
707*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
708*4bdc9457SAndroid Build Coastguard Worker }
709*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qu8_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qu8_packing_params * params)710*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qu8_conv_goki_w(
711*4bdc9457SAndroid Build Coastguard Worker   size_t g,
712*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
713*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
714*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
715*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
716*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
717*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
718*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* k,
719*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
720*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
721*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
722*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qu8_packing_params* params)
723*4bdc9457SAndroid Build Coastguard Worker {
724*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
725*4bdc9457SAndroid Build Coastguard Worker 
726*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
727*4bdc9457SAndroid Build Coastguard Worker   const int32_t izp = (int32_t) params->input_zero_point;
728*4bdc9457SAndroid Build Coastguard Worker   const int32_t bzp = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
729*4bdc9457SAndroid Build Coastguard Worker   do {
730*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
731*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
732*4bdc9457SAndroid Build Coastguard Worker       int32_t* packed_b = (int32_t*) packed_w;
733*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
734*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
735*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
736*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
737*4bdc9457SAndroid Build Coastguard Worker         }
738*4bdc9457SAndroid Build Coastguard Worker       } else {
739*4bdc9457SAndroid Build Coastguard Worker         size_t n = nr_block_size;
740*4bdc9457SAndroid Build Coastguard Worker         do {
741*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, bzp);
742*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
743*4bdc9457SAndroid Build Coastguard Worker         } while (--n != 0);
744*4bdc9457SAndroid Build Coastguard Worker       }
745*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
746*4bdc9457SAndroid Build Coastguard Worker 
747*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
748*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
749*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
750*4bdc9457SAndroid Build Coastguard Worker             int32_t ksum = 0;
751*4bdc9457SAndroid Build Coastguard Worker             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
752*4bdc9457SAndroid Build Coastguard Worker               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
753*4bdc9457SAndroid Build Coastguard Worker               if (kc_idx < kc) {
754*4bdc9457SAndroid Build Coastguard Worker                 const uint8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
755*4bdc9457SAndroid Build Coastguard Worker                 ksum += (int32_t) kv;
756*4bdc9457SAndroid Build Coastguard Worker                 ((uint8_t*) packed_w)[kr_block_offset] = kv;
757*4bdc9457SAndroid Build Coastguard Worker               }
758*4bdc9457SAndroid Build Coastguard Worker             }
759*4bdc9457SAndroid Build Coastguard Worker             unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
760*4bdc9457SAndroid Build Coastguard Worker             packed_w = (uint8_t*) packed_w + kr;
761*4bdc9457SAndroid Build Coastguard Worker           }
762*4bdc9457SAndroid Build Coastguard Worker           packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
763*4bdc9457SAndroid Build Coastguard Worker         }
764*4bdc9457SAndroid Build Coastguard Worker       }
765*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
766*4bdc9457SAndroid Build Coastguard Worker     }
767*4bdc9457SAndroid Build Coastguard Worker     k += ks * kc * nc;
768*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
769*4bdc9457SAndroid Build Coastguard Worker       b += nc;
770*4bdc9457SAndroid Build Coastguard Worker     }
771*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
772*4bdc9457SAndroid Build Coastguard Worker }
773*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qs8_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qs8_packing_params * params)774*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qs8_conv_goki_w(
775*4bdc9457SAndroid Build Coastguard Worker   size_t g,
776*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
777*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
778*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
779*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
780*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
781*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
782*4bdc9457SAndroid Build Coastguard Worker   const int8_t* k,
783*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
784*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
785*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
786*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qs8_packing_params* params)
787*4bdc9457SAndroid Build Coastguard Worker {
788*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
789*4bdc9457SAndroid Build Coastguard Worker 
790*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
791*4bdc9457SAndroid Build Coastguard Worker   const uint32_t izp = (int32_t) params->input_zero_point;
792*4bdc9457SAndroid Build Coastguard Worker   do {
793*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
794*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
795*4bdc9457SAndroid Build Coastguard Worker       int32_t* packed_b = (int32_t*) packed_w;
796*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
797*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
798*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
799*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
800*4bdc9457SAndroid Build Coastguard Worker         }
801*4bdc9457SAndroid Build Coastguard Worker       } else {
802*4bdc9457SAndroid Build Coastguard Worker         size_t n = nr_block_size;
803*4bdc9457SAndroid Build Coastguard Worker         do {
804*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, 0);
805*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
806*4bdc9457SAndroid Build Coastguard Worker         } while (--n != 0);
807*4bdc9457SAndroid Build Coastguard Worker       }
808*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
809*4bdc9457SAndroid Build Coastguard Worker 
810*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
811*4bdc9457SAndroid Build Coastguard Worker         for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
812*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
813*4bdc9457SAndroid Build Coastguard Worker             uint32_t ksum = 0;
814*4bdc9457SAndroid Build Coastguard Worker             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
815*4bdc9457SAndroid Build Coastguard Worker               const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
816*4bdc9457SAndroid Build Coastguard Worker               if (kc_idx < kc) {
817*4bdc9457SAndroid Build Coastguard Worker                 const int8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
818*4bdc9457SAndroid Build Coastguard Worker                 ksum += (uint32_t) kv;
819*4bdc9457SAndroid Build Coastguard Worker                 ((int8_t*) packed_w)[kr_block_offset] = kv;
820*4bdc9457SAndroid Build Coastguard Worker               }
821*4bdc9457SAndroid Build Coastguard Worker             }
822*4bdc9457SAndroid Build Coastguard Worker             unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
823*4bdc9457SAndroid Build Coastguard Worker             packed_w = (int8_t*) packed_w + kr;
824*4bdc9457SAndroid Build Coastguard Worker           }
825*4bdc9457SAndroid Build Coastguard Worker           packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
826*4bdc9457SAndroid Build Coastguard Worker         }
827*4bdc9457SAndroid Build Coastguard Worker       }
828*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
829*4bdc9457SAndroid Build Coastguard Worker     }
830*4bdc9457SAndroid Build Coastguard Worker     k += ks * kc * nc;
831*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
832*4bdc9457SAndroid Build Coastguard Worker       b += nc;
833*4bdc9457SAndroid Build Coastguard Worker     }
834*4bdc9457SAndroid Build Coastguard Worker   } while (--g != 0);
835*4bdc9457SAndroid Build Coastguard Worker }
836*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,size_t extra_bytes,const void * params)837*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_conv_kgo_w(
838*4bdc9457SAndroid Build Coastguard Worker   size_t g,
839*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
840*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
841*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
842*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
843*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
844*4bdc9457SAndroid Build Coastguard Worker   const float* k,
845*4bdc9457SAndroid Build Coastguard Worker   const float* b,
846*4bdc9457SAndroid Build Coastguard Worker   float* packed_w,
847*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
848*4bdc9457SAndroid Build Coastguard Worker   const void* params)
849*4bdc9457SAndroid Build Coastguard Worker {
850*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
851*4bdc9457SAndroid Build Coastguard Worker 
852*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
853*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
854*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
855*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
856*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
857*4bdc9457SAndroid Build Coastguard Worker           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
858*4bdc9457SAndroid Build Coastguard Worker         }
859*4bdc9457SAndroid Build Coastguard Worker       }
860*4bdc9457SAndroid Build Coastguard Worker       packed_w += nr;
861*4bdc9457SAndroid Build Coastguard Worker 
862*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
863*4bdc9457SAndroid Build Coastguard Worker         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
864*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
865*4bdc9457SAndroid Build Coastguard Worker             packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
866*4bdc9457SAndroid Build Coastguard Worker           }
867*4bdc9457SAndroid Build Coastguard Worker           packed_w += nr * kr;
868*4bdc9457SAndroid Build Coastguard Worker         }
869*4bdc9457SAndroid Build Coastguard Worker       }
870*4bdc9457SAndroid Build Coastguard Worker       packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
871*4bdc9457SAndroid Build Coastguard Worker     }
872*4bdc9457SAndroid Build Coastguard Worker     k += nc;
873*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
874*4bdc9457SAndroid Build Coastguard Worker       b += nc;
875*4bdc9457SAndroid Build Coastguard Worker     }
876*4bdc9457SAndroid Build Coastguard Worker   }
877*4bdc9457SAndroid Build Coastguard Worker }
878*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f16_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,size_t extra_bytes,const void * params)879*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f16_conv_kgo_w(
880*4bdc9457SAndroid Build Coastguard Worker   size_t g,
881*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
882*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
883*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
884*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
885*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
886*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* k,
887*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* b,
888*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
889*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
890*4bdc9457SAndroid Build Coastguard Worker   const void* params)
891*4bdc9457SAndroid Build Coastguard Worker {
892*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
893*4bdc9457SAndroid Build Coastguard Worker 
894*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
895*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
896*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
897*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
898*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
899*4bdc9457SAndroid Build Coastguard Worker           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
900*4bdc9457SAndroid Build Coastguard Worker         }
901*4bdc9457SAndroid Build Coastguard Worker       }
902*4bdc9457SAndroid Build Coastguard Worker       packed_w += nr;
903*4bdc9457SAndroid Build Coastguard Worker 
904*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
905*4bdc9457SAndroid Build Coastguard Worker         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
906*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
907*4bdc9457SAndroid Build Coastguard Worker             packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
908*4bdc9457SAndroid Build Coastguard Worker           }
909*4bdc9457SAndroid Build Coastguard Worker           packed_w += nr * kr;
910*4bdc9457SAndroid Build Coastguard Worker         }
911*4bdc9457SAndroid Build Coastguard Worker       }
912*4bdc9457SAndroid Build Coastguard Worker       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
913*4bdc9457SAndroid Build Coastguard Worker     }
914*4bdc9457SAndroid Build Coastguard Worker     k += nc;
915*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
916*4bdc9457SAndroid Build Coastguard Worker       b += nc;
917*4bdc9457SAndroid Build Coastguard Worker     }
918*4bdc9457SAndroid Build Coastguard Worker   }
919*4bdc9457SAndroid Build Coastguard Worker }
920*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_to_f16_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,size_t sr,const float * k,const float * b,uint16_t * packed_w,size_t extra_bytes,const void * params)921*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_to_f16_conv_kgo_w(
922*4bdc9457SAndroid Build Coastguard Worker   size_t g,
923*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
924*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
925*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
926*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
927*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
928*4bdc9457SAndroid Build Coastguard Worker   const float* k,
929*4bdc9457SAndroid Build Coastguard Worker   const float* b,
930*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
931*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
932*4bdc9457SAndroid Build Coastguard Worker   const void* params)
933*4bdc9457SAndroid Build Coastguard Worker {
934*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
935*4bdc9457SAndroid Build Coastguard Worker 
936*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
937*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
938*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
939*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
940*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
941*4bdc9457SAndroid Build Coastguard Worker           packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
942*4bdc9457SAndroid Build Coastguard Worker         }
943*4bdc9457SAndroid Build Coastguard Worker       }
944*4bdc9457SAndroid Build Coastguard Worker       packed_w += nr;
945*4bdc9457SAndroid Build Coastguard Worker 
946*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
947*4bdc9457SAndroid Build Coastguard Worker         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
948*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
949*4bdc9457SAndroid Build Coastguard Worker             packed_w[nr_block_offset * kr] = fp16_ieee_from_fp32_value(k[ki * g * nc + (nr_block_start + nr_block_offset)]);
950*4bdc9457SAndroid Build Coastguard Worker           }
951*4bdc9457SAndroid Build Coastguard Worker           packed_w += nr * kr;
952*4bdc9457SAndroid Build Coastguard Worker         }
953*4bdc9457SAndroid Build Coastguard Worker       }
954*4bdc9457SAndroid Build Coastguard Worker       packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
955*4bdc9457SAndroid Build Coastguard Worker     }
956*4bdc9457SAndroid Build Coastguard Worker     k += nc;
957*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
958*4bdc9457SAndroid Build Coastguard Worker       b += nc;
959*4bdc9457SAndroid Build Coastguard Worker     }
960*4bdc9457SAndroid Build Coastguard Worker   }
961*4bdc9457SAndroid Build Coastguard Worker }
962*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qu8_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qu8_packing_params * params)963*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qu8_conv_kgo_w(
964*4bdc9457SAndroid Build Coastguard Worker   size_t g,
965*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
966*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
967*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
968*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
969*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
970*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* k,
971*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
972*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
973*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
974*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qu8_packing_params* params)
975*4bdc9457SAndroid Build Coastguard Worker {
976*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
977*4bdc9457SAndroid Build Coastguard Worker 
978*4bdc9457SAndroid Build Coastguard Worker   const int32_t izp = (int32_t) params->input_zero_point;
979*4bdc9457SAndroid Build Coastguard Worker   const int32_t bzp = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
980*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
981*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
982*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
983*4bdc9457SAndroid Build Coastguard Worker       int32_t* packed_b = (int32_t*) packed_w;
984*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
985*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
986*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
987*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
988*4bdc9457SAndroid Build Coastguard Worker         }
989*4bdc9457SAndroid Build Coastguard Worker       } else {
990*4bdc9457SAndroid Build Coastguard Worker         size_t n = nr_block_size;
991*4bdc9457SAndroid Build Coastguard Worker         do {
992*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, bzp);
993*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
994*4bdc9457SAndroid Build Coastguard Worker         } while (--n != 0);
995*4bdc9457SAndroid Build Coastguard Worker       }
996*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
997*4bdc9457SAndroid Build Coastguard Worker 
998*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
999*4bdc9457SAndroid Build Coastguard Worker         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1000*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1001*4bdc9457SAndroid Build Coastguard Worker             const uint8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1002*4bdc9457SAndroid Build Coastguard Worker             ((uint8_t*) packed_w)[nr_block_offset * kr] = kv;
1003*4bdc9457SAndroid Build Coastguard Worker             unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - (int32_t) kv * izp);
1004*4bdc9457SAndroid Build Coastguard Worker           }
1005*4bdc9457SAndroid Build Coastguard Worker           packed_w = (uint8_t*) packed_w + nr * kr;
1006*4bdc9457SAndroid Build Coastguard Worker         }
1007*4bdc9457SAndroid Build Coastguard Worker       }
1008*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
1009*4bdc9457SAndroid Build Coastguard Worker     }
1010*4bdc9457SAndroid Build Coastguard Worker     k += nc;
1011*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
1012*4bdc9457SAndroid Build Coastguard Worker       b += nc;
1013*4bdc9457SAndroid Build Coastguard Worker     }
1014*4bdc9457SAndroid Build Coastguard Worker   }
1015*4bdc9457SAndroid Build Coastguard Worker }
1016*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qs8_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qs8_packing_params * params)1017*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qs8_conv_kgo_w(
1018*4bdc9457SAndroid Build Coastguard Worker   size_t g,
1019*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
1020*4bdc9457SAndroid Build Coastguard Worker   size_t ks,
1021*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
1022*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
1023*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
1024*4bdc9457SAndroid Build Coastguard Worker   const int8_t* k,
1025*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
1026*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
1027*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
1028*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qs8_packing_params* params)
1029*4bdc9457SAndroid Build Coastguard Worker {
1030*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
1031*4bdc9457SAndroid Build Coastguard Worker 
1032*4bdc9457SAndroid Build Coastguard Worker   const uint32_t izp = (uint32_t) params->input_zero_point;
1033*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
1034*4bdc9457SAndroid Build Coastguard Worker     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1035*4bdc9457SAndroid Build Coastguard Worker       const size_t nr_block_size = min(nc - nr_block_start, nr);
1036*4bdc9457SAndroid Build Coastguard Worker       int32_t* packed_b = (int32_t*) packed_w;
1037*4bdc9457SAndroid Build Coastguard Worker       if XNN_LIKELY(b != NULL) {
1038*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1039*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
1040*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1041*4bdc9457SAndroid Build Coastguard Worker         }
1042*4bdc9457SAndroid Build Coastguard Worker       } else {
1043*4bdc9457SAndroid Build Coastguard Worker         size_t n = nr_block_size;
1044*4bdc9457SAndroid Build Coastguard Worker         do {
1045*4bdc9457SAndroid Build Coastguard Worker           unaligned_store_s32(packed_w, 0);
1046*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1047*4bdc9457SAndroid Build Coastguard Worker         } while (--n != 0);
1048*4bdc9457SAndroid Build Coastguard Worker       }
1049*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1050*4bdc9457SAndroid Build Coastguard Worker 
1051*4bdc9457SAndroid Build Coastguard Worker       for (size_t ki = 0; ki < ks; ki++) {
1052*4bdc9457SAndroid Build Coastguard Worker         for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1053*4bdc9457SAndroid Build Coastguard Worker           for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1054*4bdc9457SAndroid Build Coastguard Worker             const int8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1055*4bdc9457SAndroid Build Coastguard Worker             ((int8_t*) packed_w)[nr_block_offset * kr] = kv;
1056*4bdc9457SAndroid Build Coastguard Worker             unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - (uint32_t) kv * izp);
1057*4bdc9457SAndroid Build Coastguard Worker           }
1058*4bdc9457SAndroid Build Coastguard Worker           packed_w = (int8_t*) packed_w + nr * kr;
1059*4bdc9457SAndroid Build Coastguard Worker         }
1060*4bdc9457SAndroid Build Coastguard Worker       }
1061*4bdc9457SAndroid Build Coastguard Worker       packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
1062*4bdc9457SAndroid Build Coastguard Worker     }
1063*4bdc9457SAndroid Build Coastguard Worker     k += nc;
1064*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
1065*4bdc9457SAndroid Build Coastguard Worker       b += nc;
1066*4bdc9457SAndroid Build Coastguard Worker     }
1067*4bdc9457SAndroid Build Coastguard Worker   }
1068*4bdc9457SAndroid Build Coastguard Worker }
1069*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,struct subconvolution_params * subconv_params,const void * params)1070*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_deconv_goki_w(
1071*4bdc9457SAndroid Build Coastguard Worker   size_t g,
1072*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
1073*4bdc9457SAndroid Build Coastguard Worker   size_t kh,
1074*4bdc9457SAndroid Build Coastguard Worker   size_t kw,
1075*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
1076*4bdc9457SAndroid Build Coastguard Worker   size_t sh,
1077*4bdc9457SAndroid Build Coastguard Worker   size_t sw,
1078*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
1079*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
1080*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
1081*4bdc9457SAndroid Build Coastguard Worker   const float* k,
1082*4bdc9457SAndroid Build Coastguard Worker   const float* b,
1083*4bdc9457SAndroid Build Coastguard Worker   float* packed_w,
1084*4bdc9457SAndroid Build Coastguard Worker   struct subconvolution_params* subconv_params,
1085*4bdc9457SAndroid Build Coastguard Worker   const void* params)
1086*4bdc9457SAndroid Build Coastguard Worker {
1087*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
1088*4bdc9457SAndroid Build Coastguard Worker 
1089*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
1090*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
1091*4bdc9457SAndroid Build Coastguard Worker     for (size_t oy = 0; oy < sh; oy++) {
1092*4bdc9457SAndroid Build Coastguard Worker       for (size_t ox = 0; ox < sw; ox++) {
1093*4bdc9457SAndroid Build Coastguard Worker         if (i == 0) {
1094*4bdc9457SAndroid Build Coastguard Worker           (*subconv_params++).weights = packed_w;
1095*4bdc9457SAndroid Build Coastguard Worker         }
1096*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1097*4bdc9457SAndroid Build Coastguard Worker           const size_t nr_block_size = min(nc - nr_block_start, nr);
1098*4bdc9457SAndroid Build Coastguard Worker           if XNN_LIKELY(b != NULL) {
1099*4bdc9457SAndroid Build Coastguard Worker             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1100*4bdc9457SAndroid Build Coastguard Worker               packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1101*4bdc9457SAndroid Build Coastguard Worker             }
1102*4bdc9457SAndroid Build Coastguard Worker           }
1103*4bdc9457SAndroid Build Coastguard Worker           packed_w += nr;
1104*4bdc9457SAndroid Build Coastguard Worker           for (size_t ky = oy; ky < kh; ky += sh) {
1105*4bdc9457SAndroid Build Coastguard Worker             for (size_t kx = ox; kx < kw; kx += sw) {
1106*4bdc9457SAndroid Build Coastguard Worker               for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1107*4bdc9457SAndroid Build Coastguard Worker                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1108*4bdc9457SAndroid Build Coastguard Worker                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1109*4bdc9457SAndroid Build Coastguard Worker                     const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1110*4bdc9457SAndroid Build Coastguard Worker                     if (kc_idx < kc) {
1111*4bdc9457SAndroid Build Coastguard Worker                       packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1112*4bdc9457SAndroid Build Coastguard Worker                     }
1113*4bdc9457SAndroid Build Coastguard Worker                   }
1114*4bdc9457SAndroid Build Coastguard Worker                   packed_w += kr;
1115*4bdc9457SAndroid Build Coastguard Worker                 }
1116*4bdc9457SAndroid Build Coastguard Worker                 packed_w += (nr - nr_block_size) * kr;
1117*4bdc9457SAndroid Build Coastguard Worker               }
1118*4bdc9457SAndroid Build Coastguard Worker             }
1119*4bdc9457SAndroid Build Coastguard Worker           }
1120*4bdc9457SAndroid Build Coastguard Worker         }
1121*4bdc9457SAndroid Build Coastguard Worker       }
1122*4bdc9457SAndroid Build Coastguard Worker     }
1123*4bdc9457SAndroid Build Coastguard Worker     k += kh * kw * kc * nc;
1124*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
1125*4bdc9457SAndroid Build Coastguard Worker       b += nc;
1126*4bdc9457SAndroid Build Coastguard Worker     }
1127*4bdc9457SAndroid Build Coastguard Worker   }
1128*4bdc9457SAndroid Build Coastguard Worker }
1129*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f16_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,struct subconvolution_params * subconv_params,const void * params)1130*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f16_deconv_goki_w(
1131*4bdc9457SAndroid Build Coastguard Worker   size_t g,
1132*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
1133*4bdc9457SAndroid Build Coastguard Worker   size_t kh,
1134*4bdc9457SAndroid Build Coastguard Worker   size_t kw,
1135*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
1136*4bdc9457SAndroid Build Coastguard Worker   size_t sh,
1137*4bdc9457SAndroid Build Coastguard Worker   size_t sw,
1138*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
1139*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
1140*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
1141*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* k,
1142*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* b,
1143*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
1144*4bdc9457SAndroid Build Coastguard Worker   struct subconvolution_params* subconv_params,
1145*4bdc9457SAndroid Build Coastguard Worker   const void* params)
1146*4bdc9457SAndroid Build Coastguard Worker {
1147*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
1148*4bdc9457SAndroid Build Coastguard Worker 
1149*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
1150*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
1151*4bdc9457SAndroid Build Coastguard Worker     for (size_t oy = 0; oy < sh; oy++) {
1152*4bdc9457SAndroid Build Coastguard Worker       for (size_t ox = 0; ox < sw; ox++) {
1153*4bdc9457SAndroid Build Coastguard Worker         if (i == 0) {
1154*4bdc9457SAndroid Build Coastguard Worker           (*subconv_params++).weights = packed_w;
1155*4bdc9457SAndroid Build Coastguard Worker         }
1156*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1157*4bdc9457SAndroid Build Coastguard Worker           const size_t nr_block_size = min(nc - nr_block_start, nr);
1158*4bdc9457SAndroid Build Coastguard Worker           if XNN_LIKELY(b != NULL) {
1159*4bdc9457SAndroid Build Coastguard Worker             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1160*4bdc9457SAndroid Build Coastguard Worker               packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1161*4bdc9457SAndroid Build Coastguard Worker             }
1162*4bdc9457SAndroid Build Coastguard Worker           }
1163*4bdc9457SAndroid Build Coastguard Worker           packed_w += nr;
1164*4bdc9457SAndroid Build Coastguard Worker           for (size_t ky = oy; ky < kh; ky += sh) {
1165*4bdc9457SAndroid Build Coastguard Worker             for (size_t kx = ox; kx < kw; kx += sw) {
1166*4bdc9457SAndroid Build Coastguard Worker               for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1167*4bdc9457SAndroid Build Coastguard Worker                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1168*4bdc9457SAndroid Build Coastguard Worker                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1169*4bdc9457SAndroid Build Coastguard Worker                     const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1170*4bdc9457SAndroid Build Coastguard Worker                     if (kc_idx < kc) {
1171*4bdc9457SAndroid Build Coastguard Worker                       packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1172*4bdc9457SAndroid Build Coastguard Worker                     }
1173*4bdc9457SAndroid Build Coastguard Worker                   }
1174*4bdc9457SAndroid Build Coastguard Worker                   packed_w += kr;
1175*4bdc9457SAndroid Build Coastguard Worker                 }
1176*4bdc9457SAndroid Build Coastguard Worker                 packed_w += (nr - nr_block_size) * kr;
1177*4bdc9457SAndroid Build Coastguard Worker               }
1178*4bdc9457SAndroid Build Coastguard Worker             }
1179*4bdc9457SAndroid Build Coastguard Worker           }
1180*4bdc9457SAndroid Build Coastguard Worker         }
1181*4bdc9457SAndroid Build Coastguard Worker       }
1182*4bdc9457SAndroid Build Coastguard Worker     }
1183*4bdc9457SAndroid Build Coastguard Worker     k += kh * kw * kc * nc;
1184*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
1185*4bdc9457SAndroid Build Coastguard Worker       b += nc;
1186*4bdc9457SAndroid Build Coastguard Worker     }
1187*4bdc9457SAndroid Build Coastguard Worker   }
1188*4bdc9457SAndroid Build Coastguard Worker }
1189*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_to_f16_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const float * k,const float * b,uint16_t * packed_w,struct subconvolution_params * subconv_params,const void * params)1190*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_to_f16_deconv_goki_w(
1191*4bdc9457SAndroid Build Coastguard Worker   size_t g,
1192*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
1193*4bdc9457SAndroid Build Coastguard Worker   size_t kh,
1194*4bdc9457SAndroid Build Coastguard Worker   size_t kw,
1195*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
1196*4bdc9457SAndroid Build Coastguard Worker   size_t sh,
1197*4bdc9457SAndroid Build Coastguard Worker   size_t sw,
1198*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
1199*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
1200*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
1201*4bdc9457SAndroid Build Coastguard Worker   const float* k,
1202*4bdc9457SAndroid Build Coastguard Worker   const float* b,
1203*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
1204*4bdc9457SAndroid Build Coastguard Worker   struct subconvolution_params* subconv_params,
1205*4bdc9457SAndroid Build Coastguard Worker   const void* params)
1206*4bdc9457SAndroid Build Coastguard Worker {
1207*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
1208*4bdc9457SAndroid Build Coastguard Worker 
1209*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
1210*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
1211*4bdc9457SAndroid Build Coastguard Worker     for (size_t oy = 0; oy < sh; oy++) {
1212*4bdc9457SAndroid Build Coastguard Worker       for (size_t ox = 0; ox < sw; ox++) {
1213*4bdc9457SAndroid Build Coastguard Worker         if (i == 0) {
1214*4bdc9457SAndroid Build Coastguard Worker           (*subconv_params++).weights = packed_w;
1215*4bdc9457SAndroid Build Coastguard Worker         }
1216*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1217*4bdc9457SAndroid Build Coastguard Worker           const size_t nr_block_size = min(nc - nr_block_start, nr);
1218*4bdc9457SAndroid Build Coastguard Worker           if XNN_LIKELY(b != NULL) {
1219*4bdc9457SAndroid Build Coastguard Worker             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1220*4bdc9457SAndroid Build Coastguard Worker               packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
1221*4bdc9457SAndroid Build Coastguard Worker             }
1222*4bdc9457SAndroid Build Coastguard Worker           }
1223*4bdc9457SAndroid Build Coastguard Worker           packed_w += nr;
1224*4bdc9457SAndroid Build Coastguard Worker           for (size_t ky = oy; ky < kh; ky += sh) {
1225*4bdc9457SAndroid Build Coastguard Worker             for (size_t kx = ox; kx < kw; kx += sw) {
1226*4bdc9457SAndroid Build Coastguard Worker               for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1227*4bdc9457SAndroid Build Coastguard Worker                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1228*4bdc9457SAndroid Build Coastguard Worker                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1229*4bdc9457SAndroid Build Coastguard Worker                     const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1230*4bdc9457SAndroid Build Coastguard Worker                     if (kc_idx < kc) {
1231*4bdc9457SAndroid Build Coastguard Worker                       packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx]);
1232*4bdc9457SAndroid Build Coastguard Worker                     }
1233*4bdc9457SAndroid Build Coastguard Worker                   }
1234*4bdc9457SAndroid Build Coastguard Worker                   packed_w += kr;
1235*4bdc9457SAndroid Build Coastguard Worker                 }
1236*4bdc9457SAndroid Build Coastguard Worker                 packed_w += (nr - nr_block_size) * kr;
1237*4bdc9457SAndroid Build Coastguard Worker               }
1238*4bdc9457SAndroid Build Coastguard Worker             }
1239*4bdc9457SAndroid Build Coastguard Worker           }
1240*4bdc9457SAndroid Build Coastguard Worker         }
1241*4bdc9457SAndroid Build Coastguard Worker       }
1242*4bdc9457SAndroid Build Coastguard Worker     }
1243*4bdc9457SAndroid Build Coastguard Worker     k += kh * kw * kc * nc;
1244*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
1245*4bdc9457SAndroid Build Coastguard Worker       b += nc;
1246*4bdc9457SAndroid Build Coastguard Worker     }
1247*4bdc9457SAndroid Build Coastguard Worker   }
1248*4bdc9457SAndroid Build Coastguard Worker }
1249*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qs8_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,struct subconvolution_params * subconv_params,const struct xnn_qs8_packing_params * params)1250*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qs8_deconv_goki_w(
1251*4bdc9457SAndroid Build Coastguard Worker   size_t g,
1252*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
1253*4bdc9457SAndroid Build Coastguard Worker   size_t kh,
1254*4bdc9457SAndroid Build Coastguard Worker   size_t kw,
1255*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
1256*4bdc9457SAndroid Build Coastguard Worker   size_t sh,
1257*4bdc9457SAndroid Build Coastguard Worker   size_t sw,
1258*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
1259*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
1260*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
1261*4bdc9457SAndroid Build Coastguard Worker   const int8_t* k,
1262*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
1263*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
1264*4bdc9457SAndroid Build Coastguard Worker   struct subconvolution_params* subconv_params,
1265*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qs8_packing_params* params)
1266*4bdc9457SAndroid Build Coastguard Worker {
1267*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
1268*4bdc9457SAndroid Build Coastguard Worker 
1269*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
1270*4bdc9457SAndroid Build Coastguard Worker   const uint32_t izp = (uint32_t) params->input_zero_point;
1271*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
1272*4bdc9457SAndroid Build Coastguard Worker     for (size_t oy = 0; oy < sh; oy++) {
1273*4bdc9457SAndroid Build Coastguard Worker       for (size_t ox = 0; ox < sw; ox++) {
1274*4bdc9457SAndroid Build Coastguard Worker         if (i == 0) {
1275*4bdc9457SAndroid Build Coastguard Worker           (*subconv_params++).weights = packed_w;
1276*4bdc9457SAndroid Build Coastguard Worker         }
1277*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1278*4bdc9457SAndroid Build Coastguard Worker           const size_t nr_block_size = min(nc - nr_block_start, nr);
1279*4bdc9457SAndroid Build Coastguard Worker           int32_t* packed_b = (int32_t*) packed_w;
1280*4bdc9457SAndroid Build Coastguard Worker           if XNN_LIKELY(b != 0) {
1281*4bdc9457SAndroid Build Coastguard Worker             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1282*4bdc9457SAndroid Build Coastguard Worker               unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
1283*4bdc9457SAndroid Build Coastguard Worker               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1284*4bdc9457SAndroid Build Coastguard Worker             }
1285*4bdc9457SAndroid Build Coastguard Worker           } else {
1286*4bdc9457SAndroid Build Coastguard Worker             size_t n = nr_block_size;
1287*4bdc9457SAndroid Build Coastguard Worker             do {
1288*4bdc9457SAndroid Build Coastguard Worker               unaligned_store_s32(packed_w, 0);
1289*4bdc9457SAndroid Build Coastguard Worker               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1290*4bdc9457SAndroid Build Coastguard Worker             } while (--n != 0);
1291*4bdc9457SAndroid Build Coastguard Worker           }
1292*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1293*4bdc9457SAndroid Build Coastguard Worker           for (size_t ky = oy; ky < kh; ky += sh) {
1294*4bdc9457SAndroid Build Coastguard Worker             for (size_t kx = ox; kx < kw; kx += sw) {
1295*4bdc9457SAndroid Build Coastguard Worker               for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1296*4bdc9457SAndroid Build Coastguard Worker                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1297*4bdc9457SAndroid Build Coastguard Worker                   uint32_t ksum = 0;
1298*4bdc9457SAndroid Build Coastguard Worker                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1299*4bdc9457SAndroid Build Coastguard Worker                     const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1300*4bdc9457SAndroid Build Coastguard Worker                     if (kc_idx < kc) {
1301*4bdc9457SAndroid Build Coastguard Worker                       const int8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1302*4bdc9457SAndroid Build Coastguard Worker                       ksum += (uint32_t) kv;
1303*4bdc9457SAndroid Build Coastguard Worker                       ((int8_t*) packed_w)[kr_block_offset] = kv;
1304*4bdc9457SAndroid Build Coastguard Worker                     }
1305*4bdc9457SAndroid Build Coastguard Worker                   }
1306*4bdc9457SAndroid Build Coastguard Worker                   unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
1307*4bdc9457SAndroid Build Coastguard Worker                   packed_w = (int8_t*) packed_w + kr;
1308*4bdc9457SAndroid Build Coastguard Worker                 }
1309*4bdc9457SAndroid Build Coastguard Worker                 packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
1310*4bdc9457SAndroid Build Coastguard Worker               }
1311*4bdc9457SAndroid Build Coastguard Worker             }
1312*4bdc9457SAndroid Build Coastguard Worker           }
1313*4bdc9457SAndroid Build Coastguard Worker         }
1314*4bdc9457SAndroid Build Coastguard Worker       }
1315*4bdc9457SAndroid Build Coastguard Worker     }
1316*4bdc9457SAndroid Build Coastguard Worker     k += kh * kw * kc * nc;
1317*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
1318*4bdc9457SAndroid Build Coastguard Worker       b += nc;
1319*4bdc9457SAndroid Build Coastguard Worker     }
1320*4bdc9457SAndroid Build Coastguard Worker   }
1321*4bdc9457SAndroid Build Coastguard Worker }
1322*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qu8_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,struct subconvolution_params * subconv_params,const struct xnn_qu8_packing_params * params)1323*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qu8_deconv_goki_w(
1324*4bdc9457SAndroid Build Coastguard Worker   size_t g,
1325*4bdc9457SAndroid Build Coastguard Worker   size_t nc,
1326*4bdc9457SAndroid Build Coastguard Worker   size_t kh,
1327*4bdc9457SAndroid Build Coastguard Worker   size_t kw,
1328*4bdc9457SAndroid Build Coastguard Worker   size_t kc,
1329*4bdc9457SAndroid Build Coastguard Worker   size_t sh,
1330*4bdc9457SAndroid Build Coastguard Worker   size_t sw,
1331*4bdc9457SAndroid Build Coastguard Worker   size_t nr,
1332*4bdc9457SAndroid Build Coastguard Worker   size_t kr,
1333*4bdc9457SAndroid Build Coastguard Worker   size_t sr,
1334*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* k,
1335*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
1336*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
1337*4bdc9457SAndroid Build Coastguard Worker   struct subconvolution_params* subconv_params,
1338*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qu8_packing_params* params)
1339*4bdc9457SAndroid Build Coastguard Worker {
1340*4bdc9457SAndroid Build Coastguard Worker   assert(nr >= sr);
1341*4bdc9457SAndroid Build Coastguard Worker 
1342*4bdc9457SAndroid Build Coastguard Worker   const size_t skr = sr * kr;
1343*4bdc9457SAndroid Build Coastguard Worker   const int32_t izp = (int32_t) params->input_zero_point;
1344*4bdc9457SAndroid Build Coastguard Worker   const int32_t kzp = (int32_t) params->kernel_zero_point;
1345*4bdc9457SAndroid Build Coastguard Worker   for (size_t i = 0; i < g; i++) {
1346*4bdc9457SAndroid Build Coastguard Worker     for (size_t oy = 0; oy < sh; oy++) {
1347*4bdc9457SAndroid Build Coastguard Worker       for (size_t ox = 0; ox < sw; ox++) {
1348*4bdc9457SAndroid Build Coastguard Worker         if (i == 0) {
1349*4bdc9457SAndroid Build Coastguard Worker           (*subconv_params++).weights = packed_w;
1350*4bdc9457SAndroid Build Coastguard Worker         }
1351*4bdc9457SAndroid Build Coastguard Worker         const int32_t bzp = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
1352*4bdc9457SAndroid Build Coastguard Worker         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1353*4bdc9457SAndroid Build Coastguard Worker           const size_t nr_block_size = min(nc - nr_block_start, nr);
1354*4bdc9457SAndroid Build Coastguard Worker           int32_t* packed_b = (int32_t*) packed_w;
1355*4bdc9457SAndroid Build Coastguard Worker           if XNN_LIKELY(b != 0) {
1356*4bdc9457SAndroid Build Coastguard Worker             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1357*4bdc9457SAndroid Build Coastguard Worker               unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
1358*4bdc9457SAndroid Build Coastguard Worker               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1359*4bdc9457SAndroid Build Coastguard Worker             }
1360*4bdc9457SAndroid Build Coastguard Worker           } else {
1361*4bdc9457SAndroid Build Coastguard Worker             size_t n = nr_block_size;
1362*4bdc9457SAndroid Build Coastguard Worker             do {
1363*4bdc9457SAndroid Build Coastguard Worker               unaligned_store_s32(packed_w, bzp);
1364*4bdc9457SAndroid Build Coastguard Worker               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1365*4bdc9457SAndroid Build Coastguard Worker             } while (--n != 0);
1366*4bdc9457SAndroid Build Coastguard Worker           }
1367*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1368*4bdc9457SAndroid Build Coastguard Worker           for (size_t ky = oy; ky < kh; ky += sh) {
1369*4bdc9457SAndroid Build Coastguard Worker             for (size_t kx = ox; kx < kw; kx += sw) {
1370*4bdc9457SAndroid Build Coastguard Worker               for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1371*4bdc9457SAndroid Build Coastguard Worker                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1372*4bdc9457SAndroid Build Coastguard Worker                   int32_t ksum = 0;
1373*4bdc9457SAndroid Build Coastguard Worker                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1374*4bdc9457SAndroid Build Coastguard Worker                     const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1375*4bdc9457SAndroid Build Coastguard Worker                     if (kc_idx < kc) {
1376*4bdc9457SAndroid Build Coastguard Worker                       const uint8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1377*4bdc9457SAndroid Build Coastguard Worker                       ksum += (int32_t) kv;
1378*4bdc9457SAndroid Build Coastguard Worker                       ((uint8_t*) packed_w)[kr_block_offset] = kv;
1379*4bdc9457SAndroid Build Coastguard Worker                     }
1380*4bdc9457SAndroid Build Coastguard Worker                   }
1381*4bdc9457SAndroid Build Coastguard Worker                   unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
1382*4bdc9457SAndroid Build Coastguard Worker                   packed_w = (uint8_t*) packed_w + kr;
1383*4bdc9457SAndroid Build Coastguard Worker                 }
1384*4bdc9457SAndroid Build Coastguard Worker                 packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
1385*4bdc9457SAndroid Build Coastguard Worker               }
1386*4bdc9457SAndroid Build Coastguard Worker             }
1387*4bdc9457SAndroid Build Coastguard Worker           }
1388*4bdc9457SAndroid Build Coastguard Worker         }
1389*4bdc9457SAndroid Build Coastguard Worker       }
1390*4bdc9457SAndroid Build Coastguard Worker     }
1391*4bdc9457SAndroid Build Coastguard Worker     k += kh * kw * kc * nc;
1392*4bdc9457SAndroid Build Coastguard Worker     if XNN_UNPREDICTABLE(b != NULL) {
1393*4bdc9457SAndroid Build Coastguard Worker       b += nc;
1394*4bdc9457SAndroid Build Coastguard Worker     }
1395*4bdc9457SAndroid Build Coastguard Worker   }
1396*4bdc9457SAndroid Build Coastguard Worker }
1397*4bdc9457SAndroid Build Coastguard Worker 
// Packs single-precision depthwise-convolution weights stored as
// [channel][h][w] (plus an optional per-channel bias) into channel blocks of
// `cr` lanes.
//
// For each block the output holds, in order:
//   - `cr` bias lanes (valid lanes get `b`, or 0.0f when `b` is NULL),
//   - `h * w` taps of `cr` lanes each, visited with x as the outer and y as
//     the inner index,
//   - a gap of (primary_tile - h * w) * cr_block_size elements,
//   - `extra_bytes` bytes.
// Lanes past the end of a partial block, the gap and the extra bytes are
// skipped, never written. `params` is not read.
void xnn_pack_f32_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(cr != 0);                // a zero block width would never terminate
  assert(primary_tile >= h * w);  // otherwise the gap computation underflows

  const size_t kernel_size = h * w;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t channels_left = c - cr_block_start;
    const size_t cr_block_size = channels_left < cr ? channels_left : cr;

    // Bias lanes.
    for (size_t lane = 0; lane < cr_block_size; lane++) {
      packed_w[lane] = (b != NULL) ? b[cr_block_start + lane] : 0.0f;
    }
    packed_w += cr;

    // Kernel taps: one group of cr lanes per (x, y) position.
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t lane = 0; lane < cr_block_size; lane++) {
          packed_w[lane] = k[((cr_block_start + lane) * h + y) * w + x];
        }
        packed_w += cr;
      }
    }

    // The gap is scaled by cr_block_size rather than cr. The two differ only
    // for a partial block, which is always the last one, so the position of
    // later blocks is unaffected.
    packed_w += (primary_tile - kernel_size) * cr_block_size;
    packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
  }
}
1436*4bdc9457SAndroid Build Coastguard Worker 
// Packs half-precision depthwise-convolution weights stored as
// [channel][h][w] (plus an optional per-channel bias) into channel blocks of
// `cr` lanes. Values are copied as raw 16-bit words.
//
// For each block the output holds, in order:
//   - `cr` bias lanes (valid lanes get `b`, or 0 when `b` is NULL),
//   - `h * w` taps of `cr` lanes each, visited with x as the outer and y as
//     the inner index,
//   - a gap of (primary_tile - h * w) * cr_block_size elements,
//   - `extra_bytes` bytes.
// Lanes past the end of a partial block, the gap and the extra bytes are
// skipped, never written. `params` is not read.
void xnn_pack_f16_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(cr != 0);                // a zero block width would never terminate
  assert(primary_tile >= h * w);  // otherwise the gap computation underflows

  const size_t kernel_size = h * w;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t channels_left = c - cr_block_start;
    const size_t cr_block_size = channels_left < cr ? channels_left : cr;

    // Bias lanes.
    for (size_t lane = 0; lane < cr_block_size; lane++) {
      packed_w[lane] = (b != NULL) ? b[cr_block_start + lane] : (uint16_t) 0;
    }
    packed_w += cr;

    // Kernel taps: one group of cr lanes per (x, y) position.
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t lane = 0; lane < cr_block_size; lane++) {
          packed_w[lane] = k[((cr_block_start + lane) * h + y) * w + x];
        }
        packed_w += cr;
      }
    }

    // The gap is scaled by cr_block_size rather than cr. The two differ only
    // for a partial block, which is always the last one, so the position of
    // later blocks is unaffected.
    packed_w += (primary_tile - kernel_size) * cr_block_size;
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}
1475*4bdc9457SAndroid Build Coastguard Worker 
// Packs single-precision depthwise-convolution weights stored as
// [channel][h][w] (plus an optional per-channel bias) into a half-precision
// buffer organised in channel blocks of `cr` lanes. Every value taken from
// `k` or `b` goes through fp16_ieee_from_fp32_value().
//
// For each block the output holds, in order:
//   - `cr` bias lanes (valid lanes get the converted `b`, or 0 when `b` is
//     NULL),
//   - `h * w` taps of `cr` lanes each, visited with x as the outer and y as
//     the inner index,
//   - a gap of (primary_tile - h * w) * cr_block_size elements,
//   - `extra_bytes` bytes.
// Lanes past the end of a partial block, the gap and the extra bytes are
// skipped, never written. `params` is not read.
void xnn_pack_f32_to_f16_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(cr != 0);                // a zero block width would never terminate
  assert(primary_tile >= h * w);  // otherwise the gap computation underflows

  const size_t kernel_size = h * w;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t channels_left = c - cr_block_start;
    const size_t cr_block_size = channels_left < cr ? channels_left : cr;

    // Bias lanes.
    for (size_t lane = 0; lane < cr_block_size; lane++) {
      if (b != NULL) {
        packed_w[lane] = fp16_ieee_from_fp32_value(b[cr_block_start + lane]);
      } else {
        packed_w[lane] = 0;
      }
    }
    packed_w += cr;

    // Kernel taps: one group of cr lanes per (x, y) position.
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t lane = 0; lane < cr_block_size; lane++) {
          packed_w[lane] = fp16_ieee_from_fp32_value(k[((cr_block_start + lane) * h + y) * w + x]);
        }
        packed_w += cr;
      }
    }

    // The gap is scaled by cr_block_size rather than cr. The two differ only
    // for a partial block, which is always the last one, so the position of
    // later blocks is unaffected.
    packed_w += (primary_tile - kernel_size) * cr_block_size;
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}
1514*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qu8_dwconv_ghw_w(size_t primary_tile,size_t h,size_t w,size_t c,size_t cr,const uint8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qu8_packing_params * params)1515*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qu8_dwconv_ghw_w(
1516*4bdc9457SAndroid Build Coastguard Worker   size_t primary_tile,
1517*4bdc9457SAndroid Build Coastguard Worker   size_t h,
1518*4bdc9457SAndroid Build Coastguard Worker   size_t w,
1519*4bdc9457SAndroid Build Coastguard Worker   size_t c,
1520*4bdc9457SAndroid Build Coastguard Worker   size_t cr,
1521*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* k,
1522*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
1523*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
1524*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
1525*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qu8_packing_params* params)
1526*4bdc9457SAndroid Build Coastguard Worker {
1527*4bdc9457SAndroid Build Coastguard Worker   const int32_t izp = (int32_t) params->input_zero_point;
1528*4bdc9457SAndroid Build Coastguard Worker   const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
1529*4bdc9457SAndroid Build Coastguard Worker   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1530*4bdc9457SAndroid Build Coastguard Worker     const size_t cr_block_size = min(c - cr_block_start, cr);
1531*4bdc9457SAndroid Build Coastguard Worker     int32_t* packed_b = (int32_t*) packed_w;
1532*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
1533*4bdc9457SAndroid Build Coastguard Worker       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1534*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, boff + b[cr_block_start + cr_block_offset]);
1535*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1536*4bdc9457SAndroid Build Coastguard Worker       }
1537*4bdc9457SAndroid Build Coastguard Worker     } else {
1538*4bdc9457SAndroid Build Coastguard Worker       size_t n = cr_block_size;
1539*4bdc9457SAndroid Build Coastguard Worker       do {
1540*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, boff);
1541*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1542*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
1543*4bdc9457SAndroid Build Coastguard Worker     }
1544*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1545*4bdc9457SAndroid Build Coastguard Worker     for (size_t x = 0; x < w; x++) {
1546*4bdc9457SAndroid Build Coastguard Worker       for (size_t y = 0; y < h; y++) {
1547*4bdc9457SAndroid Build Coastguard Worker         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1548*4bdc9457SAndroid Build Coastguard Worker           const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1549*4bdc9457SAndroid Build Coastguard Worker           unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
1550*4bdc9457SAndroid Build Coastguard Worker           *((uint8_t*) packed_w) = kv;
1551*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1552*4bdc9457SAndroid Build Coastguard Worker         }
1553*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1554*4bdc9457SAndroid Build Coastguard Worker       }
1555*4bdc9457SAndroid Build Coastguard Worker     }
1556*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
1557*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
1558*4bdc9457SAndroid Build Coastguard Worker   }
1559*4bdc9457SAndroid Build Coastguard Worker }
1560*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qs8_dwconv_ghw_w(size_t primary_tile,size_t h,size_t w,size_t c,size_t cr,const int8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qs8_packing_params * params)1561*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qs8_dwconv_ghw_w(
1562*4bdc9457SAndroid Build Coastguard Worker   size_t primary_tile,
1563*4bdc9457SAndroid Build Coastguard Worker   size_t h,
1564*4bdc9457SAndroid Build Coastguard Worker   size_t w,
1565*4bdc9457SAndroid Build Coastguard Worker   size_t c,
1566*4bdc9457SAndroid Build Coastguard Worker   size_t cr,
1567*4bdc9457SAndroid Build Coastguard Worker   const int8_t* k,
1568*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
1569*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
1570*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
1571*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qs8_packing_params* params)
1572*4bdc9457SAndroid Build Coastguard Worker {
1573*4bdc9457SAndroid Build Coastguard Worker   const uint32_t izp = (uint32_t) params->input_zero_point;
1574*4bdc9457SAndroid Build Coastguard Worker   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1575*4bdc9457SAndroid Build Coastguard Worker     const size_t cr_block_size = min(c - cr_block_start, cr);
1576*4bdc9457SAndroid Build Coastguard Worker     int32_t* packed_b = (int32_t*) packed_w;
1577*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
1578*4bdc9457SAndroid Build Coastguard Worker       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1579*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, b[cr_block_start + cr_block_offset]);
1580*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1581*4bdc9457SAndroid Build Coastguard Worker       }
1582*4bdc9457SAndroid Build Coastguard Worker     } else {
1583*4bdc9457SAndroid Build Coastguard Worker       size_t n = cr_block_size;
1584*4bdc9457SAndroid Build Coastguard Worker       do {
1585*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, 0);
1586*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1587*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
1588*4bdc9457SAndroid Build Coastguard Worker     }
1589*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1590*4bdc9457SAndroid Build Coastguard Worker     for (size_t x = 0; x < w; x++) {
1591*4bdc9457SAndroid Build Coastguard Worker       for (size_t y = 0; y < h; y++) {
1592*4bdc9457SAndroid Build Coastguard Worker         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1593*4bdc9457SAndroid Build Coastguard Worker           const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1594*4bdc9457SAndroid Build Coastguard Worker           unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
1595*4bdc9457SAndroid Build Coastguard Worker           *((int8_t*) packed_w) = kv;
1596*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1597*4bdc9457SAndroid Build Coastguard Worker         }
1598*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1599*4bdc9457SAndroid Build Coastguard Worker       }
1600*4bdc9457SAndroid Build Coastguard Worker     }
1601*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
1602*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
1603*4bdc9457SAndroid Build Coastguard Worker   }
1604*4bdc9457SAndroid Build Coastguard Worker }
1605*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_dwconv_hwg_w(size_t primary_tile,size_t h,size_t w,size_t c,size_t cr,const float * k,const float * b,float * packed_w,size_t extra_bytes,const void * params)1606*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_dwconv_hwg_w(
1607*4bdc9457SAndroid Build Coastguard Worker   size_t primary_tile,
1608*4bdc9457SAndroid Build Coastguard Worker   size_t h,
1609*4bdc9457SAndroid Build Coastguard Worker   size_t w,
1610*4bdc9457SAndroid Build Coastguard Worker   size_t c,
1611*4bdc9457SAndroid Build Coastguard Worker   size_t cr,
1612*4bdc9457SAndroid Build Coastguard Worker   const float* k,
1613*4bdc9457SAndroid Build Coastguard Worker   const float* b,
1614*4bdc9457SAndroid Build Coastguard Worker   float* packed_w,
1615*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
1616*4bdc9457SAndroid Build Coastguard Worker   const void* params)
1617*4bdc9457SAndroid Build Coastguard Worker {
1618*4bdc9457SAndroid Build Coastguard Worker   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1619*4bdc9457SAndroid Build Coastguard Worker     const size_t cr_block_size = min(c - cr_block_start, cr);
1620*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
1621*4bdc9457SAndroid Build Coastguard Worker       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1622*4bdc9457SAndroid Build Coastguard Worker         *packed_w++ = b[cr_block_start + cr_block_offset];
1623*4bdc9457SAndroid Build Coastguard Worker       }
1624*4bdc9457SAndroid Build Coastguard Worker     } else {
1625*4bdc9457SAndroid Build Coastguard Worker       size_t n = cr_block_size;
1626*4bdc9457SAndroid Build Coastguard Worker       do {
1627*4bdc9457SAndroid Build Coastguard Worker         *packed_w++ = 0.0f;
1628*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
1629*4bdc9457SAndroid Build Coastguard Worker     }
1630*4bdc9457SAndroid Build Coastguard Worker     packed_w += cr - cr_block_size;
1631*4bdc9457SAndroid Build Coastguard Worker     for (size_t x = 0; x < w; x++) {
1632*4bdc9457SAndroid Build Coastguard Worker       for (size_t y = 0; y < h; y++) {
1633*4bdc9457SAndroid Build Coastguard Worker         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1634*4bdc9457SAndroid Build Coastguard Worker           const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1635*4bdc9457SAndroid Build Coastguard Worker           *packed_w++ = kv;
1636*4bdc9457SAndroid Build Coastguard Worker         }
1637*4bdc9457SAndroid Build Coastguard Worker         packed_w += cr - cr_block_size;
1638*4bdc9457SAndroid Build Coastguard Worker       }
1639*4bdc9457SAndroid Build Coastguard Worker     }
1640*4bdc9457SAndroid Build Coastguard Worker     packed_w += (primary_tile - (h * w)) * cr_block_size;
1641*4bdc9457SAndroid Build Coastguard Worker     packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
1642*4bdc9457SAndroid Build Coastguard Worker   }
1643*4bdc9457SAndroid Build Coastguard Worker }
1644*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f16_dwconv_hwg_w(size_t primary_tile,size_t h,size_t w,size_t c,size_t cr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,size_t extra_bytes,const void * params)1645*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f16_dwconv_hwg_w(
1646*4bdc9457SAndroid Build Coastguard Worker   size_t primary_tile,
1647*4bdc9457SAndroid Build Coastguard Worker   size_t h,
1648*4bdc9457SAndroid Build Coastguard Worker   size_t w,
1649*4bdc9457SAndroid Build Coastguard Worker   size_t c,
1650*4bdc9457SAndroid Build Coastguard Worker   size_t cr,
1651*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* k,
1652*4bdc9457SAndroid Build Coastguard Worker   const uint16_t* b,
1653*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
1654*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
1655*4bdc9457SAndroid Build Coastguard Worker   const void* params)
1656*4bdc9457SAndroid Build Coastguard Worker {
1657*4bdc9457SAndroid Build Coastguard Worker   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1658*4bdc9457SAndroid Build Coastguard Worker     const size_t cr_block_size = min(c - cr_block_start, cr);
1659*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
1660*4bdc9457SAndroid Build Coastguard Worker       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1661*4bdc9457SAndroid Build Coastguard Worker         *packed_w++ = b[cr_block_start + cr_block_offset];
1662*4bdc9457SAndroid Build Coastguard Worker       }
1663*4bdc9457SAndroid Build Coastguard Worker     } else {
1664*4bdc9457SAndroid Build Coastguard Worker       size_t n = cr_block_size;
1665*4bdc9457SAndroid Build Coastguard Worker       do {
1666*4bdc9457SAndroid Build Coastguard Worker         *packed_w++ = 0;
1667*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
1668*4bdc9457SAndroid Build Coastguard Worker     }
1669*4bdc9457SAndroid Build Coastguard Worker     packed_w += cr - cr_block_size;
1670*4bdc9457SAndroid Build Coastguard Worker     for (size_t x = 0; x < w; x++) {
1671*4bdc9457SAndroid Build Coastguard Worker       for (size_t y = 0; y < h; y++) {
1672*4bdc9457SAndroid Build Coastguard Worker         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1673*4bdc9457SAndroid Build Coastguard Worker           const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1674*4bdc9457SAndroid Build Coastguard Worker           *packed_w++ = kv;
1675*4bdc9457SAndroid Build Coastguard Worker         }
1676*4bdc9457SAndroid Build Coastguard Worker         packed_w += cr - cr_block_size;
1677*4bdc9457SAndroid Build Coastguard Worker       }
1678*4bdc9457SAndroid Build Coastguard Worker     }
1679*4bdc9457SAndroid Build Coastguard Worker     packed_w += (primary_tile - (h * w)) * cr_block_size;
1680*4bdc9457SAndroid Build Coastguard Worker     packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
1681*4bdc9457SAndroid Build Coastguard Worker   }
1682*4bdc9457SAndroid Build Coastguard Worker }
1683*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_f32_to_f16_dwconv_hwg_w(size_t primary_tile,size_t h,size_t w,size_t c,size_t cr,const float * k,const float * b,uint16_t * packed_w,size_t extra_bytes,const void * params)1684*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_f32_to_f16_dwconv_hwg_w(
1685*4bdc9457SAndroid Build Coastguard Worker   size_t primary_tile,
1686*4bdc9457SAndroid Build Coastguard Worker   size_t h,
1687*4bdc9457SAndroid Build Coastguard Worker   size_t w,
1688*4bdc9457SAndroid Build Coastguard Worker   size_t c,
1689*4bdc9457SAndroid Build Coastguard Worker   size_t cr,
1690*4bdc9457SAndroid Build Coastguard Worker   const float* k,
1691*4bdc9457SAndroid Build Coastguard Worker   const float* b,
1692*4bdc9457SAndroid Build Coastguard Worker   uint16_t* packed_w,
1693*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
1694*4bdc9457SAndroid Build Coastguard Worker   const void* params)
1695*4bdc9457SAndroid Build Coastguard Worker {
1696*4bdc9457SAndroid Build Coastguard Worker   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1697*4bdc9457SAndroid Build Coastguard Worker     const size_t cr_block_size = min(c - cr_block_start, cr);
1698*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
1699*4bdc9457SAndroid Build Coastguard Worker       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1700*4bdc9457SAndroid Build Coastguard Worker         *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
1701*4bdc9457SAndroid Build Coastguard Worker       }
1702*4bdc9457SAndroid Build Coastguard Worker     } else {
1703*4bdc9457SAndroid Build Coastguard Worker       size_t n = cr_block_size;
1704*4bdc9457SAndroid Build Coastguard Worker       do {
1705*4bdc9457SAndroid Build Coastguard Worker         *packed_w++ = 0;
1706*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
1707*4bdc9457SAndroid Build Coastguard Worker     }
1708*4bdc9457SAndroid Build Coastguard Worker     packed_w += cr - cr_block_size;
1709*4bdc9457SAndroid Build Coastguard Worker     for (size_t x = 0; x < w; x++) {
1710*4bdc9457SAndroid Build Coastguard Worker       for (size_t y = 0; y < h; y++) {
1711*4bdc9457SAndroid Build Coastguard Worker         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1712*4bdc9457SAndroid Build Coastguard Worker           const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
1713*4bdc9457SAndroid Build Coastguard Worker           *packed_w++ = kv;
1714*4bdc9457SAndroid Build Coastguard Worker         }
1715*4bdc9457SAndroid Build Coastguard Worker         packed_w += cr - cr_block_size;
1716*4bdc9457SAndroid Build Coastguard Worker       }
1717*4bdc9457SAndroid Build Coastguard Worker     }
1718*4bdc9457SAndroid Build Coastguard Worker     packed_w += (primary_tile - (h * w)) * cr_block_size;
1719*4bdc9457SAndroid Build Coastguard Worker     packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
1720*4bdc9457SAndroid Build Coastguard Worker   }
1721*4bdc9457SAndroid Build Coastguard Worker }
1722*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qu8_dwconv_hwg_w(size_t primary_tile,size_t h,size_t w,size_t c,size_t cr,const uint8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qu8_packing_params * params)1723*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qu8_dwconv_hwg_w(
1724*4bdc9457SAndroid Build Coastguard Worker   size_t primary_tile,
1725*4bdc9457SAndroid Build Coastguard Worker   size_t h,
1726*4bdc9457SAndroid Build Coastguard Worker   size_t w,
1727*4bdc9457SAndroid Build Coastguard Worker   size_t c,
1728*4bdc9457SAndroid Build Coastguard Worker   size_t cr,
1729*4bdc9457SAndroid Build Coastguard Worker   const uint8_t* k,
1730*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
1731*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
1732*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
1733*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qu8_packing_params* params)
1734*4bdc9457SAndroid Build Coastguard Worker {
1735*4bdc9457SAndroid Build Coastguard Worker   const int32_t izp = (int32_t) params->input_zero_point;
1736*4bdc9457SAndroid Build Coastguard Worker   const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
1737*4bdc9457SAndroid Build Coastguard Worker   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1738*4bdc9457SAndroid Build Coastguard Worker     const size_t cr_block_size = min(c - cr_block_start, cr);
1739*4bdc9457SAndroid Build Coastguard Worker     int32_t* packed_b = (int32_t*) packed_w;
1740*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
1741*4bdc9457SAndroid Build Coastguard Worker       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1742*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, boff + b[cr_block_start + cr_block_offset]);
1743*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1744*4bdc9457SAndroid Build Coastguard Worker       }
1745*4bdc9457SAndroid Build Coastguard Worker     } else {
1746*4bdc9457SAndroid Build Coastguard Worker       size_t n = cr_block_size;
1747*4bdc9457SAndroid Build Coastguard Worker       do {
1748*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, boff);
1749*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1750*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
1751*4bdc9457SAndroid Build Coastguard Worker     }
1752*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1753*4bdc9457SAndroid Build Coastguard Worker     for (size_t x = 0; x < w; x++) {
1754*4bdc9457SAndroid Build Coastguard Worker       for (size_t y = 0; y < h; y++) {
1755*4bdc9457SAndroid Build Coastguard Worker         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1756*4bdc9457SAndroid Build Coastguard Worker           const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1757*4bdc9457SAndroid Build Coastguard Worker           unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
1758*4bdc9457SAndroid Build Coastguard Worker           *((uint8_t*) packed_w) = kv;
1759*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1760*4bdc9457SAndroid Build Coastguard Worker         }
1761*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1762*4bdc9457SAndroid Build Coastguard Worker       }
1763*4bdc9457SAndroid Build Coastguard Worker     }
1764*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
1765*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
1766*4bdc9457SAndroid Build Coastguard Worker   }
1767*4bdc9457SAndroid Build Coastguard Worker }
1768*4bdc9457SAndroid Build Coastguard Worker 
xnn_pack_qs8_dwconv_hwg_w(size_t primary_tile,size_t h,size_t w,size_t c,size_t cr,const int8_t * k,const int32_t * b,void * packed_w,size_t extra_bytes,const struct xnn_qs8_packing_params * params)1769*4bdc9457SAndroid Build Coastguard Worker void xnn_pack_qs8_dwconv_hwg_w(
1770*4bdc9457SAndroid Build Coastguard Worker   size_t primary_tile,
1771*4bdc9457SAndroid Build Coastguard Worker   size_t h,
1772*4bdc9457SAndroid Build Coastguard Worker   size_t w,
1773*4bdc9457SAndroid Build Coastguard Worker   size_t c,
1774*4bdc9457SAndroid Build Coastguard Worker   size_t cr,
1775*4bdc9457SAndroid Build Coastguard Worker   const int8_t* k,
1776*4bdc9457SAndroid Build Coastguard Worker   const int32_t* b,
1777*4bdc9457SAndroid Build Coastguard Worker   void* packed_w,
1778*4bdc9457SAndroid Build Coastguard Worker   size_t extra_bytes,
1779*4bdc9457SAndroid Build Coastguard Worker   const struct xnn_qs8_packing_params* params)
1780*4bdc9457SAndroid Build Coastguard Worker {
1781*4bdc9457SAndroid Build Coastguard Worker   const uint32_t izp = (int32_t) params->input_zero_point;
1782*4bdc9457SAndroid Build Coastguard Worker   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1783*4bdc9457SAndroid Build Coastguard Worker     const size_t cr_block_size = min(c - cr_block_start, cr);
1784*4bdc9457SAndroid Build Coastguard Worker     int32_t* packed_b = (int32_t*) packed_w;
1785*4bdc9457SAndroid Build Coastguard Worker     if XNN_LIKELY(b != NULL) {
1786*4bdc9457SAndroid Build Coastguard Worker       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1787*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, b[cr_block_start + cr_block_offset]);
1788*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1789*4bdc9457SAndroid Build Coastguard Worker       }
1790*4bdc9457SAndroid Build Coastguard Worker     } else {
1791*4bdc9457SAndroid Build Coastguard Worker       size_t n = cr_block_size;
1792*4bdc9457SAndroid Build Coastguard Worker       do {
1793*4bdc9457SAndroid Build Coastguard Worker         unaligned_store_s32(packed_w, 0);
1794*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1795*4bdc9457SAndroid Build Coastguard Worker       } while (--n != 0);
1796*4bdc9457SAndroid Build Coastguard Worker     }
1797*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1798*4bdc9457SAndroid Build Coastguard Worker     for (size_t x = 0; x < w; x++) {
1799*4bdc9457SAndroid Build Coastguard Worker       for (size_t y = 0; y < h; y++) {
1800*4bdc9457SAndroid Build Coastguard Worker         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1801*4bdc9457SAndroid Build Coastguard Worker           const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1802*4bdc9457SAndroid Build Coastguard Worker           unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
1803*4bdc9457SAndroid Build Coastguard Worker           *((int8_t*) packed_w) = kv;
1804*4bdc9457SAndroid Build Coastguard Worker           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1805*4bdc9457SAndroid Build Coastguard Worker         }
1806*4bdc9457SAndroid Build Coastguard Worker         packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1807*4bdc9457SAndroid Build Coastguard Worker       }
1808*4bdc9457SAndroid Build Coastguard Worker     }
1809*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
1810*4bdc9457SAndroid Build Coastguard Worker     packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
1811*4bdc9457SAndroid Build Coastguard Worker   }
1812*4bdc9457SAndroid Build Coastguard Worker }
1813*4bdc9457SAndroid Build Coastguard Worker 
// Packs f32 GEMM weights given in GOI order (within each group, read as
// k[n * kc + kc_idx]) into packed_w. No bias section is emitted.
//
// Output channels are handled in blocks of `nr`. For each block the reduction
// dimension is walked in steps of `kr` up to round_up_po2(kc, sr * kr), and in
// every step each channel of the block owns `kr` consecutive output slots.
// The source index of a slot is the step rounded down to a multiple of
// sr * kr, plus (step + slot + channel_offset * kr) masked with (sr * kr - 1).
// Slots whose source index is >= kc, and the slots of channels missing from a
// partial last block, are skipped and left unwritten.
//
// The group loop is a do-while that runs its body before testing --g, so g is
// expected to be non-zero. `params` is not used.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  do {
    for (size_t n_base = 0; n_base < nc; n_base += nr) {
      const size_t n_count = min(nc - n_base, nr);
      const size_t skipped_slots = (nr - n_count) * kr;
      const size_t kc_padded = round_up_po2(kc, shuffle_span);

      for (size_t k_base = 0; k_base < kc_padded; k_base += kr) {
        for (size_t n_off = 0; n_off < n_count; n_off++) {
          const float* src_row = &k[(n_base + n_off) * kc];
          const size_t shuffle_base = round_down_po2(k_base, shuffle_span);
          const size_t lane_base = k_base + n_off * kr;
          for (size_t slot = 0; slot < kr; slot++) {
            const size_t kc_idx = shuffle_base + ((lane_base + slot) & (shuffle_span - 1));
            if (kc_idx < kc) {
              packed_w[slot] = src_row[kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += skipped_slots;
      }
    }
    // Move on to the next group's nc x kc weight matrix.
    k += nc * kc;
  } while (--g != 0);
}
1846*4bdc9457SAndroid Build Coastguard Worker 
// Packs GEMM weights held as 16-bit values in GOI order (within each group,
// read as k[n * kc + kc_idx]) into packed_w. Values are copied as raw uint16_t
// and no bias section is emitted.
//
// Output channels are handled in blocks of `nr`. For each block the reduction
// dimension is walked in steps of `kr` up to round_up_po2(kc, sr * kr), and in
// every step each channel of the block owns `kr` consecutive output slots.
// The source index of a slot is the step rounded down to a multiple of
// sr * kr, plus (step + slot + channel_offset * kr) masked with (sr * kr - 1).
// Slots whose source index is >= kc, and the slots of channels missing from a
// partial last block, are skipped and left unwritten.
//
// The group loop is a do-while that runs its body before testing --g, so g is
// expected to be non-zero. `params` is not used.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  do {
    for (size_t n_base = 0; n_base < nc; n_base += nr) {
      const size_t n_count = min(nc - n_base, nr);
      const size_t skipped_slots = (nr - n_count) * kr;
      const size_t kc_padded = round_up_po2(kc, shuffle_span);

      for (size_t k_base = 0; k_base < kc_padded; k_base += kr) {
        for (size_t n_off = 0; n_off < n_count; n_off++) {
          const uint16_t* src_row = &k[(n_base + n_off) * kc];
          const size_t shuffle_base = round_down_po2(k_base, shuffle_span);
          const size_t lane_base = k_base + n_off * kr;
          for (size_t slot = 0; slot < kr; slot++) {
            const size_t kc_idx = shuffle_base + ((lane_base + slot) & (shuffle_span - 1));
            if (kc_idx < kc) {
              packed_w[slot] = src_row[kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += skipped_slots;
      }
    }
    // Move on to the next group's nc x kc weight matrix.
    k += nc * kc;
  } while (--g != 0);
}
1879*4bdc9457SAndroid Build Coastguard Worker 
// Packs bias and convolution weights into tiles of nr output channels.
//
// k is read as a dense [nc][kh][kw][kc] array (output channel outermost,
// input channel innermost). b, when non-NULL, is read as one bias value per
// output channel. For every tile of nr output channels the routine writes:
//   1. nr bias values (all zeros when b is NULL), then
//   2. for each kx, then each input channel c, then each ky (in that nesting
//      order), nr weights -- one per output channel lane of the tile.
// When nc is not a multiple of nr, the surplus lanes of the last tile repeat
// the bias and weights of the last valid output channel.
//
// In total ceil(nc / nr) * nr * (1 + kh * kw * kc) elements are written to
// packed_w. nr must be non-zero, since the tile loop advances by nr.
// params is not used.
void xnn_pack_f32_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  float* out = packed_w;
  for (size_t oc_base = 0; oc_base < nc; oc_base += nr) {
    // Highest lane of this tile that maps to a real output channel; lanes
    // above it are clamped down to it.
    const size_t oc_left = nc - oc_base;
    const size_t last_lane = (oc_left < nr ? oc_left : nr) - 1;

    // Bias row of the tile.
    for (size_t lane = 0; lane < nr; lane++) {
      const size_t src_lane = lane < last_lane ? lane : last_lane;
      out[lane] = (b != NULL) ? b[oc_base + src_lane] : 0.0f;
    }
    out += nr;

    // Weight rows of the tile, ordered kx -> c -> ky.
    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t lane = 0; lane < nr; lane++) {
            const size_t oc = oc_base + (lane < last_lane ? lane : last_lane);
            out[lane] = k[((oc * kh + ky) * kw + kx) * kc + c];
          }
          out += nr;
        }
      }
    }
  }
}
1918*4bdc9457SAndroid Build Coastguard Worker 
// 16-bit counterpart of the f32 dconv OKI packer: values are copied as raw
// uint16_t words without conversion.
//
// k is read as a dense [nc][kh][kw][kc] array (output channel outermost,
// input channel innermost). b, when non-NULL, is read as one bias word per
// output channel. For every tile of nr output channels the routine writes:
//   1. nr bias words (all zeros when b is NULL), then
//   2. for each kx, then each input channel c, then each ky (in that nesting
//      order), nr weight words -- one per output channel lane of the tile.
// When nc is not a multiple of nr, the surplus lanes of the last tile repeat
// the bias and weights of the last valid output channel.
//
// nr must be non-zero, since the tile loop advances by nr. params is not used.
void xnn_pack_f16_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  uint16_t* out = packed_w;
  for (size_t oc_base = 0; oc_base < nc; oc_base += nr) {
    // Highest lane of this tile that maps to a real output channel; lanes
    // above it are clamped down to it.
    const size_t oc_left = nc - oc_base;
    const size_t last_lane = (oc_left < nr ? oc_left : nr) - 1;

    // Bias row of the tile.
    for (size_t lane = 0; lane < nr; lane++) {
      const size_t src_lane = lane < last_lane ? lane : last_lane;
      out[lane] = (b != NULL) ? b[oc_base + src_lane] : (uint16_t) 0;
    }
    out += nr;

    // Weight rows of the tile, ordered kx -> c -> ky.
    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t lane = 0; lane < nr; lane++) {
            const size_t oc = oc_base + (lane < last_lane ? lane : last_lane);
            out[lane] = k[((oc * kh + ky) * kw + kx) * kc + c];
          }
          out += nr;
        }
      }
    }
  }
}
1957*4bdc9457SAndroid Build Coastguard Worker 
// Packs per-group weights stored group-major (kernel[g * kernel_size + i]).
//
// For each of the `groups` groups, writes one bias value (bias[g], or 0.0f
// when bias is NULL) followed by that group's kernel_size weights, so the
// output holds groups * (1 + kernel_size) floats. params is not used.
void xnn_pack_f32_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  const float* group_taps = kernel;
  float* out = packed_weights;
  for (size_t group = 0; group < groups; group++) {
    // Leading slot of each record is the bias.
    *out++ = (bias != NULL) ? bias[group] : 0.0f;
    // The group's weights are contiguous in the source; copy them in order.
    for (size_t tap = 0; tap < kernel_size; tap++) {
      *out++ = group_taps[tap];
    }
    group_taps += kernel_size;
  }
}
1978*4bdc9457SAndroid Build Coastguard Worker 
// 16-bit variant of the group-major packer: words are copied verbatim from
// kernel[g * kernel_size + i] with no conversion.
//
// For each of the `groups` groups, writes one bias word (bias[g], or 0 when
// bias is NULL) followed by that group's kernel_size weight words, so the
// output holds groups * (1 + kernel_size) words. params is not used.
void xnn_pack_f16_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const uint16_t* kernel,
  const uint16_t* bias,
  uint16_t* packed_weights,
  const void* params)
{
  const uint16_t* group_taps = kernel;
  uint16_t* out = packed_weights;
  for (size_t group = 0; group < groups; group++) {
    // Leading slot of each record is the bias.
    *out++ = (bias != NULL) ? bias[group] : (uint16_t) 0;
    // The group's weights are contiguous in the source; copy them in order.
    for (size_t tap = 0; tap < kernel_size; tap++) {
      *out++ = group_taps[tap];
    }
    group_taps += kernel_size;
  }
}
1999*4bdc9457SAndroid Build Coastguard Worker 
// Packs per-group weights stored with the group index innermost
// (kernel[i * groups + g]).
//
// For each of the `groups` groups, writes one bias value (bias[g], or 0.0f
// when bias is NULL) followed by that group's kernel_size weights gathered
// with a stride of `groups`, so the output holds groups * (1 + kernel_size)
// floats laid out group by group. params is not used.
void xnn_pack_f32_chw_dwconv_hwg_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  float* out = packed_weights;
  for (size_t group = 0; group < groups; group++) {
    // Leading slot of each record is the bias.
    *out++ = (bias != NULL) ? bias[group] : 0.0f;
    // Walk this group's column of the [kernel_size][groups] source array.
    const float* column = kernel + group;
    for (size_t tap = 0; tap < kernel_size; tap++) {
      *out++ = column[tap * groups];
    }
  }
}
2020*4bdc9457SAndroid Build Coastguard Worker 
// Interleaves per-channel values from s and b in tiles of cr channels.
//
// Each tile occupies 2 * cr floats in packed_w: cr slots filled from s,
// followed by cr slots filled from b (or with 0.0f when b is NULL). If c is
// not a multiple of cr, only the leading (c % cr) slots of each half of the
// last tile are written; the remaining slots are skipped and keep whatever
// the buffer already contained.
//
// cr must be non-zero, since the tile loop advances by cr. params is not used.
void xnn_pack_f32_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t base = 0; base < c; base += cr) {
    const size_t left = c - base;
    const size_t live = left < cr ? left : cr;  // channels actually present

    float* scale_slots = packed_w;
    float* bias_slots = packed_w + cr;

    for (size_t i = 0; i < live; i++) {
      scale_slots[i] = s[base + i];
    }
    if (b != NULL) {
      for (size_t i = 0; i < live; i++) {
        bias_slots[i] = b[base + i];
      }
    } else {
      for (size_t i = 0; i < live; i++) {
        bias_slots[i] = 0.0f;
      }
    }

    // Step over the whole tile, including any unwritten padding slots.
    packed_w = bias_slots + cr;
  }
}
2048*4bdc9457SAndroid Build Coastguard Worker 
// 16-bit variant of the vmulcaddc packer: words from s and b are copied
// verbatim, with no conversion.
//
// Each tile of cr channels occupies 2 * cr words in packed_w: cr slots filled
// from s, followed by cr slots filled from b (or with 0 when b is NULL). If c
// is not a multiple of cr, only the leading (c % cr) slots of each half of
// the last tile are written; the remaining slots are skipped and keep
// whatever the buffer already contained.
//
// cr must be non-zero, since the tile loop advances by cr. params is not used.
void xnn_pack_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const uint16_t* s,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t base = 0; base < c; base += cr) {
    const size_t left = c - base;
    const size_t live = left < cr ? left : cr;  // channels actually present

    uint16_t* scale_slots = packed_w;
    uint16_t* bias_slots = packed_w + cr;

    for (size_t i = 0; i < live; i++) {
      scale_slots[i] = s[base + i];
    }
    if (b != NULL) {
      for (size_t i = 0; i < live; i++) {
        bias_slots[i] = b[base + i];
      }
    } else {
      for (size_t i = 0; i < live; i++) {
        bias_slots[i] = 0;
      }
    }

    // Step over the whole tile, including any unwritten padding slots.
    packed_w = bias_slots + cr;
  }
}
2076*4bdc9457SAndroid Build Coastguard Worker 
// Converting variant of the vmulcaddc packer: float inputs are passed through
// fp16_ieee_from_fp32_value() and stored as 16-bit words.
//
// Each tile of cr channels occupies 2 * cr words in packed_w: cr slots filled
// from converted s values, followed by cr slots filled from converted b
// values (or with 0 when b is NULL). If c is not a multiple of cr, only the
// leading (c % cr) slots of each half of the last tile are written; the
// remaining slots are skipped and keep whatever the buffer already contained.
//
// cr must be non-zero, since the tile loop advances by cr. params is not used.
void xnn_pack_f32_to_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t base = 0; base < c; base += cr) {
    const size_t left = c - base;
    const size_t live = left < cr ? left : cr;  // channels actually present

    uint16_t* scale_slots = packed_w;
    uint16_t* bias_slots = packed_w + cr;

    for (size_t i = 0; i < live; i++) {
      scale_slots[i] = fp16_ieee_from_fp32_value(s[base + i]);
    }
    if (b != NULL) {
      for (size_t i = 0; i < live; i++) {
        bias_slots[i] = fp16_ieee_from_fp32_value(b[base + i]);
      }
    } else {
      for (size_t i = 0; i < live; i++) {
        bias_slots[i] = 0;
      }
    }

    // Step over the whole tile, including any unwritten padding slots.
    packed_w = bias_slots + cr;
  }
}
2104*4bdc9457SAndroid Build Coastguard Worker 
// Copies c floats from s into packed_w unchanged (a single memcpy, so the two
// buffers must not overlap). c == 0 copies nothing.
void xnn_pack_f32_prelu_w(
  size_t c,
  const float* s,
  float* packed_w)
{
  const size_t byte_count = c * sizeof(float);
  memcpy(packed_w, s, byte_count);
}
2112*4bdc9457SAndroid Build Coastguard Worker 
// Copies c 16-bit words from s into packed_w unchanged (a single memcpy, so
// the two buffers must not overlap). c == 0 copies nothing.
void xnn_pack_f16_prelu_w(
  size_t c,
  const uint16_t* s,
  uint16_t* packed_w)
{
  const size_t byte_count = c * sizeof(uint16_t);
  memcpy(packed_w, s, byte_count);
}
2120*4bdc9457SAndroid Build Coastguard Worker 
// Converts c floats from s with fp16_ieee_from_fp32_value() and stores the
// resulting 16-bit words, in order, into packed_w.
//
// A counted loop is used so that c == 0 is a no-op, matching the memcpy-based
// xnn_pack_f32_prelu_w / xnn_pack_f16_prelu_w variants. A decrement-and-test
// do/while would wrap the unsigned counter on c == 0 and run far past the end
// of both buffers.
void xnn_pack_f32_to_f16_prelu_w(
  size_t c,
  const float* s,
  uint16_t* packed_w)
{
  for (size_t i = 0; i < c; i++) {
    packed_w[i] = fp16_ieee_from_fp32_value(s[i]);
  }
}
2130