// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include <fp16.h>

#include <fxdiv.h>

#include <xnnpack/indirection.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
19 
20 
void xnn_indirection_init_conv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  // Populates the indirection buffer for a 2D convolution: for every output
  // tile and every kernel tap, stores a pointer to the input pixel that tap
  // reads, or to the operator's zero buffer when the tap falls into padding.
  const void** indirection_buffer = op->indirection_buffer;
  const void* input               = op->input;
  const void* zero                = op->zero_buffer;
  // Convert the per-pixel stride from elements to bytes once, up front.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height       = op->input_height;
  const size_t input_width        = op->input_width;
  const size_t output_height      = op->output_height;
  const size_t output_width       = op->output_width;
  const size_t kernel_height      = op->kernel_height;
  const size_t kernel_width       = op->kernel_width;
  const size_t stride_height      = op->stride_height;
  const size_t stride_width       = op->stride_width;
  const size_t dilation_height    = op->dilation_height;
  const size_t dilation_width     = op->dilation_width;
  const size_t padding_top        = op->padding_top;
  const size_t padding_left       = op->padding_left;

  const size_t output_size = output_height * output_width;
  // The buffer covers a whole number of tiles; the trailing partial tile
  // repeats the final output pixel (see the min() clamp below).
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  // Magic-number divisor: avoids a hardware divide per output pixel.
  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);

  for (size_t tile_start = 0; tile_start < tiled_output_size; tile_start += output_tile_size) {
    for (size_t tile_offset = 0; tile_offset < output_tile_size; tile_offset++) {
      // Clamp so the tail of the last (partial) tile re-reads the final pixel.
      const size_t output_index = min(tile_start + tile_offset, output_size - 1);
      const struct fxdiv_result_size_t yx = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_y = yx.quotient;
      const size_t output_x = yx.remainder;
      for (size_t ky = 0; ky < kernel_height; ky++) {
        // Unsigned arithmetic on purpose: taps above the top padding wrap to
        // a huge value, so one `< input_height` test rejects both sides.
        const size_t iy = output_y * stride_height + ky * dilation_height - padding_top;
        for (size_t kx = 0; kx < kernel_width; kx++) {
          const size_t ix = output_x * stride_width + kx * dilation_width - padding_left;
          const size_t entry =
            tile_start * kernel_size + (ky * kernel_width + kx) * output_tile_size + tile_offset;
          if (iy < input_height && ix < input_width) {
            indirection_buffer[entry] =
              (const void*) ((uintptr_t) input + (iy * input_width + ix) * input_pixel_stride);
          } else {
            // Padded tap: point at the shared zero buffer.
            indirection_buffer[entry] = zero;
          }
        }
      }
    }
  }
}
80 
void xnn_indirection_init_deconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  // Populates the indirection buffer for a 2D deconvolution (transposed
  // convolution): every kernel tap of every output tile points either at the
  // input pixel that contributes through that tap, or at the zero buffer.
  const void** indirection_buffer = op->indirection_buffer;
  const void* input               = op->input;
  // Per-pixel stride converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero                = op->zero_buffer;
  const size_t input_height       = op->input_height;
  const size_t input_width        = op->input_width;
  const size_t output_height      = op->output_height;
  const size_t output_width       = op->output_width;
  const size_t kernel_height      = op->kernel_height;
  const size_t kernel_width       = op->kernel_width;
  const size_t stride_height      = op->stride_height;
  const size_t stride_width       = op->stride_width;
  const size_t dilation_height    = op->dilation_height;
  const size_t dilation_width     = op->dilation_width;
  const size_t padding_top        = op->padding_top;
  const size_t padding_left       = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  // Magic-number divisors: three divisions per tap would otherwise dominate.
  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
  const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height);
  const struct fxdiv_divisor_size_t stride_width_divisor = fxdiv_init_size_t(stride_width);

  for (size_t tile_start = 0; tile_start < tiled_output_size; tile_start += output_tile_size) {
    for (size_t tile_offset = 0; tile_offset < output_tile_size; tile_offset++) {
      // The tail of the last partial tile repeats the final output pixel.
      const size_t output_index = min(tile_start + tile_offset, output_size - 1);
      const struct fxdiv_result_size_t yx = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_y = yx.quotient;
      const size_t output_x = yx.remainder;
      for (size_t ky = 0; ky < kernel_height; ky++) {
        // May wrap around (unsigned); a wrapped value never passes the
        // validity checks below.
        const size_t numer_y = output_y + padding_top - ky * dilation_height;
        const size_t iy = fxdiv_quotient_size_t(numer_y, stride_height_divisor);
        for (size_t kx = 0; kx < kernel_width; kx++) {
          const size_t numer_x = output_x + padding_left - kx * dilation_width;
          const size_t ix = fxdiv_quotient_size_t(numer_x, stride_width_divisor);
          const size_t entry =
            tile_start * kernel_size + (ky * kernel_width + kx) * output_tile_size + tile_offset;
          // A tap contributes only when the offset is an exact multiple of
          // the stride AND the resulting coordinate lies inside the input.
          if (iy * stride_height == numer_y && iy < input_height &&
              ix * stride_width == numer_x && ix < input_width) {
            indirection_buffer[entry] =
              (const void*) ((uintptr_t) input + (iy * input_width + ix) * input_pixel_stride);
          } else {
            indirection_buffer[entry] = zero;
          }
        }
      }
    }
  }
}
135 
void xnn_indirection_init_subconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  // Initializes the indirection buffer and per-subconvolution parameters for
  // a deconvolution decomposed into stride_height * stride_width independent
  // subconvolutions: each (offset_y, offset_x) phase owns the kernel taps and
  // output pixels that align with that phase of the stride.
  const void** indirection_buffer                     = op->indirection_buffer;
  struct subconvolution_params* subconvolution_params = op->subconvolution_buffer;
  const void* input                                   = op->input;
  // Per-pixel stride converted from elements to bytes.
  const size_t input_pixel_stride                     = op->input_pixel_stride << log2_element_size;
  const void* zero                                    = op->zero_buffer;
  const size_t input_height                           = op->input_height;
  const size_t input_width                            = op->input_width;
  const size_t output_height                          = op->output_height;
  const size_t output_width                           = op->output_width;
  const size_t kernel_height                          = op->kernel_height;
  const size_t kernel_width                           = op->kernel_width;
  const size_t stride_height                          = op->stride_height;
  const size_t stride_width                           = op->stride_width;
  const size_t padding_top                            = op->padding_top;
  const size_t padding_left                           = op->padding_left;

  const size_t modulo_padding_top = padding_top % stride_height;
  const size_t modulo_padding_left = padding_left % stride_width;
  // One subconvolution per stride phase.
  for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
    // First output row belonging to this vertical phase.
    const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
    for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
      // First output column belonging to this horizontal phase, and the
      // number of output pixels of this phase within one output row.
      const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
      const size_t sliced_output_width = divide_round_up(output_width - output_x_start, stride_width);

      // Record where this subconvolution's entries begin; the row stride is
      // derived from the column stride.
      // NOTE(review): indirection_x_stride appears to be pre-initialized by
      // the caller before this function runs — confirm.
      subconvolution_params->indirection_buffer = indirection_buffer;
      subconvolution_params->indirection_y_stride =
        subconvolution_params->indirection_x_stride * round_up(sliced_output_width, output_tile_size);
      ++subconvolution_params;

      // Entries are appended sequentially through indirection_buffer.
      for (size_t output_y = output_y_start; output_y < output_height; output_y += stride_height) {
        for (size_t output_tile_start = 0; output_tile_start < sliced_output_width; output_tile_start += output_tile_size) {
          // Only taps whose row matches this vertical phase participate.
          for (size_t kernel_y = offset_y; kernel_y < kernel_height; kernel_y += stride_height) {
            assert(doz(output_y + padding_top, kernel_y) % stride_height == 0);
            const size_t y = output_y + padding_top - kernel_y;
            const size_t input_y = y / stride_height;

            for (size_t kernel_x = offset_x; kernel_x < kernel_width; kernel_x += stride_width) {
              for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
                // Clamp so a partial tile repeats the last pixel of the row.
                const size_t sliced_output_x = min(output_tile_start + output_tile_offset, sliced_output_width - 1);
                const size_t output_x = output_x_start + sliced_output_x * stride_width;

                assert(doz(output_x + padding_left, kernel_x) % stride_width == 0);
                const size_t x = output_x + padding_left - kernel_x;
                const size_t input_x = x / stride_width;

                // Unsigned wrap-around in y/x above makes out-of-range taps
                // fail these comparisons, so they map to the zero buffer.
                if (input_y < input_height && input_x < input_width) {
                  *indirection_buffer++ =
                    (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
                } else {
                  *indirection_buffer++ = zero;
                }
              }
            }
          }
        }
      }
    }
  }
}
200 
void xnn_indirection_init_dwconv2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  size_t primary_tile,
  uint32_t log2_element_size)
{
  // Populates the indirection buffer for a depthwise 2D convolution. The
  // entry index formula below stores, for each output pixel, its taps with
  // kernel_y varying fastest within each kernel column.
  const void** indirection_buffer = op->indirection_buffer;
  const void* input               = op->input;
  // Per-pixel stride converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero                = op->zero_buffer;
  const size_t input_height       = op->input_height;
  const size_t input_width        = op->input_width;
  const size_t output_height      = op->output_height;
  const size_t output_width       = op->output_width;
  const size_t kernel_height      = op->kernel_height;
  const size_t kernel_width       = op->kernel_width;
  const size_t stride_height      = op->stride_height;
  const size_t stride_width       = op->stride_width;
  const size_t dilation_height    = op->dilation_height;
  const size_t dilation_width     = op->dilation_width;
  const size_t padding_top        = op->padding_top;
  const size_t padding_left       = op->padding_left;

  for (size_t oy = 0; oy < output_height; oy++) {
    for (size_t ky = 0; ky < kernel_height; ky++) {
      // Unsigned wrap-around lets a single comparison reject taps that fall
      // either above (wrapped to a huge value) or below the input.
      const size_t iy = oy * stride_height + ky * dilation_height - padding_top;
      for (size_t ox = 0; ox < output_width; ox++) {
        for (size_t kx = 0; kx < kernel_width; kx++) {
          const size_t ix = ox * stride_width + kx * dilation_width - padding_left;
          const size_t entry =
            oy * step_height + ox * step_width * kernel_height + kx * kernel_height + ky;
          if (iy < input_height && ix < input_width) {
            indirection_buffer[entry] =
              (const void*) ((uintptr_t) input + (iy * input_width + ix) * input_pixel_stride);
          } else {
            indirection_buffer[entry] = zero;  // padded tap
          }
        }
      }
    }
  }

  // Micro-kernels whose primary tile exceeds the kernel size read entries
  // beyond the last pixel's taps; replicate the final written entry there so
  // those extra reads stay valid.
  const void* last_output_pixel = indirection_buffer[output_height * step_height - 1];
  const size_t last_kernel_index = output_height * step_height - (kernel_height * kernel_width);
  for (size_t tile_index = kernel_height * kernel_width; tile_index < primary_tile; tile_index++) {
    indirection_buffer[last_kernel_index + tile_index] = last_output_pixel;
  }
}
258 
void xnn_indirection_init_maxpool2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  // Initializes the indirection buffer for 2D max pooling. Unlike the
  // convolution variants, padded taps are redirected to an in-range input
  // pixel instead of a zero buffer (max over duplicated valid pixels equals
  // the max over the valid window).
  const void** indirection_buffer = op->indirection_buffer;
  const void* input               = op->input;
  // Per-pixel stride converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height       = op->input_height;
  const size_t input_width        = op->input_width;
  const size_t output_height      = op->output_height;
  const size_t output_width       = op->output_width;
  const size_t pooling_height     = op->kernel_height;
  const size_t pooling_width      = op->kernel_width;
  const size_t stride_height      = op->stride_height;
  const size_t stride_width       = op->stride_width;
  const size_t dilation_height    = op->dilation_height;
  const size_t dilation_width     = op->dilation_width;
  const size_t input_padding_top  = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const bool any_dilation = (dilation_height | dilation_width) > 1;

  if (any_dilation) {
    // Clamp to the border doesn't work for pooling with dilation: clamping
    // could land on a pixel not belonging to the dilated grid. Instead,
    // out-of-range taps are replaced by a "safe" in-phase coordinate.
    const size_t adjusted_padding_top = input_padding_top % dilation_height;
    const size_t adjusted_padding_left = input_padding_left % dilation_width;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        // Fallback row on the same dilation phase as the real taps.
        // NOTE(review): presumed in-bounds for any valid operator setup —
        // duplicating an already-covered pixel does not change the max.
        size_t safe_input_y = output_y * stride_height;
        if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) {
          safe_input_y += dilation_height;
        }
        safe_input_y -= adjusted_padding_top;

        // Unsigned wrap-around: rows above the top padding become huge and
        // fail the range check below.
        size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top;
        if XNN_UNPREDICTABLE(input_y >= input_height) {
          input_y = safe_input_y;
        }

        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            // Same fallback scheme in the horizontal direction.
            size_t safe_input_x = output_x * stride_width;
            if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) {
              safe_input_x += dilation_width;
            }
            safe_input_x -= adjusted_padding_left;

            size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left;
            if XNN_UNPREDICTABLE(input_x >= input_width) {
              input_x = safe_input_x;
            }

            // Taps for one pixel are stored with pooling_y varying fastest
            // within each pooling column.
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  } else {
    // No dilation: clamp out-of-range taps to the nearest border pixel.
    // doz() is a saturating (difference-or-zero) subtraction, so taps in the
    // top/left padding clamp to 0 rather than wrapping.
    const size_t input_x_max = input_width - 1;
    const size_t input_y_max = input_height - 1;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max);
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max);
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  }
}
336 
void xnn_indirection_init_resize_bilinear2d_hwc_f16(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  void* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  // For each output pixel, emits pointers to its four bilinear source pixels
  // (top-left, top-right, bottom-left, bottom-right) into indirection_buffer
  // and the two interpolation weights (horizontal, vertical) into
  // packed_weights as IEEE half-precision values.
  // Dimensions must fit in 24 bits so float coordinate math stays exact.
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners, the scale maps the last output sample exactly onto
  // the last input sample (unless the dimension is 1).
  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  uint16_t* w = (uint16_t*) packed_weights;
  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Legacy / align-corners sampling: no half-pixel offset; the asserts
    // show the coordinate is always in range, so no clamping is needed.
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      // Bottom neighbor clamped to the last row.
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        // Weights are stored as fp16; interleaved (x, y) per pixel.
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 4;
        w += 2;
      }
    }
  } else {
    // Half-pixel-centers sampling: offset by half a pixel, then clamp the
    // continuous coordinate into the valid input range.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 4;
        w += 2;
      }
    }
  }
}
432 
void xnn_indirection_init_resize_bilinear2d_hwc_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  // For each output pixel, emits pointers to its four bilinear source pixels
  // (top-left, top-right, bottom-left, bottom-right) into indirection_buffer
  // and the two interpolation weights (horizontal, vertical) into
  // packed_weights as floats.
  // Dimensions must fit in 24 bits so float coordinate math stays exact.
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners, the scale maps the last output sample exactly onto
  // the last input sample (unless the dimension is 1).
  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t y_max = (uint32_t) input_height - 1;
  const uint32_t x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Legacy / align-corners sampling: no half-pixel offset; coordinates are
    // guaranteed in range (see asserts), so no clamping is required.
    for (size_t oy = 0; oy < output_height; oy++) {
      const float sample_y = (float) (int32_t) oy * height_scale;
      assert(sample_y >= 0.0f);
      assert(sample_y < (float) input_height);

      const uint32_t y_top = (uint32_t) (int32_t) sample_y;
      const uint32_t y_bottom = math_min_u32(y_top + 1, y_max);  // clamp to last row
      const float alpha_y = sample_y - (float) y_top;
      // Byte offsets of the two source rows, hoisted out of the x loop.
      const uintptr_t top_row = (uintptr_t) input + y_top * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + y_bottom * input_width * input_pixel_stride;
      for (size_t ox = 0; ox < output_width; ox++) {
        const float sample_x = (float) (int32_t) ox * width_scale;
        assert(sample_x >= 0.0f);
        assert(sample_x < (float) input_width);

        const uint32_t x_left = (uint32_t) (int32_t) sample_x;
        const uint32_t x_right = math_min_u32(x_left + 1, x_max);  // clamp to last column
        const float alpha_x = sample_x - (float) x_left;
        indirection_buffer[0] = (void*) (top_row + x_left * input_pixel_stride);
        indirection_buffer[1] = (void*) (top_row + x_right * input_pixel_stride);
        indirection_buffer[2] = (void*) (bottom_row + x_left * input_pixel_stride);
        indirection_buffer[3] = (void*) (bottom_row + x_right * input_pixel_stride);
        // Weights interleaved as (x, y) per output pixel.
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    // Half-pixel-centers sampling: offset by half a pixel, then clamp the
    // continuous coordinate into the valid input range.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t oy = 0; oy < output_height; oy++) {
      float sample_y = (float) (int32_t) oy * height_scale + height_offset;
      sample_y = math_min_f32(math_max_f32(sample_y, 0.0f), (float) y_max);
      const uint32_t y_top = (uint32_t) (int32_t) sample_y;
      assert((int32_t) y_top >= 0);
      const uint32_t y_bottom = math_min_u32(y_top + 1, y_max);
      const float alpha_y = sample_y - (float) y_top;
      const uintptr_t top_row = (uintptr_t) input + y_top * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + y_bottom * input_width * input_pixel_stride;
      for (size_t ox = 0; ox < output_width; ox++) {
        float sample_x = (float) (int32_t) ox * width_scale + width_offset;
        sample_x = math_min_f32(math_max_f32(sample_x, 0.0f), (float) x_max);
        const uint32_t x_left = (uint32_t) (int32_t) sample_x;
        assert((int32_t) x_left >= 0);
        const uint32_t x_right = math_min_u32(x_left + 1, x_max);
        const float alpha_x = sample_x - (float) x_left;
        indirection_buffer[0] = (void*) (top_row + x_left * input_pixel_stride);
        indirection_buffer[1] = (void*) (top_row + x_right * input_pixel_stride);
        indirection_buffer[2] = (void*) (bottom_row + x_left * input_pixel_stride);
        indirection_buffer[3] = (void*) (bottom_row + x_right * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}
527 
xnn_indirection_init_resize_bilinear2d_hwc_q11(size_t input_pixel_stride,size_t input_height,size_t input_width,size_t output_height,size_t output_width,const void * input,const void ** indirection_buffer,int16_t * packed_weights,bool align_corners,bool tensorflow_legacy)528 void xnn_indirection_init_resize_bilinear2d_hwc_q11(
529   size_t input_pixel_stride,
530   size_t input_height,
531   size_t input_width,
532   size_t output_height,
533   size_t output_width,
534   const void* input,
535   const void** indirection_buffer,
536   int16_t* packed_weights,
537   bool align_corners,
538   bool tensorflow_legacy)
539 {
540   assert(input_height != 0);
541   assert(input_height < 16777216 /* 2**24 */);
542   assert(input_width != 0);
543   assert(input_width < 16777216 /* 2**24 */);
544   assert(output_height != 0);
545   assert(output_height < 16777216 /* 2**24 */);
546   assert(output_width != 0);
547   assert(output_width < 16777216 /* 2**24 */);
548 
549   const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
550   const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
551   const float width_scale =
552     (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
553   const float height_scale =
554     (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);
555 
556   const uint32_t input_y_max = (uint32_t) input_height - 1;
557   const uint32_t input_x_max = (uint32_t) input_width - 1;
558   if (tensorflow_legacy || align_corners) {
559     for (size_t output_y = 0; output_y < output_height; output_y++) {
560       const float input_y = (float) (int32_t) output_y * height_scale;
561       assert(input_y >= 0.0f);
562       assert(input_y < (float) input_height);
563 
564       const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
565       const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
566       const float alpha_y = input_y - (float) input_y_top;
567       for (size_t output_x = 0; output_x < output_width; output_x++) {
568         const float input_x = (float) (int32_t) output_x * width_scale;
569         assert(input_x >= 0.0f);
570         assert(input_x < (float) input_width);
571 
572         const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
573         const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
574         const float alpha_x = input_x - (float) input_x_left;
575         indirection_buffer[0] =
576           (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
577         indirection_buffer[1] =
578           (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
579         indirection_buffer[2] =
580           (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
581         indirection_buffer[3] =
582           (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
583         packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
584         packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
585         indirection_buffer += 4;
586         packed_weights += 2;
587       }
588     }
589   } else {
590     const float height_offset = 0.5f * height_scale - 0.5f;
591     const float width_offset = 0.5f * width_scale - 0.5f;
592     for (size_t output_y = 0; output_y < output_height; output_y++) {
593       float input_y = (float) (int32_t) output_y * height_scale + height_offset;
594       input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
595       const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
596       assert((int32_t) input_y_top >= 0);
597       const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
598       const float alpha_y = input_y - (float) input_y_top;
599       for (size_t output_x = 0; output_x < output_width; output_x++) {
600         float input_x = (float) (int32_t) output_x * width_scale + width_offset;
601         input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
602         const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
603         assert((int32_t) input_x_left >= 0);
604         const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
605         const float alpha_x = input_x - (float) input_x_left;
606         indirection_buffer[0] =
607           (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
608         indirection_buffer[1] =
609           (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
610         indirection_buffer[2] =
611           (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
612         indirection_buffer[3] =
613           (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
614         packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
615         packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
616         indirection_buffer += 4;
617         packed_weights += 2;
618       }
619     }
620   }
621 }
622 
// Fills the indirection buffer and packed FP16 weights for a CHW bilinear
// resize: for each output pixel it stores two row pointers (top-left and
// bottom-left source pixels) plus the (alpha_x, alpha_y) interpolation
// weights converted to IEEE half precision.
void xnn_indirection_init_resize_bilinear2d_chw_f16(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  void* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners the corner pixels of input and output coincide, which
  // is equivalent to shrinking both extents by one before computing the scale.
  const int32_t adjust_w = (int32_t) (align_corners && output_width != 1);
  const int32_t adjust_h = (int32_t) (align_corners && output_height != 1);
  const float scale_x =
    (float) ((int32_t) input_width - adjust_w) / (float) ((int32_t) output_width - adjust_w);
  const float scale_y =
    (float) ((int32_t) input_height - adjust_h) / (float) ((int32_t) output_height - adjust_h);

  uint16_t* weights = (uint16_t*) packed_weights;
  const uint32_t last_y = (uint32_t) input_height - 1;
  const uint32_t last_x = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Legacy / align-corners mapping: no half-pixel offset, coordinates are
    // guaranteed in range so no clamping is required.
    for (size_t oy = 0; oy < output_height; oy++) {
      const float fy = (float) (int32_t) oy * scale_y;
      assert(fy >= 0.0f);
      assert(fy < (float) input_height);

      const uint32_t y_top = (uint32_t) (int32_t) fy;
      const uint32_t y_bottom = math_min_u32(y_top + 1, last_y);
      const float alpha_y = fy - (float) y_top;
      const uintptr_t top_row = (uintptr_t) input + (size_t) y_top * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y_bottom * input_width * input_pixel_stride;
      for (size_t ox = 0; ox < output_width; ox++) {
        const float fx = (float) (int32_t) ox * scale_x;
        assert(fx >= 0.0f);
        assert(fx < (float) input_width);

        uint32_t x_left = (uint32_t) (int32_t) fx;
        float alpha_x = fx - (float) x_left;
        if (x_left == last_x) {
          // Some CHW kernels require a valid pixel to the right of the one
          // pointed at: step one pixel left and make the weight exact.
          x_left -= 1;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] = (void*) (top_row + (size_t) x_left * input_pixel_stride);
        indirection_buffer[1] = (void*) (bottom_row + (size_t) x_left * input_pixel_stride);
        weights[0] = fp16_ieee_from_fp32_value(alpha_x);
        weights[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 2;
        weights += 2;
      }
    }
  } else {
    // Half-pixel-centers mapping: offset by half a pixel, then clamp the
    // source coordinate into the valid input range.
    const float offset_y = 0.5f * scale_y - 0.5f;
    const float offset_x = 0.5f * scale_x - 0.5f;
    for (size_t oy = 0; oy < output_height; oy++) {
      float fy = (float) (int32_t) oy * scale_y + offset_y;
      fy = math_min_f32(math_max_f32(fy, 0.0f), (float) last_y);
      const uint32_t y_top = (uint32_t) (int32_t) fy;
      assert((int32_t) y_top >= 0);
      const uint32_t y_bottom = math_min_u32(y_top + 1, last_y);
      const float alpha_y = fy - (float) y_top;
      const uintptr_t top_row = (uintptr_t) input + (size_t) y_top * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y_bottom * input_width * input_pixel_stride;
      for (size_t ox = 0; ox < output_width; ox++) {
        float fx = (float) (int32_t) ox * scale_x + offset_x;
        fx = math_min_f32(math_max_f32(fx, 0.0f), (float) last_x);
        uint32_t x_left = (uint32_t) (int32_t) fx;
        assert((int32_t) x_left >= 0);

        float alpha_x = fx - (float) x_left;
        if (x_left == last_x) {
          // Some CHW kernels require a valid pixel to the right of the one
          // pointed at: step one pixel left and make the weight exact.
          x_left -= 1;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] = (void*) (top_row + (size_t) x_left * input_pixel_stride);
        indirection_buffer[1] = (void*) (bottom_row + (size_t) x_left * input_pixel_stride);
        weights[0] = fp16_ieee_from_fp32_value(alpha_x);
        weights[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 2;
        weights += 2;
      }
    }
  }
}
723 
// Fills the indirection buffer and packed FP32 weights for a CHW bilinear
// resize: for each output pixel it stores two row pointers (top-left and
// bottom-left source pixels) plus the (alpha_x, alpha_y) interpolation
// weights.
void xnn_indirection_init_resize_bilinear2d_chw_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners the corner pixels of input and output coincide, which
  // is equivalent to shrinking both extents by one before computing the scale.
  const int32_t adjust_w = (int32_t) (align_corners && output_width != 1);
  const int32_t adjust_h = (int32_t) (align_corners && output_height != 1);
  const float scale_x =
    (float) ((int32_t) input_width - adjust_w) / (float) ((int32_t) output_width - adjust_w);
  const float scale_y =
    (float) ((int32_t) input_height - adjust_h) / (float) ((int32_t) output_height - adjust_h);

  const uint32_t last_y = (uint32_t) input_height - 1;
  const uint32_t last_x = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Legacy / align-corners mapping: no half-pixel offset, coordinates are
    // guaranteed in range so no clamping is required.
    for (size_t oy = 0; oy < output_height; oy++) {
      const float fy = (float) (int32_t) oy * scale_y;
      assert(fy >= 0.0f);
      assert(fy < (float) input_height);

      const uint32_t y_top = (uint32_t) (int32_t) fy;
      const uint32_t y_bottom = math_min_u32(y_top + 1, last_y);
      const float alpha_y = fy - (float) y_top;
      const uintptr_t top_row = (uintptr_t) input + (size_t) y_top * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y_bottom * input_width * input_pixel_stride;
      for (size_t ox = 0; ox < output_width; ox++) {
        const float fx = (float) (int32_t) ox * scale_x;
        assert(fx >= 0.0f);
        assert(fx < (float) input_width);

        uint32_t x_left = (uint32_t) (int32_t) fx;
        float alpha_x = fx - (float) x_left;
        if (x_left == last_x) {
          // Some CHW kernels require a valid pixel to the right of the one
          // pointed at: step one pixel left and make the weight exact.
          x_left -= 1;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] = (void*) (top_row + (size_t) x_left * input_pixel_stride);
        indirection_buffer[1] = (void*) (bottom_row + (size_t) x_left * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  } else {
    // Half-pixel-centers mapping: offset by half a pixel, then clamp the
    // source coordinate into the valid input range.
    const float offset_y = 0.5f * scale_y - 0.5f;
    const float offset_x = 0.5f * scale_x - 0.5f;
    for (size_t oy = 0; oy < output_height; oy++) {
      float fy = (float) (int32_t) oy * scale_y + offset_y;
      fy = math_min_f32(math_max_f32(fy, 0.0f), (float) last_y);
      const uint32_t y_top = (uint32_t) (int32_t) fy;
      assert((int32_t) y_top >= 0);
      const uint32_t y_bottom = math_min_u32(y_top + 1, last_y);
      const float alpha_y = fy - (float) y_top;
      const uintptr_t top_row = (uintptr_t) input + (size_t) y_top * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y_bottom * input_width * input_pixel_stride;
      for (size_t ox = 0; ox < output_width; ox++) {
        float fx = (float) (int32_t) ox * scale_x + offset_x;
        fx = math_min_f32(math_max_f32(fx, 0.0f), (float) last_x);
        uint32_t x_left = (uint32_t) (int32_t) fx;
        assert((int32_t) x_left >= 0);

        float alpha_x = fx - (float) x_left;
        if (x_left == last_x) {
          // Some CHW kernels require a valid pixel to the right of the one
          // pointed at: step one pixel left and make the weight exact.
          x_left -= 1;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] = (void*) (top_row + (size_t) x_left * input_pixel_stride);
        indirection_buffer[1] = (void*) (bottom_row + (size_t) x_left * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  }
}
823 
// Initializes the indirection buffer for a 2D unpooling operator: for every
// input pixel and every position inside the pooling window, the buffer stores
// a pointer to the output pixel that position scatters to. Positions that
// fall into the output padding are clamped to the nearest valid output pixel,
// so every buffer entry is a dereferenceable output address.
//
// op                - unpooling operator; geometry fields and the indirection
//                     buffer pointer are read from it, and its output pointer
//                     is baked into the buffer entries.
// batch_start       - first batch image to (re-)initialize; entries for
//                     earlier images are left untouched.
// log2_element_size - log2 of the element size in bytes, used to convert the
//                     output pixel stride from elements to bytes.
void xnn_indirection_init_unpool2d(
  xnn_operator_t op,
  size_t batch_start,
  uint32_t log2_element_size)
{
  const void** indirection_buffer  = op->indirection_buffer;
  const void* output               = op->output;
  // Stride is stored in elements; shift converts it to bytes.
  const size_t output_pixel_stride = op->output_pixel_stride << log2_element_size;
  const size_t batch_size          = op->batch_size;
  const size_t input_height        = op->input_height;
  const size_t input_width         = op->input_width;
  const size_t output_height       = op->output_height;
  const size_t output_width        = op->output_width;
  const size_t pooling_height      = op->kernel_height;
  const size_t pooling_width       = op->kernel_width;
  const size_t output_padding_top  = op->padding_top;
  const size_t output_padding_left = op->padding_left;

  for (size_t image = batch_start; image < batch_size; image++) {
    for (size_t input_y = 0; input_y < input_height; input_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        // doz() subtracts with saturation at zero (clamps into the top
        // padding); min() clamps at the bottom edge of the output.
        const size_t output_y = min(doz(input_y * pooling_height + pooling_y, output_padding_top), output_height - 1);
        for (size_t input_x = 0; input_x < input_width; input_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            // Same clamping for the horizontal coordinate.
            const size_t output_x = min(doz(input_x * pooling_width + pooling_x, output_padding_left), output_width - 1);
            // Buffer layout: image-major, then input pixel (row-major), then
            // pooling_x-major / pooling_y-minor within the window — note the
            // window coordinates are stored transposed relative to iteration.
            indirection_buffer[(((image * input_height + input_y) * input_width + input_x) * pooling_width + pooling_x) * pooling_height + pooling_y] =
              (const void*) ((uintptr_t) output + ((image * output_height + output_y) * output_width + output_x) * output_pixel_stride);
          }
        }
      }
    }
  }
}
857 
// Fills the pixelwise buffer for FP16 padded average pooling: one IEEE
// half-precision value per output pixel, equal to the reciprocal of the
// number of valid input pixels in that pixel's (edge-clipped) pooling window.
void xnn_indirection_init_pavgpool2d_f16(
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  size_t pooling_height,
  size_t pooling_width,
  size_t stride_height,
  size_t stride_width,
  size_t padding_top,
  size_t padding_left,
  uint16_t* pixelwise_buffer)
{
  for (size_t oy = 0; oy < output_height; oy++) {
    // doz() saturates at zero, clipping the window against the top edge;
    // min() clips it against the bottom edge of the input.
    const size_t y_begin = doz(oy * stride_height, padding_top);
    const size_t y_end = min(doz(oy * stride_height + pooling_height, padding_top), input_height);
    const uint32_t rows = (uint32_t) (y_end - y_begin);
    for (size_t ox = 0; ox < output_width; ox++) {
      // Same clipping for the horizontal extent of the window.
      const size_t x_begin = doz(ox * stride_width, padding_left);
      const size_t x_end = min(doz(ox * stride_width + pooling_width, padding_left), input_width);
      const uint32_t cols = (uint32_t) (x_end - x_begin);
      const float scale = 1.0f / ((float) (int32_t) (rows * cols));
      *pixelwise_buffer++ = fp16_ieee_from_fp32_value(scale);
    }
  }
}
883 
// Fills the pixelwise buffer for FP32 padded average pooling: one float per
// output pixel, equal to the reciprocal of the number of valid input pixels
// in that pixel's (edge-clipped) pooling window.
void xnn_indirection_init_pavgpool2d_f32(
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  size_t pooling_height,
  size_t pooling_width,
  size_t stride_height,
  size_t stride_width,
  size_t padding_top,
  size_t padding_left,
  float* pixelwise_buffer)
{
  for (size_t oy = 0; oy < output_height; oy++) {
    // doz() saturates at zero, clipping the window against the top edge;
    // min() clips it against the bottom edge of the input.
    const size_t y_begin = doz(oy * stride_height, padding_top);
    const size_t y_end = min(doz(oy * stride_height + pooling_height, padding_top), input_height);
    const uint32_t rows = (uint32_t) (y_end - y_begin);
    for (size_t ox = 0; ox < output_width; ox++) {
      // Same clipping for the horizontal extent of the window.
      const size_t x_begin = doz(ox * stride_width, padding_left);
      const size_t x_end = min(doz(ox * stride_width + pooling_width, padding_left), input_width);
      const uint32_t cols = (uint32_t) (x_end - x_begin);
      *pixelwise_buffer++ = 1.0f / ((float) (int32_t) (rows * cols));
    }
  }
}
909