1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #pragma once
10 
11 #include <gtest/gtest.h>
12 
13 #include <algorithm>
14 #include <cassert>
15 #include <cmath>
16 #include <cstddef>
17 #include <cstdlib>
18 #include <limits>
19 #include <random>
20 #include <vector>
21 
22 #include <fp16.h>
23 
24 #include <xnnpack.h>
25 #include <xnnpack/cache.h>
26 
27 namespace {
28 
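// "Difference or zero": returns a - b when a > b, and 0 otherwise (saturating subtraction on unsigned types).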
29 template<class T>
30 inline T doz(T a, T b) {
31   return a > b ? a - b : T(0);
32 }
33 
34 }  // namespace
35 
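// Builder-style test harness for XNNPACK 2-D deconvolution (transposed convolution) operators.
// Parameters are configured through the fluent setters below; each Test*() method then computes
// a reference result, creates and runs the corresponding xnn_deconvolution2d_nhwc_* operator,
// and compares the two.
//
// Minimal usage sketch (the parameter values are illustrative only):
//
//   DeconvolutionOperatorTester()
//     .input_size(5, 7)
//     .kernel_size(3, 3)
//     .stride(2)
//     .group_input_channels(15)
//     .group_output_channels(17)
//     .iterations(3)
//     .TestF32();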
36 class DeconvolutionOperatorTester {
37  public:
38   enum class WeightsType {
39     Default,
40     FP32,
41   };
42 
43   inline DeconvolutionOperatorTester& padding(uint32_t padding) {
44     this->padding_top_ = padding;
45     this->padding_right_ = padding;
46     this->padding_bottom_ = padding;
47     this->padding_left_ = padding;
48     return *this;
49   }
50 
51   inline DeconvolutionOperatorTester& padding_height(uint32_t padding_height) {
52     this->padding_top_ = padding_height;
53     this->padding_bottom_ = padding_height;
54     return *this;
55   }
56 
57   inline uint32_t padding_height() const {
58     return this->padding_top_ + this->padding_bottom_;
59   }
60 
61   inline DeconvolutionOperatorTester& padding_width(uint32_t padding_width) {
62     this->padding_right_ = padding_width;
63     this->padding_left_ = padding_width;
64     return *this;
65   }
66 
67   inline uint32_t padding_width() const {
68     return this->padding_left_ + this->padding_right_;
69   }
70 
71   inline DeconvolutionOperatorTester& padding_top(uint32_t padding_top) {
72     this->padding_top_ = padding_top;
73     return *this;
74   }
75 
76   inline uint32_t padding_top() const { return this->padding_top_; }
77 
78   inline DeconvolutionOperatorTester& padding_right(uint32_t padding_right) {
79     this->padding_right_ = padding_right;
80     return *this;
81   }
82 
83   inline uint32_t padding_right() const { return this->padding_right_; }
84 
85   inline DeconvolutionOperatorTester& padding_bottom(uint32_t padding_bottom) {
86     this->padding_bottom_ = padding_bottom;
87     return *this;
88   }
89 
90   inline uint32_t padding_bottom() const { return this->padding_bottom_; }
91 
92   inline DeconvolutionOperatorTester& padding_left(uint32_t padding_left) {
93     this->padding_left_ = padding_left;
94     return *this;
95   }
96 
97   inline uint32_t padding_left() const { return this->padding_left_; }
98 
99   inline DeconvolutionOperatorTester& adjustment_height(uint32_t adjustment_height) {
100     this->adjustment_height_ = adjustment_height;
101     return *this;
102   }
103 
104   inline uint32_t adjustment_height() const {
105     return this->adjustment_height_;
106   }
107 
108   inline DeconvolutionOperatorTester& adjustment_width(uint32_t adjustment_width) {
109     this->adjustment_width_ = adjustment_width;
110     return *this;
111   }
112 
113   inline uint32_t adjustment_width() const {
114     return this->adjustment_width_;
115   }
116 
117   inline DeconvolutionOperatorTester& input_size(uint32_t input_height, uint32_t input_width) {
118     assert(input_height >= 1);
119     assert(input_width >= 1);
120     this->input_height_ = input_height;
121     this->input_width_ = input_width;
122     return *this;
123   }
124 
125   inline DeconvolutionOperatorTester& input_height(uint32_t input_height) {
126     assert(input_height >= 1);
127     this->input_height_ = input_height;
128     return *this;
129   }
130 
131   inline uint32_t input_height() const {
132     return this->input_height_;
133   }
134 
135   inline DeconvolutionOperatorTester& input_width(uint32_t input_width) {
136     assert(input_width >= 1);
137     this->input_width_ = input_width;
138     return *this;
139   }
140 
141   inline uint32_t input_width() const {
142     return this->input_width_;
143   }
144 
145   inline DeconvolutionOperatorTester& groups(uint32_t groups) {
146     assert(groups >= 1);
147     this->groups_ = groups;
148     return *this;
149   }
150 
151   inline uint32_t groups() const {
152     return this->groups_;
153   }
154 
155   inline DeconvolutionOperatorTester& group_input_channels(size_t group_input_channels) {
156     assert(group_input_channels >= 1);
157     this->group_input_channels_ = group_input_channels;
158     return *this;
159   }
160 
161   inline size_t group_input_channels() const {
162     return this->group_input_channels_;
163   }
164 
165   inline DeconvolutionOperatorTester& group_output_channels(size_t group_output_channels) {
166     assert(group_output_channels >= 1);
167     this->group_output_channels_ = group_output_channels;
168     return *this;
169   }
170 
171   inline size_t group_output_channels() const {
172     return this->group_output_channels_;
173   }
174 
175   inline DeconvolutionOperatorTester& batch_size(size_t batch_size) {
176     assert(batch_size >= 1);
177     this->batch_size_ = batch_size;
178     return *this;
179   }
180 
181   inline size_t batch_size() const {
182     return this->batch_size_;
183   }
184 
185   inline DeconvolutionOperatorTester& kernel_size(uint32_t kernel_size) {
186     assert(kernel_size >= 1);
187     this->kernel_height_ = kernel_size;
188     this->kernel_width_ = kernel_size;
189     return *this;
190   }
191 
192   inline DeconvolutionOperatorTester& kernel_size(uint32_t kernel_height, uint32_t kernel_width) {
193     assert(kernel_height >= 1);
194     assert(kernel_width >= 1);
195     this->kernel_height_ = kernel_height;
196     this->kernel_width_ = kernel_width;
197     return *this;
198   }
199 
200   inline DeconvolutionOperatorTester& kernel_height(uint32_t kernel_height) {
201     assert(kernel_height >= 1);
202     this->kernel_height_ = kernel_height;
203     return *this;
204   }
205 
206   inline uint32_t kernel_height() const {
207     return this->kernel_height_;
208   }
209 
210   inline DeconvolutionOperatorTester& kernel_width(uint32_t kernel_width) {
211     assert(kernel_width >= 1);
212     this->kernel_width_ = kernel_width;
213     return *this;
214   }
215 
216   inline uint32_t kernel_width() const {
217     return this->kernel_width_;
218   }
219 
220   inline DeconvolutionOperatorTester& dilation(uint32_t dilation) {
221     assert(dilation >= 1);
222     this->dilation_height_ = dilation;
223     this->dilation_width_ = dilation;
224     return *this;
225   }
226 
227   inline DeconvolutionOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
228     assert(dilation_height >= 1);
229     assert(dilation_width >= 1);
230     this->dilation_height_ = dilation_height;
231     this->dilation_width_ = dilation_width;
232     return *this;
233   }
234 
235   inline DeconvolutionOperatorTester& dilation_height(uint32_t dilation_height) {
236     assert(dilation_height >= 1);
237     this->dilation_height_ = dilation_height;
238     return *this;
239   }
240 
241   inline uint32_t dilation_height() const {
242     return this->dilation_height_;
243   }
244 
245   inline DeconvolutionOperatorTester& dilation_width(uint32_t dilation_width) {
246     assert(dilation_width >= 1);
247     this->dilation_width_ = dilation_width;
248     return *this;
249   }
250 
251   inline uint32_t dilation_width() const {
252     return this->dilation_width_;
253   }
254 
255   inline DeconvolutionOperatorTester& stride(uint32_t stride) {
256     assert(stride >= 1);
257     this->stride_height_ = stride;
258     this->stride_width_ = stride;
259     return *this;
260   }
261 
262   inline DeconvolutionOperatorTester& stride(uint32_t stride_height, uint32_t stride_width) {
263     assert(stride_height >= 1);
264     assert(stride_width >= 1);
265     this->stride_height_ = stride_height;
266     this->stride_width_ = stride_width;
267     return *this;
268   }
269 
270   inline DeconvolutionOperatorTester& stride_height(uint32_t stride_height) {
271     assert(stride_height >= 1);
272     this->stride_height_ = stride_height;
273     return *this;
274   }
275 
276   inline uint32_t stride_height() const {
277     return this->stride_height_;
278   }
279 
280   inline DeconvolutionOperatorTester& stride_width(uint32_t stride_width) {
281     assert(stride_width >= 1);
282     this->stride_width_ = stride_width;
283     return *this;
284   }
285 
286   inline uint32_t stride_width() const {
287     return this->stride_width_;
288   }
289 
290   inline DeconvolutionOperatorTester& input_pixel_stride(size_t input_pixel_stride) {
291     assert(input_pixel_stride >= 1);
292     this->input_pixel_stride_ = input_pixel_stride;
293     return *this;
294   }
295 
296   inline size_t input_pixel_stride() const {
297     if (this->input_pixel_stride_ == 0) {
298       return group_input_channels() * groups();
299     } else {
300       assert(this->input_pixel_stride_ >= group_input_channels() * groups());
301       return this->input_pixel_stride_;
302     }
303   }
304 
305   inline DeconvolutionOperatorTester& output_pixel_stride(size_t output_pixel_stride) {
306     assert(output_pixel_stride >= 1);
307     this->output_pixel_stride_ = output_pixel_stride;
308     return *this;
309   }
310 
311   inline size_t output_pixel_stride() const {
312     if (this->output_pixel_stride_ == 0) {
313       return group_output_channels() * groups();
314     } else {
315       assert(this->output_pixel_stride_ >= group_output_channels() * groups());
316       return this->output_pixel_stride_;
317     }
318   }
319 
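  // Spatial output size of the transposed convolution:
  //   output = stride * (input - 1) + adjustment + dilated_kernel - total_padding,
  // where dilated_kernel = (kernel - 1) * dilation + 1, as computed by the helpers below.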
320   inline uint32_t dilated_kernel_height() const {
321     return (kernel_height() - 1) * dilation_height() + 1;
322   }
323 
324   inline uint32_t dilated_kernel_width() const {
325     return (kernel_width() - 1) * dilation_width() + 1;
326   }
327 
328   inline size_t output_height() const {
329     return stride_height() * (input_height() - 1) + adjustment_height() + dilated_kernel_height() - padding_height();
330   }
331 
332   inline size_t output_width() const {
333     return stride_width() * (input_width() - 1) + adjustment_width() + dilated_kernel_width() - padding_width();
334   }
335 
336   inline DeconvolutionOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
337     assert(next_input_height >= 1);
338     assert(next_input_width >= 1);
339     this->next_input_height_ = next_input_height;
340     this->next_input_width_ = next_input_width;
341     return *this;
342   }
343 
344   inline DeconvolutionOperatorTester& next_input_height(uint32_t next_input_height) {
345     assert(next_input_height >= 1);
346     this->next_input_height_ = next_input_height;
347     return *this;
348   }
349 
350   inline uint32_t next_input_height() const {
351     if (this->next_input_height_ == 0) {
352       return input_height();
353     } else {
354       return this->next_input_height_;
355     }
356   }
357 
358   inline DeconvolutionOperatorTester& next_input_width(uint32_t next_input_width) {
359     assert(next_input_width >= 1);
360     this->next_input_width_ = next_input_width;
361     return *this;
362   }
363 
364   inline uint32_t next_input_width() const {
365     if (this->next_input_width_ == 0) {
366       return input_width();
367     } else {
368       return this->next_input_width_;
369     }
370   }
371 
372   inline size_t next_output_height() const {
373     return stride_height() * (next_input_height() - 1) + adjustment_height() + dilated_kernel_height() - padding_height();
374   }
375 
376   inline size_t next_output_width() const {
377     return stride_width() * (next_input_width() - 1) + adjustment_width() + dilated_kernel_width() - padding_width();
378   }
379 
380   inline DeconvolutionOperatorTester& next_batch_size(size_t next_batch_size) {
381     assert(next_batch_size >= 1);
382     this->next_batch_size_ = next_batch_size;
383     return *this;
384   }
385 
386   inline size_t next_batch_size() const {
387     if (this->next_batch_size_ == 0) {
388       return batch_size();
389     } else {
390       return this->next_batch_size_;
391     }
392   }
393 
394   inline DeconvolutionOperatorTester& qmin(uint8_t qmin) {
395     this->qmin_ = qmin;
396     return *this;
397   }
398 
399   inline uint8_t qmin() const {
400     return this->qmin_;
401   }
402 
403   inline DeconvolutionOperatorTester& qmax(uint8_t qmax) {
404     this->qmax_ = qmax;
405     return *this;
406   }
407 
408   inline uint8_t qmax() const {
409     return this->qmax_;
410   }
411 
412   inline DeconvolutionOperatorTester& has_bias(bool has_bias) {
413     this->has_bias_ = has_bias;
414     return *this;
415   }
416 
417   inline bool has_bias() const {
418     return this->has_bias_;
419   }
420 
421   inline DeconvolutionOperatorTester& weights_type(WeightsType weights_type) {
422     this->weights_type_ = weights_type;
423     return *this;
424   }
425 
426   inline WeightsType weights_type() const {
427     return this->weights_type_;
428   }
429 
430   inline DeconvolutionOperatorTester& use_weights_cache(bool use_weights_cache) {
431     this->use_weights_cache_ = use_weights_cache;
432     return *this;
433   }
434 
435   inline bool use_weights_cache() const {
436     return this->use_weights_cache_;
437   }
438 
439   inline DeconvolutionOperatorTester& iterations(size_t iterations) {
440     this->iterations_ = iterations;
441     return *this;
442   }
443 
444   inline size_t iterations() const {
445     return this->iterations_;
446   }
447 
448   void TestQS8() const {
449     ASSERT_EQ(weights_type(), WeightsType::Default);
450 
451     std::random_device random_device;
452     auto rng = std::mt19937(random_device());
453     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
454     std::uniform_int_distribution<int32_t> i8dist(
455       std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
456     std::uniform_int_distribution<int32_t> w8dist(
457       -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());
458 
459     std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
460       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
461     std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
462     std::vector<int32_t> bias(groups() * group_output_channels());
463     std::vector<int8_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
464     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
465     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
466 
467     const int8_t input_zero_point = 1;
468 
469     for (size_t iteration = 0; iteration < iterations(); iteration++) {
470       std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
471       std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
472       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
473       std::fill(output.begin(), output.end(), INT8_C(0xA5));
474 
475       // Compute reference results, without renormalization.
476       if (has_bias()) {
477         for (size_t i = 0; i < batch_size(); i++) {
478           for (size_t oy = 0; oy < output_height(); oy++) {
479             for (size_t ox = 0; ox < output_width(); ox++) {
480               for (size_t g = 0; g < groups(); g++) {
481                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
482                   accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
483                     bias[g * group_output_channels() + oc];
484                 }
485               }
486             }
487           }
488         }
489       } else {
490         std::fill(accumulators.begin(), accumulators.end(), 0);
491       }
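      // Accumulate the transposed convolution: output pixel (oy, ox) receives a contribution from
      // input pixel (iy, ix) through kernel tap (ky, kx) only when oy + padding_top - ky * dilation_height
      // is a non-negative multiple of stride_height (and likewise along x); the divisibility and
      // bounds checks below enforce this.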
492       for (size_t i = 0; i < batch_size(); i++) {
493         for (size_t oy = 0; oy < output_height(); oy++) {
494           for (size_t ox = 0; ox < output_width(); ox++) {
495             for (size_t ky = 0; ky < kernel_height(); ky++) {
496               const size_t y = oy + padding_top() - ky * dilation_height();
497               const size_t iy = y / stride_height();
498               if (iy * stride_height() == y && iy < input_height()) {
499                 for (size_t kx = 0; kx < kernel_width(); kx++) {
500                   const size_t x = ox + padding_left() - kx * dilation_width();
501                   const size_t ix = x / stride_width();
502                   if (ix * stride_width() == x && ix < input_width()) {
503                     for (size_t g = 0; g < groups(); g++) {
504                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
505                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
506                           accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
507                             (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
508                             int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
509                         }
510                       }
511                     }
512                   }
513                 }
514               }
515             }
516           }
517         }
518       }
519 
520       // Compute renormalization parameters.
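      // The scale maps the full accumulator range onto the 256 representable levels, and the zero
      // point is chosen so that the accumulated range is approximately centered within int8.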
521       const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
522       const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
523 
524       const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
525       const int8_t output_zero_point = int8_t(std::max(std::min(
526         lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
527         long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
528 
529       // Renormalize reference results.
530       std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
531         [this, output_scale, output_zero_point](int32_t x) -> double {
532           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
533         });
534 
535       // Create, setup, run, and destroy Deconvolution operator.
536       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
537       xnn_operator_t deconvolution_op = nullptr;
538 
539       xnn_caches caches = {
540         .code_cache = NULL,
541         .weights_cache = NULL,
542       };
543       xnn_weights_cache weights_cache;
544       if (use_weights_cache()) {
545         xnn_init_weights_cache(&weights_cache);
546         caches.weights_cache = &weights_cache;
547       }
548 
549       ASSERT_EQ(
550           xnn_status_success,
551           xnn_create_deconvolution2d_nhwc_qs8(
552               padding_top(), padding_right(), padding_bottom(), padding_left(),
553               kernel_height(), kernel_width(), stride_height(), stride_width(),
554               dilation_height(), dilation_width(), groups(),
555               group_input_channels(), group_output_channels(),
556               input_pixel_stride(), output_pixel_stride(), input_zero_point,
557               1.0f /* input scale */, 1.0f /* kernel scale */, kernel.data(),
558               has_bias() ? bias.data() : nullptr, output_zero_point,
559               output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
560               /*flags=*/0, &caches, &deconvolution_op));
561 
562       if (use_weights_cache()) {
563         ASSERT_EQ(xnn_status_success,
564                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
565       }
566       // Smart pointer to automatically delete deconvolution_op.
567       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
568 
569       ASSERT_EQ(xnn_status_success,
570         xnn_setup_deconvolution2d_nhwc_qs8(
571           deconvolution_op,
572           batch_size(), input_height(), input_width(),
573           adjustment_height(), adjustment_width(),
574           input.data(), output.data(),
575           nullptr /* thread pool */));
576 
577       ASSERT_EQ(xnn_status_success,
578         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
579 
580       VerifyQS8(output, output_ref, output_zero_point);
581 
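      // When the weights cache is enabled, create a second, identically configured operator:
      // its packed weights should be found in the cache (a single hit) without growing the cache.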
582       if (use_weights_cache()) {
583         xnn_operator_t deconvolution_op2 = nullptr;
584         size_t old_weights_cache_size = weights_cache.cache.weights.size;
585 
586         ASSERT_EQ(
587             xnn_status_success,
588             xnn_create_deconvolution2d_nhwc_qs8(
589                 padding_top(), padding_right(), padding_bottom(), padding_left(),
590                 kernel_height(), kernel_width(), stride_height(), stride_width(),
591                 dilation_height(), dilation_width(), groups(),
592                 group_input_channels(), group_output_channels(),
593                 input_pixel_stride(), output_pixel_stride(), input_zero_point,
594                 1.0f /* input scale */, 1.0f /* kernel scale */, kernel.data(),
595                 has_bias() ? bias.data() : nullptr, output_zero_point,
596                 output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
597                 /*flags=*/0, &caches, &deconvolution_op2));
598 
599         // Smart pointer to automatically delete deconvolution_op2.
600         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op2, xnn_delete_operator);
601         std::vector<int8_t> output2(output.size(), INT8_C(0xA5));
602 
603         ASSERT_EQ(xnn_status_success,
604                   xnn_setup_deconvolution2d_nhwc_qs8(
605                       deconvolution_op2,
606                       batch_size(), input_height(), input_width(),
607                       adjustment_height(), adjustment_width(),
608                       input.data(), output2.data(),
609                       nullptr /* thread pool */));
610 
611         ASSERT_EQ(xnn_status_success,
612                   xnn_run_operator(deconvolution_op2, nullptr /* thread pool */));
613 
614         VerifyWeightsCache(&weights_cache, old_weights_cache_size);
615         VerifyQS8(output2, output_ref, output_zero_point);
616         xnn_release_weights_cache(&weights_cache);
617       }
618 
619     }
620   }
621 
622   void VerifyQS8(const std::vector<int8_t> &output,
623                  const std::vector<double> &output_ref,
624                  int8_t output_zero_point) const {
625     for (size_t i = 0; i < batch_size(); i++) {
626       for (size_t y = 0; y < output_height(); y++) {
627         for (size_t x = 0; x < output_width(); x++) {
628           for (size_t g = 0; g < groups(); g++) {
629             for (size_t c = 0; c < group_output_channels(); c++) {
630               ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
631                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
632               ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
633                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
634               ASSERT_NEAR(
635                   output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
636                   double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
637                   0.9)
638                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
639             }
640           }
641         }
642       }
643     }
644   }
645 
646   void VerifyWeightsCache(xnn_weights_cache* weights_cache, size_t old_size) const {
647     ASSERT_EQ(weights_cache->cache.hits, 1);
648     // Ensure that we did not write more weights to the cache because it was a cache hit.
649     ASSERT_EQ(old_size, weights_cache->cache.weights.size);
650   }
651 
652   void TestQU8() const {
653     ASSERT_EQ(weights_type(), WeightsType::Default);
654 
655     std::random_device random_device;
656     auto rng = std::mt19937(random_device());
657     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
658     std::uniform_int_distribution<int32_t> u8dist(
659       std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
660 
661     std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
662       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
663     std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
664     std::vector<int32_t> bias(groups() * group_output_channels());
665     std::vector<uint8_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
666     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
667     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
668 
669     const uint8_t input_zero_point = 127;
670     const uint8_t kernel_zero_point = 127;
671 
672     for (size_t iteration = 0; iteration < iterations(); iteration++) {
673       std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
674       std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); });
675       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
676       std::fill(output.begin(), output.end(), UINT8_C(0xA5));
677 
678       // Compute reference results, without renormalization.
679       if (has_bias()) {
680         for (size_t i = 0; i < batch_size(); i++) {
681           for (size_t oy = 0; oy < output_height(); oy++) {
682             for (size_t ox = 0; ox < output_width(); ox++) {
683               for (size_t g = 0; g < groups(); g++) {
684                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
685                   accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
686                     bias[g * group_output_channels() + oc];
687                 }
688               }
689             }
690           }
691         }
692       } else {
693         std::fill(accumulators.begin(), accumulators.end(), 0);
694       }
695       for (size_t i = 0; i < batch_size(); i++) {
696         for (size_t oy = 0; oy < output_height(); oy++) {
697           for (size_t ox = 0; ox < output_width(); ox++) {
698             for (size_t ky = 0; ky < kernel_height(); ky++) {
699               const size_t y = oy + padding_top() - ky * dilation_height();
700               const size_t iy = y / stride_height();
701               if (iy * stride_height() == y && iy < input_height()) {
702                 for (size_t kx = 0; kx < kernel_width(); kx++) {
703                   const size_t x = ox + padding_left() - kx * dilation_width();
704                   const size_t ix = x / stride_width();
705                   if (ix * stride_width() == x && ix < input_width()) {
706                     for (size_t g = 0; g < groups(); g++) {
707                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
708                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
709                           accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
710                             (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
711                             (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
712                         }
713                       }
714                     }
715                   }
716                 }
717               }
718             }
719           }
720         }
721       }
722 
723       // Compute renormalization parameters.
724       const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
725       const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
726 
727       const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
728       const uint8_t output_zero_point = uint8_t(std::max(std::min(
729         lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
730         long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
731 
732       // Renormalize reference results.
733       std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
734         [this, output_scale, output_zero_point](int32_t x) -> double {
735           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
736         });
737 
738       // Create, setup, run, and destroy Deconvolution operator.
739       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
740       xnn_operator_t deconvolution_op = nullptr;
741 
742       xnn_caches caches = {
743         .code_cache = NULL,
744         .weights_cache = NULL,
745       };
746       xnn_weights_cache weights_cache;
747       if (use_weights_cache()) {
748         xnn_init_weights_cache(&weights_cache);
749         caches.weights_cache = &weights_cache;
750       }
751 
752       ASSERT_EQ(
753           xnn_status_success,
754           xnn_create_deconvolution2d_nhwc_qu8(
755               padding_top(), padding_right(), padding_bottom(), padding_left(),
756               kernel_height(), kernel_width(), stride_height(), stride_width(),
757               dilation_height(), dilation_width(), groups(),
758               group_input_channels(), group_output_channels(),
759               input_pixel_stride(), output_pixel_stride(), input_zero_point,
760               1.0f /* input scale */, kernel_zero_point,
761               1.0f /* kernel scale */, kernel.data(),
762               has_bias() ? bias.data() : nullptr, output_zero_point,
763               output_scale, qmin(), qmax(),
764               /*flags=*/0, &caches, &deconvolution_op));
765 
766       if (use_weights_cache()) {
767         ASSERT_EQ(xnn_status_success,
768                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
769       }
770       // Smart pointer to automatically delete deconvolution_op.
771       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
772 
773       ASSERT_EQ(xnn_status_success,
774         xnn_setup_deconvolution2d_nhwc_qu8(
775           deconvolution_op,
776           batch_size(), input_height(), input_width(),
777           adjustment_height(), adjustment_width(),
778           input.data(), output.data(),
779           nullptr /* thread pool */));
780 
781       ASSERT_EQ(xnn_status_success,
782         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
783 
784       // Verify results.
785       VerifyQU8(output, output_ref, output_zero_point);
786 
787 
788       if (use_weights_cache()) {
789         xnn_operator_t deconvolution_op2 = nullptr;
790         size_t old_weights_cache_size = weights_cache.cache.weights.size;
791 
792         ASSERT_EQ(
793             xnn_status_success,
794             xnn_create_deconvolution2d_nhwc_qu8(
795                 padding_top(), padding_right(), padding_bottom(), padding_left(),
796                 kernel_height(), kernel_width(), stride_height(), stride_width(),
797                 dilation_height(), dilation_width(), groups(),
798                 group_input_channels(), group_output_channels(),
799                 input_pixel_stride(), output_pixel_stride(), input_zero_point,
800                 1.0f /* input scale */, kernel_zero_point,
801                 1.0f /* kernel scale */, kernel.data(),
802                 has_bias() ? bias.data() : nullptr, output_zero_point,
803                 output_scale, qmin(), qmax(),
804                 /*flags=*/0, &caches, &deconvolution_op2));
805 
806         // Smart pointer to automatically delete deconvolution_op2.
807         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op2, xnn_delete_operator);
808 
809         ASSERT_EQ(xnn_status_success,
810                   xnn_setup_deconvolution2d_nhwc_qu8(
811                       deconvolution_op2,
812                       batch_size(), input_height(), input_width(),
813                       adjustment_height(), adjustment_width(),
814                       input.data(), output.data(),
815                       nullptr /* thread pool */));
816 
817         ASSERT_EQ(xnn_status_success,
818                   xnn_run_operator(deconvolution_op2, nullptr /* thread pool */));
819 
820         VerifyWeightsCache(&weights_cache, old_weights_cache_size);
821         VerifyQU8(output, output_ref, output_zero_point);
822         xnn_release_weights_cache(&weights_cache);
823       }
824     }
825   }
826 
827   void VerifyQU8(const std::vector<uint8_t> &output,
828                  const std::vector<double> &output_ref,
829                  uint8_t output_zero_point) const {
830     for (size_t i = 0; i < batch_size(); i++) {
831       for (size_t y = 0; y < output_height(); y++) {
832         for (size_t x = 0; x < output_width(); x++) {
833           for (size_t g = 0; g < groups(); g++) {
834             for (size_t c = 0; c < group_output_channels(); c++) {
835               ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
836                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
837               ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
838                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
839               ASSERT_NEAR(
840                   output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
841                   double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
842                   0.9)
843                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
844             }
845           }
846         }
847       }
848     }
849   }
850 
851   void TestF16() const {
852     switch (weights_type()) {
853       case WeightsType::Default:
854         break;
855       case WeightsType::FP32:
856         break;
857       default:
858         GTEST_FAIL() << "unexpected weights type";
859     }
860 
861     std::random_device random_device;
862     auto rng = std::mt19937(random_device());
863     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
864 
865     std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
866       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
867     std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
868     std::vector<float> kernel_as_float(kernel.size());
869     std::vector<uint16_t> bias(groups() * group_output_channels());
870     std::vector<float> bias_as_float(bias.size());
871     std::vector<uint16_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
872     std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
873 
874     for (size_t iteration = 0; iteration < iterations(); iteration++) {
875       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
876       std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
877       std::transform(kernel.cbegin(), kernel.cend(), kernel_as_float.begin(), fp16_ieee_to_fp32_value);
878       std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
879       std::transform(bias.cbegin(), bias.cend(), bias_as_float.begin(), fp16_ieee_to_fp32_value);
880       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
881 
882       // Compute reference results, without clamping.
883       if (has_bias()) {
884         for (size_t i = 0; i < batch_size(); i++) {
885           for (size_t oy = 0; oy < output_height(); oy++) {
886             for (size_t ox = 0; ox < output_width(); ox++) {
887               for (size_t g = 0; g < groups(); g++) {
888                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
889                   output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
890                     bias_as_float[g * group_output_channels() + oc];
891                 }
892               }
893             }
894           }
895         }
896       } else {
897         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
898       }
899       for (size_t i = 0; i < batch_size(); i++) {
900         for (size_t oy = 0; oy < output_height(); oy++) {
901           for (size_t ox = 0; ox < output_width(); ox++) {
902             for (size_t ky = 0; ky < kernel_height(); ky++) {
903               const size_t y = oy + padding_top() - ky * dilation_height();
904               const size_t iy = y / stride_height();
905               if (iy * stride_height() == y && iy < input_height()) {
906                 for (size_t kx = 0; kx < kernel_width(); kx++) {
907                   const size_t x = ox + padding_left() - kx * dilation_width();
908                   const size_t ix = x / stride_width();
909                   if (ix * stride_width() == x && ix < input_width()) {
910                     for (size_t g = 0; g < groups(); g++) {
911                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
912                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
913                           output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
914                             fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) *
915                             kernel_as_float[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
916                         }
917                       }
918                     }
919                   }
920                 }
921               }
922             }
923           }
924         }
925       }
926 
927       // Compute clamping parameters.
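      // qmin()/qmax() (0..255) select a fraction of the accumulated range as the clamp bounds; the
      // bounds are rounded to representable fp16 values and widened to +/-infinity when they would
      // not actually clip anything.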
928       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
929       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
930       const float accumulated_range = accumulated_max - accumulated_min;
931       float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
932       float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
933       output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
934       output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
935       if (accumulated_range == 0.0f) {
936         output_min = -std::numeric_limits<float>::infinity();
937         output_max = +std::numeric_limits<float>::infinity();
938       }
939       if (qmin() == std::numeric_limits<uint8_t>::min()) {
940         output_min = -std::numeric_limits<float>::infinity();
941       }
942       if (qmax() == std::numeric_limits<uint8_t>::max()) {
943         output_max = +std::numeric_limits<float>::infinity();
944       }
945 
946       // Clamp reference results.
947       for (float& value : output_ref) {
948         value = std::max(std::min(value, output_max), output_min);
949       }
950 
951       // Create, setup, run, and destroy Deconvolution operator.
952       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
953       xnn_operator_t deconvolution_op = nullptr;
954 
955       xnn_caches caches = {
956         .code_cache = NULL,
957         .weights_cache = NULL,
958       };
959       xnn_weights_cache weights_cache;
960       if (use_weights_cache()) {
961         xnn_init_weights_cache(&weights_cache);
962         caches.weights_cache = &weights_cache;
963       }
964 
965       const void* kernel_data = kernel.data();
966       const void* bias_data = bias.data();
967       if (weights_type() == WeightsType::FP32) {
968         kernel_data = kernel_as_float.data();
969         bias_data = bias_as_float.data();
970       }
971       uint32_t flags = 0;
972       if (weights_type() == WeightsType::FP32) {
973         flags |= XNN_FLAG_FP32_STATIC_WEIGHTS;
974       }
975       const xnn_status status = xnn_create_deconvolution2d_nhwc_f16(
976         padding_top(), padding_right(), padding_bottom(), padding_left(),
977         kernel_height(), kernel_width(), stride_height(), stride_width(),
978         dilation_height(), dilation_width(), groups(),
979         group_input_channels(), group_output_channels(),
980         input_pixel_stride(), output_pixel_stride(),
981         kernel_data, has_bias() ? bias_data : nullptr,
982         output_min, output_max,
983         flags, &caches, &deconvolution_op);
984       if (status == xnn_status_unsupported_hardware) {
985         GTEST_SKIP();
986       }
987       ASSERT_EQ(xnn_status_success, status);
988       ASSERT_NE(nullptr, deconvolution_op);
989       if (use_weights_cache()) {
990         ASSERT_EQ(xnn_status_success,
991                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
992       }
993 
994       // Smart pointer to automatically delete deconvolution_op.
995       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
996 
997       ASSERT_EQ(xnn_status_success,
998         xnn_setup_deconvolution2d_nhwc_f16(
999           deconvolution_op,
1000           batch_size(), input_height(), input_width(),
1001           adjustment_height(), adjustment_width(),
1002           input.data(), output.data(),
1003           nullptr /* thread pool */));
1004 
1005       ASSERT_EQ(xnn_status_success,
1006         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1007 
1008       VerifyF16(output, output_ref, output_max, output_min);
1009 
1010       if (use_weights_cache()) {
1011         xnn_operator_t deconvolution_op2 = nullptr;
1012         size_t old_weights_cache_size = weights_cache.cache.weights.size;
1013 
1014         ASSERT_EQ(xnn_status_success,
1015                   xnn_create_deconvolution2d_nhwc_f16(
1016                       padding_top(), padding_right(), padding_bottom(), padding_left(),
1017                       kernel_height(), kernel_width(), stride_height(), stride_width(),
1018                       dilation_height(), dilation_width(), groups(),
1019                       group_input_channels(), group_output_channels(),
1020                       input_pixel_stride(), output_pixel_stride(),
1021                       kernel_data, has_bias() ? bias_data : nullptr,
1022                       output_min, output_max,
1023                       flags, &caches, &deconvolution_op2));
1024         ASSERT_NE(nullptr, deconvolution_op2);
1025 
1026         // Smart pointer to automatically delete deconvolution_op2.
1027         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op2, xnn_delete_operator);
1028         std::vector<uint16_t> output2(output.size(), UINT16_C(0x7E00) /* NaN */);
1029 
1030         ASSERT_EQ(xnn_status_success,
1031                   xnn_setup_deconvolution2d_nhwc_f16(
1032                       deconvolution_op2,
1033                       batch_size(), input_height(), input_width(),
1034                       adjustment_height(), adjustment_width(),
1035                       input.data(), output2.data(),
1036                       nullptr /* thread pool */));
1037 
1038         ASSERT_EQ(xnn_status_success,
1039                   xnn_run_operator(deconvolution_op2, nullptr /* thread pool */));
1040 
1041         VerifyWeightsCache(&weights_cache, old_weights_cache_size);
1042         VerifyF16(output2, output_ref, output_max, output_min);
1043         xnn_release_weights_cache(&weights_cache);
1044       }
1045     }
1046   }
1047 
1048   void VerifyF16(const std::vector<uint16_t> &output,
1049                  const std::vector<float> &output_ref,
1050                  float output_max,
1051                  float output_min) const {
1052     for (size_t i = 0; i < batch_size(); i++) {
1053       for (size_t y = 0; y < output_height(); y++) {
1054         for (size_t x = 0; x < output_width(); x++) {
1055           for (size_t g = 0; g < groups(); g++) {
1056             for (size_t c = 0; c < group_output_channels(); c++) {
1057               ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_min)
1058                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1059               ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_max)
1060                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1061               ASSERT_NEAR(
1062                   fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]),
1063                   output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1064                   1.0e-2f * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
1065                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1066             }
1067           }
1068         }
1069       }
1070     }
1071   }
1072 
1073   void TestF32() const {
1074     ASSERT_EQ(weights_type(), WeightsType::Default);
1075 
1076     std::random_device random_device;
1077     auto rng = std::mt19937(random_device());
1078     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
1079 
1080     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
1081       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
1082     std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1083     std::vector<float> bias(groups() * group_output_channels());
1084     std::vector<float> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
1085     std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1086 
1087     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1088       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
1089       std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
1090       std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
1091       std::fill(output.begin(), output.end(), nanf(""));
1092 
1093       // Compute reference results, without clamping.
1094       if (has_bias()) {
1095         for (size_t i = 0; i < batch_size(); i++) {
1096           for (size_t oy = 0; oy < output_height(); oy++) {
1097             for (size_t ox = 0; ox < output_width(); ox++) {
1098               for (size_t g = 0; g < groups(); g++) {
1099                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1100                   output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1101                     bias[g * group_output_channels() + oc];
1102                 }
1103               }
1104             }
1105           }
1106         }
1107       } else {
1108         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1109       }
1110       for (size_t i = 0; i < batch_size(); i++) {
1111         for (size_t oy = 0; oy < output_height(); oy++) {
1112           for (size_t ox = 0; ox < output_width(); ox++) {
1113             for (size_t ky = 0; ky < kernel_height(); ky++) {
1114               const size_t y = oy + padding_top() - ky * dilation_height();
1115               const size_t iy = y / stride_height();
1116               if (iy * stride_height() == y && iy < input_height()) {
1117                 for (size_t kx = 0; kx < kernel_width(); kx++) {
1118                   const size_t x = ox + padding_left() - kx * dilation_width();
1119                   const size_t ix = x / stride_width();
1120                   if (ix * stride_width() == x && ix < input_width()) {
1121                     for (size_t g = 0; g < groups(); g++) {
1122                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
1123                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
1124                           output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1125                             input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
1126                             kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1127                         }
1128                       }
1129                     }
1130                   }
1131                 }
1132               }
1133             }
1134           }
1135         }
1136       }
1137 
1138       // Compute clamping parameters.
1139       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1140       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1141 
1142       const float output_min = qmin() == 0 ? -std::numeric_limits<float>::infinity() :
1143         accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
1144       const float output_max = qmax() == 255 ? std::numeric_limits<float>::infinity() :
1145         accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
1146 
1147       // Clamp reference results.
1148       for (float& value : output_ref) {
1149         value = std::max(std::min(value, output_max), output_min);
1150       }
1151 
1152       // Create, setup, run, and destroy Deconvolution operator.
1153       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1154       xnn_operator_t deconvolution_op = nullptr;
1155 
1156       xnn_caches caches = {
1157         .code_cache = NULL,
1158         .weights_cache = NULL,
1159       };
1160       xnn_weights_cache weights_cache;
1161       if (use_weights_cache()) {
1162         xnn_init_weights_cache(&weights_cache);
1163         caches.weights_cache = &weights_cache;
1164       }
1165 
1166       ASSERT_EQ(
1167           xnn_status_success,
1168           xnn_create_deconvolution2d_nhwc_f32(
1169               padding_top(), padding_right(), padding_bottom(), padding_left(),
1170               kernel_height(), kernel_width(), stride_height(), stride_width(),
1171               dilation_height(), dilation_width(), groups(),
1172               group_input_channels(), group_output_channels(),
1173               input_pixel_stride(), output_pixel_stride(), kernel.data(),
1174               has_bias() ? bias.data() : nullptr, output_min, output_max,
1175               /*flags=*/0, &caches, &deconvolution_op));
1176       if (use_weights_cache()) {
1177         ASSERT_EQ(xnn_status_success,
1178                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
1179       }
1180 
1181       // Smart pointer to automatically delete deconvolution_op.
1182       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
1183 
1184       ASSERT_EQ(xnn_status_success,
1185         xnn_setup_deconvolution2d_nhwc_f32(
1186           deconvolution_op,
1187           batch_size(), input_height(), input_width(),
1188           adjustment_height(), adjustment_width(),
1189           input.data(), output.data(),
1190           nullptr /* thread pool */));
1191 
1192       ASSERT_EQ(xnn_status_success,
1193         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1194 
1195       VerifyF32(output, output_ref, output_max, output_min);
1196 
1197       if (use_weights_cache()) {
1198         xnn_operator_t deconvolution_op2 = nullptr;
1199         size_t old_weights_cache_size = weights_cache.cache.weights.size;
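        // A second, identical operator is created below; since its packed weights should already
        // be in the cache, the cache size recorded here is later passed to VerifyWeightsCache(),
        // presumably to confirm that no new weights were appended.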
1200 
1201         ASSERT_EQ(
1202             xnn_status_success,
1203             xnn_create_deconvolution2d_nhwc_f32(
1204                 padding_top(), padding_right(), padding_bottom(), padding_left(),
1205                 kernel_height(), kernel_width(), stride_height(), stride_width(),
1206                 dilation_height(), dilation_width(), groups(),
1207                 group_input_channels(), group_output_channels(),
1208                 input_pixel_stride(), output_pixel_stride(), kernel.data(),
1209                 has_bias() ? bias.data() : nullptr, output_min, output_max,
1210                 /*flags=*/0, &caches, &deconvolution_op2));
1211 
1212         // Smart pointer to automatically delete deconvolution_op2.
1213         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op2(deconvolution_op2, xnn_delete_operator);

1214         std::vector<float> output2(output.size(), nanf(""));
1215 
1216         ASSERT_EQ(xnn_status_success,
1217                   xnn_setup_deconvolution2d_nhwc_f32(
1218                       deconvolution_op2,
1219                       batch_size(), input_height(), input_width(),
1220                       adjustment_height(), adjustment_width(),
1221                       input.data(), output2.data(),
1222                       nullptr /* thread pool */));
1223 
1224         ASSERT_EQ(xnn_status_success,
1225                   xnn_run_operator(deconvolution_op2, nullptr /* thread pool */));
1226 
1227         VerifyWeightsCache(&weights_cache, old_weights_cache_size);
1228         VerifyF32(output2, output_ref, output_max, output_min);
1229         xnn_release_weights_cache(&weights_cache);
1230       }
1231     }
1232   }
1233 
1234   // A variation of TestF32 that stresses the weights cache. All the operator creation needs to happen before
1235   // finalization and setup.
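  // A hypothetical usage sketch of this entry point (the TEST name and the exact setter chain
  // below are illustrative only; actual tests configure the tester with whichever builder
  // methods they need):
  //
  //   TEST(DECONVOLUTION_NHWC_F32, weights_cache_stress) {  // hypothetical test name
  //     DeconvolutionOperatorTester()
  //       .padding(1)
  //       .adjustment_height(0)
  //       .adjustment_width(0)
  //       .StressWeightsCacheTestF32();
  //   }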
1236   void StressWeightsCacheTestF32() const {
1237     ASSERT_EQ(weights_type(), WeightsType::Default);
1238 
1239     std::random_device random_device;
1240     auto rng = std::mt19937(random_device());
1241     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
1242 
1243     xnn_caches caches = {
1244       .code_cache = NULL,
1245       .weights_cache = NULL,
1246     };
1247     xnn_weights_cache weights_cache;
1248     xnn_init_weights_cache(&weights_cache);
1249     caches.weights_cache = &weights_cache;
1250     void* old_weights_cache_start = weights_cache.cache.weights.start;
1251     size_t old_weights_cache_size = weights_cache.cache.weights.size;
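    // The initial buffer location and size are recorded so that, after many operators with
    // distinct random weights have been created, the test can assert that the cache grew and was
    // reallocated (moved) rather than reused.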
1252 
1253     std::vector<xnn_operator_t> operators;
1254     operators.reserve(iterations());
1255     std::vector<std::vector<float>> inputs;
1256     inputs.reserve(iterations());
1257     std::vector<std::vector<float>> outputs;
1258     outputs.reserve(iterations());
1259     std::vector<std::vector<float>> output_refs;
1260     output_refs.reserve(iterations());
1261     std::vector<float> output_mins;
1262     output_mins.reserve(iterations());
1263     std::vector<float> output_maxs;
1264     output_maxs.reserve(iterations());
1265 
1266     for (size_t iteration = 0; iteration < iterations(); iteration++) {
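      // The input buffer is over-allocated by XNN_EXTRA_BYTES, which is assumed to give
      // vectorized micro-kernels room to read slightly past the last element without faulting.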
1267       std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
1268                                (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
1269       std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1270       std::vector<float> bias(groups() * group_output_channels());
1271       std::vector<float> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
1272       std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1273 
1274       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
1275       std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
1276       std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
1277       std::fill(output.begin(), output.end(), nanf(""));
1278 
1279       // Compute reference results, without clamping.
1280       if (has_bias()) {
1281         for (size_t i = 0; i < batch_size(); i++) {
1282           for (size_t oy = 0; oy < output_height(); oy++) {
1283             for (size_t ox = 0; ox < output_width(); ox++) {
1284               for (size_t g = 0; g < groups(); g++) {
1285                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1286                   output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1287                     bias[g * group_output_channels() + oc];
1288                 }
1289               }
1290             }
1291           }
1292         }
1293       } else {
1294         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1295       }
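      // Deconvolution (transposed convolution) reference computation: for every output pixel and
      // kernel tap, the contributing input coordinate is recovered as
      // (oy + padding_top - ky * dilation) / stride (and likewise for x); a tap contributes only
      // when that division is exact and the coordinate lies inside the input. Negative
      // intermediate values wrap around in size_t and are rejected by the bounds check.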
1296       for (size_t i = 0; i < batch_size(); i++) {
1297         for (size_t oy = 0; oy < output_height(); oy++) {
1298           for (size_t ox = 0; ox < output_width(); ox++) {
1299             for (size_t ky = 0; ky < kernel_height(); ky++) {
1300               const size_t y = oy + padding_top() - ky * dilation_height();
1301               const size_t iy = y / stride_height();
1302               if (iy * stride_height() == y && iy < input_height()) {
1303                 for (size_t kx = 0; kx < kernel_width(); kx++) {
1304                   const size_t x = ox + padding_left() - kx * dilation_width();
1305                   const size_t ix = x / stride_width();
1306                   if (ix * stride_width() == x && ix < input_width()) {
1307                     for (size_t g = 0; g < groups(); g++) {
1308                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
1309                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
1310                           output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1311                             input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
1312                             kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1313                         }
1314                       }
1315                     }
1316                   }
1317                 }
1318               }
1319             }
1320           }
1321         }
1322       }
1323 
1324       // Compute clamping parameters.
1325       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1326       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1327 
1328       const float output_min = qmin() == 0 ? -std::numeric_limits<float>::infinity() :
1329         accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
1330       const float output_max = qmax() == 255 ? std::numeric_limits<float>::infinity() :
1331         accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
1332       output_mins.push_back(output_min);
1333       output_maxs.push_back(output_max);
1334 
1335       // Clamp reference results.
1336       for (float& value : output_ref) {
1337         value = std::max(std::min(value, output_max), output_min);
1338       }
1339 
1340       // Create, setup, run, and destroy Deconvolution operator.
1341       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1342       xnn_operator_t deconvolution_op = nullptr;
1343 
1344       ASSERT_EQ(
1345           xnn_status_success,
1346           xnn_create_deconvolution2d_nhwc_f32(
1347               padding_top(), padding_right(), padding_bottom(), padding_left(),
1348               kernel_height(), kernel_width(), stride_height(), stride_width(),
1349               dilation_height(), dilation_width(), groups(),
1350               group_input_channels(), group_output_channels(),
1351               input_pixel_stride(), output_pixel_stride(), kernel.data(),
1352               has_bias() ? bias.data() : nullptr, output_min, output_max,
1353               /*flags=*/0, &caches, &deconvolution_op));
1354 
1355       operators.push_back(deconvolution_op);
1356       inputs.push_back(std::move(input));
1357       outputs.push_back(std::move(output));
1358       output_refs.push_back(std::move(output_ref));
1359     }
1360 
1361     ASSERT_EQ(xnn_status_success,
1362               xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
1363 
1364     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1365       xnn_operator_t deconvolution_op = operators[iteration];
1366 
1367       ASSERT_EQ(xnn_status_success,
1368         xnn_setup_deconvolution2d_nhwc_f32(
1369           deconvolution_op,
1370           batch_size(), input_height(), input_width(),
1371           adjustment_height(), adjustment_width(),
1372           inputs[iteration].data(), outputs[iteration].data(),
1373           nullptr /* thread pool */));
1374 
1375       ASSERT_EQ(xnn_status_success,
1376         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1377 
1378       VerifyF32(outputs[iteration],
1379                 output_refs[iteration],
1380                 output_maxs[iteration],
1381                 output_mins[iteration]);
1382       xnn_delete_operator(deconvolution_op);
1383     }
1384 
1385     // Check that the weights cache grew and moved. If these assertions fail, the
1386     // number of test iterations might have to be increased.
1387     ASSERT_NE(old_weights_cache_start, weights_cache.cache.weights.start);
1388     ASSERT_LT(old_weights_cache_size, weights_cache.cache.weights.size);
1389     // Since the weights are randomized, it is very unlikely to have any hits.
1390     ASSERT_EQ(iterations(), weights_cache.cache.misses);
1391     ASSERT_EQ(0, weights_cache.cache.hits);
1392     ASSERT_EQ(iterations(), weights_cache.cache.num_entries);
1393     xnn_release_weights_cache(&weights_cache);
1394   }
1395 
1396   void VerifyF32(const std::vector<float> &output,
1397                  const std::vector<float> &output_ref,
1398                  float output_max,
1399                  float output_min) const {
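    // Every output element must respect the clamping bounds and match the reference value within
    // a relative tolerance of 1.0e-4.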
1400     for (size_t i = 0; i < batch_size(); i++) {
1401       for (size_t y = 0; y < output_height(); y++) {
1402         for (size_t x = 0; x < output_width(); x++) {
1403           for (size_t g = 0; g < groups(); g++) {
1404             for (size_t c = 0; c < group_output_channels(); c++) {
1405               ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
1406                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1407               ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
1408                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1409               ASSERT_NEAR(
1410                   output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1411                   output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
1412                   1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
1413                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1414             }
1415           }
1416         }
1417       }
1418     }
1419   }
1420 
1421   void TestSetupQS8() const {
1422     ASSERT_EQ(weights_type(), WeightsType::Default);
1423 
1424     std::random_device random_device;
1425     auto rng = std::mt19937(random_device());
1426     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
1427     std::uniform_int_distribution<int32_t> i8dist(
1428       std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
1429     std::uniform_int_distribution<int32_t> w8dist(
1430       -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());
1431 
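    // Buffers are sized for the larger of the two shapes exercised by this test (the initial
    // batch/input dimensions and the "next_*" dimensions), since the same operator and buffers
    // are reused when the operator is set up a second time.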
1432     std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max(
1433       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels(),
1434       (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
1435     std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1436     std::vector<int32_t> bias(groups() * group_output_channels());
1437     std::vector<int8_t> output(std::max(
1438       (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels(),
1439       (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
1440     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1441     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1442     std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1443     std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1444 
1445     const int8_t input_zero_point = 127;
1446 
1447     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1448       std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
1449       std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
1450       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
1451       std::fill(output.begin(), output.end(), INT8_C(0xA5));
1452 
1453       // Compute reference results, without renormalization.
1454       if (has_bias()) {
1455         for (size_t i = 0; i < batch_size(); i++) {
1456           for (size_t oy = 0; oy < output_height(); oy++) {
1457             for (size_t ox = 0; ox < output_width(); ox++) {
1458               for (size_t g = 0; g < groups(); g++) {
1459                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1460                   accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1461                     bias[g * group_output_channels() + oc];
1462                 }
1463               }
1464             }
1465           }
1466         }
1467       } else {
1468         std::fill(accumulators.begin(), accumulators.end(), 0);
1469       }
1470       for (size_t i = 0; i < batch_size(); i++) {
1471         for (size_t oy = 0; oy < output_height(); oy++) {
1472           for (size_t ox = 0; ox < output_width(); ox++) {
1473             for (size_t ky = 0; ky < kernel_height(); ky++) {
1474               const size_t y = oy + padding_top() - ky * dilation_height();
1475               const size_t iy = y / stride_height();
1476               if (iy * stride_height() == y && iy < input_height()) {
1477                 for (size_t kx = 0; kx < kernel_width(); kx++) {
1478                   const size_t x = ox + padding_left() - kx * dilation_width();
1479                   const size_t ix = x / stride_width();
1480                   if (ix * stride_width() == x && ix < input_width()) {
1481                     for (size_t g = 0; g < groups(); g++) {
1482                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
1483                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
1484                           accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1485                             (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1486                             int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1487                         }
1488                       }
1489                     }
1490                   }
1491                 }
1492               }
1493             }
1494           }
1495         }
1496       }
1497 
1498       // Compute renormalization parameters.
1499       const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1500       const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1501 
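      // The output scale maps the full accumulator range onto roughly 255 quantization steps, and
      // the zero point is chosen so that the midpoint of that range lands near the middle of the
      // signed int8 output domain (hence the -0.5 offset), clamped to [-128, 127].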
1502       const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1503       const int8_t output_zero_point = int8_t(std::max(std::min(
1504         lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1505         long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
1506 
1507       // Renormalize reference results.
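      // qmin()/qmax() are stored on the tester's unsigned [0, 255] scale, so they are shifted by
      // -0x80 here to express the saturation bounds in the signed int8 domain.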
1508       std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1509         [this, output_scale, output_zero_point](int32_t x) -> double {
1510           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
1511         });
1512 
1513       // Create, setup, and run Deconvolution operator once.
1514       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1515       xnn_operator_t deconvolution_op = nullptr;
1516 
1517       ASSERT_EQ(xnn_status_success,
1518         xnn_create_deconvolution2d_nhwc_qs8(
1519           padding_top(), padding_right(), padding_bottom(), padding_left(),
1520           kernel_height(), kernel_width(),
1521           stride_height(), stride_width(),
1522           dilation_height(), dilation_width(),
1523           groups(), group_input_channels(), group_output_channels(),
1524           input_pixel_stride(), output_pixel_stride(),
1525           input_zero_point, 1.0f /* input scale */,
1526           1.0f /* kernel scale */,
1527           kernel.data(), has_bias() ? bias.data() : nullptr,
1528           output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
1529           0, NULL, &deconvolution_op));
1530 
1531       // Smart pointer to automatically delete deconvolution_op.
1532       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
1533 
1534       ASSERT_EQ(xnn_status_success,
1535         xnn_setup_deconvolution2d_nhwc_qs8(
1536           deconvolution_op,
1537           batch_size(), input_height(), input_width(),
1538           adjustment_height(), adjustment_width(),
1539           input.data(), output.data(),
1540           nullptr /* thread pool */));
1541 
1542       ASSERT_EQ(xnn_status_success,
1543         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1544 
1545       // Verify results of the first run.
1546       for (size_t i = 0; i < batch_size(); i++) {
1547         for (size_t y = 0; y < output_height(); y++) {
1548           for (size_t x = 0; x < output_width(); x++) {
1549             for (size_t g = 0; g < groups(); g++) {
1550               for (size_t c = 0; c < group_output_channels(); c++) {
1551                 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1552                      << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1553                 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1554                      << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1555                 ASSERT_NEAR(
1556                     output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1557                     double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1558                     0.9)
1559                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1560               }
1561             }
1562           }
1563         }
1564       }
1565 
1566       // Re-generate data for the second run.
1567       std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
1568       std::fill(output.begin(), output.end(), INT8_C(0xA5));
1569 
1570       // Compute reference results for the second run, including renormalization.
1571       if (has_bias()) {
1572         for (size_t i = 0; i < next_batch_size(); i++) {
1573           for (size_t oy = 0; oy < next_output_height(); oy++) {
1574             for (size_t ox = 0; ox < next_output_width(); ox++) {
1575               for (size_t g = 0; g < groups(); g++) {
1576                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1577                   next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1578                     bias[g * group_output_channels() + oc];
1579                 }
1580               }
1581             }
1582           }
1583         }
1584       } else {
1585         std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
1586       }
1587       for (size_t i = 0; i < next_batch_size(); i++) {
1588         for (size_t oy = 0; oy < next_output_height(); oy++) {
1589           for (size_t ox = 0; ox < next_output_width(); ox++) {
1590             for (size_t ky = 0; ky < kernel_height(); ky++) {
1591               const size_t y = oy + padding_top() - ky * dilation_height();
1592               const size_t iy = y / stride_height();
1593               if (iy * stride_height() == y && iy < next_input_height()) {
1594                 for (size_t kx = 0; kx < kernel_width(); kx++) {
1595                   const size_t x = ox + padding_left() - kx * dilation_width();
1596                   const size_t ix = x / stride_width();
1597                   if (ix * stride_width() == x && ix < next_input_width()) {
1598                     for (size_t g = 0; g < groups(); g++) {
1599                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
1600                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
1601                           next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1602                             (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1603                             int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1604                         }
1605                       }
1606                     }
1607                   }
1608                 }
1609               }
1610             }
1611           }
1612         }
1613       }
1614       std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
1615         [this, output_scale, output_zero_point](int32_t x) -> double {
1616           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
1617         });
1618 
1619       // Setup and run Deconvolution operator the second time, and destroy the operator.
1620       ASSERT_EQ(xnn_status_success,
1621         xnn_setup_deconvolution2d_nhwc_qs8(
1622           deconvolution_op,
1623           next_batch_size(), next_input_height(), next_input_width(),
1624           adjustment_height(), adjustment_width(),
1625           input.data(), output.data(),
1626           nullptr /* thread pool */));
1627 
1628       ASSERT_EQ(xnn_status_success,
1629         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1630 
1631       // Verify results of the second run.
1632       for (size_t i = 0; i < next_batch_size(); i++) {
1633         for (size_t y = 0; y < next_output_height(); y++) {
1634           for (size_t x = 0; x < next_output_width(); x++) {
1635             for (size_t g = 0; g < groups(); g++) {
1636               for (size_t c = 0; c < group_output_channels(); c++) {
1637                 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1638                      << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1639                 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1640                      << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1641                 ASSERT_NEAR(
1642                     next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
1643                     double(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1644                     0.9)
1645                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1646               }
1647             }
1648           }
1649         }
1650       }
1651     }
1652   }
1653 
1654   void TestSetupQU8() const {
1655     ASSERT_EQ(weights_type(), WeightsType::Default);
1656 
1657     std::random_device random_device;
1658     auto rng = std::mt19937(random_device());
1659     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
1660     std::uniform_int_distribution<int32_t> u8dist(
1661       std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
1662 
1663     std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max(
1664       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels(),
1665       (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
1666     std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1667     std::vector<int32_t> bias(groups() * group_output_channels());
1668     std::vector<uint8_t> output(std::max(
1669       (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels(),
1670       (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
1671     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1672     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1673     std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1674     std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1675 
1676     const uint8_t input_zero_point = 127;
1677     const uint8_t kernel_zero_point = 127;
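    // Unlike the QS8 path, QU8 quantization is asymmetric for the kernel as well:
    // kernel_zero_point is subtracted in the reference computation below and passed to the
    // operator at creation time.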
1678 
1679     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1680       std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
1681       std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); });
1682       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
1683       std::fill(output.begin(), output.end(), UINT8_C(0xA5));
1684 
1685       // Compute reference results, without renormalization.
1686       if (has_bias()) {
1687         for (size_t i = 0; i < batch_size(); i++) {
1688           for (size_t oy = 0; oy < output_height(); oy++) {
1689             for (size_t ox = 0; ox < output_width(); ox++) {
1690               for (size_t g = 0; g < groups(); g++) {
1691                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1692                   accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1693                     bias[g * group_output_channels() + oc];
1694                 }
1695               }
1696             }
1697           }
1698         }
1699       } else {
1700         std::fill(accumulators.begin(), accumulators.end(), 0);
1701       }
1702       for (size_t i = 0; i < batch_size(); i++) {
1703         for (size_t oy = 0; oy < output_height(); oy++) {
1704           for (size_t ox = 0; ox < output_width(); ox++) {
1705             for (size_t ky = 0; ky < kernel_height(); ky++) {
1706               const size_t y = oy + padding_top() - ky * dilation_height();
1707               const size_t iy = y / stride_height();
1708               if (iy * stride_height() == y && iy < input_height()) {
1709                 for (size_t kx = 0; kx < kernel_width(); kx++) {
1710                   const size_t x = ox + padding_left() - kx * dilation_width();
1711                   const size_t ix = x / stride_width();
1712                   if (ix * stride_width() == x && ix < input_width()) {
1713                     for (size_t g = 0; g < groups(); g++) {
1714                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
1715                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
1716                           accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1717                             (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1718                             (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
1719                         }
1720                       }
1721                     }
1722                   }
1723                 }
1724               }
1725             }
1726           }
1727         }
1728       }
1729 
1730       // Compute renormalization parameters.
1731       const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1732       const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1733 
1734       const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1735       const uint8_t output_zero_point = uint8_t(std::max(std::min(
1736         lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1737         long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
1738 
1739       // Renormalize reference results.
1740       std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1741         [this, output_scale, output_zero_point](int32_t x) -> double {
1742           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
1743         });
1744 
1745       // Create, setup, and run Deconvolution operator once.
1746       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1747       xnn_operator_t deconvolution_op = nullptr;
1748 
1749       ASSERT_EQ(xnn_status_success,
1750         xnn_create_deconvolution2d_nhwc_qu8(
1751           padding_top(), padding_right(), padding_bottom(), padding_left(),
1752           kernel_height(), kernel_width(),
1753           stride_height(), stride_width(),
1754           dilation_height(), dilation_width(),
1755           groups(), group_input_channels(), group_output_channels(),
1756           input_pixel_stride(), output_pixel_stride(),
1757           input_zero_point, 1.0f /* input scale */,
1758           kernel_zero_point, 1.0f /* kernel scale */,
1759           kernel.data(), has_bias() ? bias.data() : nullptr,
1760           output_zero_point, output_scale, qmin(), qmax(),
1761           0, NULL, &deconvolution_op));
1762 
1763       // Smart pointer to automatically delete deconvolution_op.
1764       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
1765 
1766       ASSERT_EQ(xnn_status_success,
1767         xnn_setup_deconvolution2d_nhwc_qu8(
1768           deconvolution_op,
1769           batch_size(), input_height(), input_width(),
1770           adjustment_height(), adjustment_width(),
1771           input.data(), output.data(),
1772           nullptr /* thread pool */));
1773 
1774       ASSERT_EQ(xnn_status_success,
1775         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1776 
1777       // Verify results of the first run.
1778       for (size_t i = 0; i < batch_size(); i++) {
1779         for (size_t y = 0; y < output_height(); y++) {
1780           for (size_t x = 0; x < output_width(); x++) {
1781             for (size_t g = 0; g < groups(); g++) {
1782               for (size_t c = 0; c < group_output_channels(); c++) {
1783                 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
1784                      << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1785                 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
1786                      << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1787                 ASSERT_NEAR(
1788                     output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1789                     double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1790                     0.9)
1791                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1792               }
1793             }
1794           }
1795         }
1796       }
1797 
1798       // Re-generate data for the second run.
1799       std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
1800       std::fill(output.begin(), output.end(), UINT8_C(0xA5));
1801 
1802       // Compute reference results for the second run, including renormalization.
1803       if (has_bias()) {
1804         for (size_t i = 0; i < next_batch_size(); i++) {
1805           for (size_t oy = 0; oy < next_output_height(); oy++) {
1806             for (size_t ox = 0; ox < next_output_width(); ox++) {
1807               for (size_t g = 0; g < groups(); g++) {
1808                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1809                   next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1810                     bias[g * group_output_channels() + oc];
1811                 }
1812               }
1813             }
1814           }
1815         }
1816       } else {
1817         std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
1818       }
1819       for (size_t i = 0; i < next_batch_size(); i++) {
1820         for (size_t oy = 0; oy < next_output_height(); oy++) {
1821           for (size_t ox = 0; ox < next_output_width(); ox++) {
1822             for (size_t ky = 0; ky < kernel_height(); ky++) {
1823               const size_t y = oy + padding_top() - ky * dilation_height();
1824               const size_t iy = y / stride_height();
1825               if (iy * stride_height() == y && iy < next_input_height()) {
1826                 for (size_t kx = 0; kx < kernel_width(); kx++) {
1827                   const size_t x = ox + padding_left() - kx * dilation_width();
1828                   const size_t ix = x / stride_width();
1829                   if (ix * stride_width() == x && ix < next_input_width()) {
1830                     for (size_t g = 0; g < groups(); g++) {
1831                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
1832                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
1833                           next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1834                             (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1835                             (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
1836                         }
1837                       }
1838                     }
1839                   }
1840                 }
1841               }
1842             }
1843           }
1844         }
1845       }
1846       std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
1847         [this, output_scale, output_zero_point](int32_t x) -> double {
1848           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
1849         });
1850 
1851       // Setup and run Deconvolution operator the second time, and destroy the operator.
1852       ASSERT_EQ(xnn_status_success,
1853         xnn_setup_deconvolution2d_nhwc_qu8(
1854           deconvolution_op,
1855           next_batch_size(), next_input_height(), next_input_width(),
1856           adjustment_height(), adjustment_width(),
1857           input.data(), output.data(),
1858           nullptr /* thread pool */));
1859 
1860       ASSERT_EQ(xnn_status_success,
1861         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1862 
1863       // Verify results of the second run.
1864       for (size_t i = 0; i < next_batch_size(); i++) {
1865         for (size_t y = 0; y < next_output_height(); y++) {
1866           for (size_t x = 0; x < next_output_width(); x++) {
1867             for (size_t g = 0; g < groups(); g++) {
1868               for (size_t c = 0; c < group_output_channels(); c++) {
1869                 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
1870                      << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1871                 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
1872                      << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1873                 ASSERT_NEAR(
1874                     next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
1875                     double(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1876                     0.9)
1877                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1878               }
1879             }
1880           }
1881         }
1882       }
1883     }
1884   }
1885 
1886   void TestSetupF16() const {
1887     ASSERT_EQ(weights_type(), WeightsType::Default);
1888 
1889     std::random_device random_device;
1890     auto rng = std::mt19937(random_device());
1891     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
1892 
1893     std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max(
1894       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels(),
1895       (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
1896     std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1897     std::vector<uint16_t> bias(groups() * group_output_channels());
1898     std::vector<uint16_t> output(std::max(
1899       (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels(),
1900       (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
1901     std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1902     std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1903 
1904     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1905       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1906       std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1907       std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1908       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1909 
1910       // Compute reference results, without clamping.
1911       if (has_bias()) {
1912         for (size_t i = 0; i < batch_size(); i++) {
1913           for (size_t oy = 0; oy < output_height(); oy++) {
1914             for (size_t ox = 0; ox < output_width(); ox++) {
1915               for (size_t g = 0; g < groups(); g++) {
1916                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1917                   output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1918                     fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
1919                 }
1920               }
1921             }
1922           }
1923         }
1924       } else {
1925         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1926       }
1927       for (size_t i = 0; i < batch_size(); i++) {
1928         for (size_t oy = 0; oy < output_height(); oy++) {
1929           for (size_t ox = 0; ox < output_width(); ox++) {
1930             for (size_t ky = 0; ky < kernel_height(); ky++) {
1931               const size_t y = oy + padding_top() - ky * dilation_height();
1932               const size_t iy = y / stride_height();
1933               if (iy * stride_height() == y && iy < input_height()) {
1934                 for (size_t kx = 0; kx < kernel_width(); kx++) {
1935                   const size_t x = ox + padding_left() - kx * dilation_width();
1936                   const size_t ix = x / stride_width();
1937                   if (ix * stride_width() == x && ix < input_width()) {
1938                     for (size_t g = 0; g < groups(); g++) {
1939                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
1940                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
1941                           output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1942                             fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) *
1943                             fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1944                         }
1945                       }
1946                     }
1947                   }
1948                 }
1949               }
1950             }
1951           }
1952         }
1953       }
1954 
1955       // Compute clamping parameters.
1956       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1957       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1958       const float accumulated_range = accumulated_max - accumulated_min;
1959       float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
1960       float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
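      // The clamping bounds are rounded to the nearest fp16-representable values, since the
      // operator computes in half precision; a degenerate (zero-width) accumulator range or
      // qmin()/qmax() at their extremes disables the corresponding bound.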
1961       output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
1962       output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
1963       if (accumulated_range == 0.0f) {
1964         output_min = -std::numeric_limits<float>::infinity();
1965         output_max = +std::numeric_limits<float>::infinity();
1966       }
1967       if (qmin() == std::numeric_limits<uint8_t>::min()) {
1968         output_min = -std::numeric_limits<float>::infinity();
1969       }
1970       if (qmax() == std::numeric_limits<uint8_t>::max()) {
1971         output_max = +std::numeric_limits<float>::infinity();
1972       }
1973 
1974       // Clamp reference results.
1975       for (float& value : output_ref) {
1976         value = std::max(std::min(value, output_max), output_min);
1977       }
1978 
1979       // Create, setup, and run Deconvolution operator once.
1980       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1981       xnn_operator_t deconvolution_op = nullptr;
1982 
1983       const xnn_status status = xnn_create_deconvolution2d_nhwc_f16(
1984         padding_top(), padding_right(), padding_bottom(), padding_left(),
1985         kernel_height(), kernel_width(),
1986         stride_height(), stride_width(),
1987         dilation_height(), dilation_width(),
1988         groups(), group_input_channels(), group_output_channels(),
1989         input_pixel_stride(), output_pixel_stride(),
1990         kernel.data(), has_bias() ? bias.data() : nullptr,
1991         output_min, output_max,
1992         0, NULL, &deconvolution_op);
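      // F16 deconvolution may not be supported on the current hardware; in that case the test is
      // skipped rather than failed.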
1993       if (status == xnn_status_unsupported_hardware) {
1994         GTEST_SKIP();
1995       }
1996       ASSERT_EQ(xnn_status_success, status);
1997       ASSERT_NE(nullptr, deconvolution_op);
1998 
1999       // Smart pointer to automatically delete deconvolution_op.
2000       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
2001 
2002       ASSERT_EQ(xnn_status_success,
2003         xnn_setup_deconvolution2d_nhwc_f16(
2004           deconvolution_op,
2005           batch_size(), input_height(), input_width(),
2006           adjustment_height(), adjustment_width(),
2007           input.data(), output.data(),
2008           nullptr /* thread pool */));
2009 
2010       ASSERT_EQ(xnn_status_success,
2011         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
2012 
2013       // Verify results of the first run.
2014       for (size_t i = 0; i < batch_size(); i++) {
2015         for (size_t y = 0; y < output_height(); y++) {
2016           for (size_t x = 0; x < output_width(); x++) {
2017             for (size_t g = 0; g < groups(); g++) {
2018               for (size_t c = 0; c < group_output_channels(); c++) {
2019                 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_min)
2020                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2021                 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_max)
2022                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2023                 ASSERT_NEAR(
2024                     fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]),
2025                     output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
2026                     1.0e-2f * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
2027                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2028               }
2029             }
2030           }
2031         }
2032       }
2033 
2034       // Re-generate data for the second run.
2035       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
2036       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
2037 
2038       // Compute reference results for the second run, including clamping.
2039       if (has_bias()) {
2040         for (size_t i = 0; i < next_batch_size(); i++) {
2041           for (size_t oy = 0; oy < next_output_height(); oy++) {
2042             for (size_t ox = 0; ox < next_output_width(); ox++) {
2043               for (size_t g = 0; g < groups(); g++) {
2044                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2045                   next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2046                     fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
2047                 }
2048               }
2049             }
2050           }
2051         }
2052       } else {
2053         std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
2054       }
2055       for (size_t i = 0; i < next_batch_size(); i++) {
2056         for (size_t oy = 0; oy < next_output_height(); oy++) {
2057           for (size_t ox = 0; ox < next_output_width(); ox++) {
2058             for (size_t ky = 0; ky < kernel_height(); ky++) {
2059               const size_t y = oy + padding_top() - ky * dilation_height();
2060               const size_t iy = y / stride_height();
2061               if (iy * stride_height() == y && iy < next_input_height()) {
2062                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2063                   const size_t x = ox + padding_left() - kx * dilation_width();
2064                   const size_t ix = x / stride_width();
2065                   if (ix * stride_width() == x && ix < next_input_width()) {
2066                     for (size_t g = 0; g < groups(); g++) {
2067                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2068                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2069                           next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2070                             fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) *
2071                             fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
2072                         }
2073                       }
2074                     }
2075                   }
2076                 }
2077               }
2078             }
2079           }
2080         }
2081       }
2082       for (float& value : next_output_ref) {
2083         value = std::max(std::min(value, output_max), output_min);
2084       }
2085 
2086       // Setup and run Deconvolution operator the second time, and destroy the operator.
2087       ASSERT_EQ(xnn_status_success,
2088         xnn_setup_deconvolution2d_nhwc_f16(
2089           deconvolution_op,
2090           next_batch_size(), next_input_height(), next_input_width(),
2091           adjustment_height(), adjustment_width(),
2092           input.data(), output.data(),
2093           nullptr /* thread pool */));
2094 
2095       ASSERT_EQ(xnn_status_success,
2096         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
2097 
2098       // Verify results of the second run.
2099       for (size_t i = 0; i < next_batch_size(); i++) {
2100         for (size_t y = 0; y < next_output_height(); y++) {
2101           for (size_t x = 0; x < next_output_width(); x++) {
2102             for (size_t g = 0; g < groups(); g++) {
2103               for (size_t c = 0; c < group_output_channels(); c++) {
2104                 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_min)
2105                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2106                 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_max)
2107                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2108                 ASSERT_NEAR(
2109                     fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]),
2110                     next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2111                     1.0e-2f * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
2112                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2113               }
2114             }
2115           }
2116         }
2117       }
2118     }
2119   }
2120 
2121   void TestSetupF32() const {
2122     ASSERT_EQ(weights_type(), WeightsType::Default);
2123 
2124     std::random_device random_device;
2125     auto rng = std::mt19937(random_device());
2126     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
2127 
2128     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max(
2129       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels(),
2130       (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
2131     std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2132     std::vector<float> bias(groups() * group_output_channels());
2133     std::vector<float> output(std::max(
2134       (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels(),
2135       (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
2136     std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2137     std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2138 
2139     for (size_t iteration = 0; iteration < iterations(); iteration++) {
2140       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
2141       std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
2142       std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
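           // Prefill the output with NaN so that stale or unwritten elements trip the checks below.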
2143       std::fill(output.begin(), output.end(), nanf(""));
2144 
2145       // Compute reference results, without clamping.
2146       if (has_bias()) {
2147         for (size_t i = 0; i < batch_size(); i++) {
2148           for (size_t oy = 0; oy < output_height(); oy++) {
2149             for (size_t ox = 0; ox < output_width(); ox++) {
2150               for (size_t g = 0; g < groups(); g++) {
2151                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2152                   output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2153                     bias[g * group_output_channels() + oc];
2154                 }
2155               }
2156             }
2157           }
2158         }
2159       } else {
2160         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
2161       }
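           // Reference transposed convolution: output pixel (ox, oy) accumulates input pixel (ix, iy)
           // only when oy + padding_top() - ky * dilation_height() is an exact, non-negative multiple
           // of stride_height() (and likewise along x). Negative intermediates wrap around in the
           // unsigned arithmetic and are then rejected by the bounds checks.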
2162       for (size_t i = 0; i < batch_size(); i++) {
2163         for (size_t oy = 0; oy < output_height(); oy++) {
2164           for (size_t ox = 0; ox < output_width(); ox++) {
2165             for (size_t ky = 0; ky < kernel_height(); ky++) {
2166               const size_t y = oy + padding_top() - ky * dilation_height();
2167               const size_t iy = y / stride_height();
2168               if (iy * stride_height() == y && iy < input_height()) {
2169                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2170                   const size_t x = ox + padding_left() - kx * dilation_width();
2171                   const size_t ix = x / stride_width();
2172                   if (ix * stride_width() == x && ix < input_width()) {
2173                     for (size_t g = 0; g < groups(); g++) {
2174                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2175                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2176                           output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2177                             input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
2178                             kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
2179                         }
2180                       }
2181                     }
2182                   }
2183                 }
2184               }
2185             }
2186           }
2187         }
2188       }
2189 
2190       // Compute clamping parameters.
2191       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
2192       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
2193 
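           // qmin()/qmax() select what fraction of the observed accumulator range gets clipped,
           // so the operator's fused output_min/output_max clamping is actually exercised.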
2194       const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
2195       const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
2196 
2197       // Clamp reference results.
2198       for (float& value : output_ref) {
2199         value = std::max(std::min(value, output_max), output_min);
2200       }
2201 
2202       // Create, set up, and run the Deconvolution operator once.
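           // XNNPACK must be initialized before operators can be created; calling xnn_initialize
           // again on later iterations simply succeeds.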
2203       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
2204       xnn_operator_t deconvolution_op = nullptr;
2205 
2206       ASSERT_EQ(xnn_status_success,
2207         xnn_create_deconvolution2d_nhwc_f32(
2208           padding_top(), padding_right(), padding_bottom(), padding_left(),
2209           kernel_height(), kernel_width(),
2210           stride_height(), stride_width(),
2211           dilation_height(), dilation_width(),
2212           groups(), group_input_channels(), group_output_channels(),
2213           input_pixel_stride(), output_pixel_stride(),
2214           kernel.data(), has_bias() ? bias.data() : nullptr,
2215           output_min, output_max,
2216           0 /* flags */, nullptr /* caches */, &deconvolution_op));
2217 
2218       // Smart pointer to automatically delete deconvolution_op, including when a failed ASSERT below returns from the test early.
2219       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
2220 
2221       ASSERT_EQ(xnn_status_success,
2222         xnn_setup_deconvolution2d_nhwc_f32(
2223           deconvolution_op,
2224           batch_size(), input_height(), input_width(),
2225           adjustment_height(), adjustment_width(),
2226           input.data(), output.data(),
2227           nullptr /* thread pool */));
2228 
2229       ASSERT_EQ(xnn_status_success,
2230         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
2231 
2232       // Verify results of the first run.
2233       for (size_t i = 0; i < batch_size(); i++) {
2234         for (size_t y = 0; y < output_height(); y++) {
2235           for (size_t x = 0; x < output_width(); x++) {
2236             for (size_t g = 0; g < groups(); g++) {
2237               for (size_t c = 0; c < group_output_channels(); c++) {
2238                 ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
2239                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2240                 ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
2241                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
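                     // Single precision should track the reference closely; allow a 1e-4 relative tolerance.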
2242                 ASSERT_NEAR(
2243                     output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
2244                     output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
2245                     1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
2246                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2247               }
2248             }
2249           }
2250         }
2251       }
2252 
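           // The same operator object is reused: setting it up again with the next_* shapes verifies
           // that an already-created operator handles resized inputs correctly.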
2253       // Re-generate data for the second run.
2254       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
2255       std::fill(output.begin(), output.end(), nanf(""));
2256 
2257       // Compute reference results for the second run, including clamping.
2258       if (has_bias()) {
2259         for (size_t i = 0; i < next_batch_size(); i++) {
2260           for (size_t oy = 0; oy < next_output_height(); oy++) {
2261             for (size_t ox = 0; ox < next_output_width(); ox++) {
2262               for (size_t g = 0; g < groups(); g++) {
2263                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2264                   next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2265                     bias[g * group_output_channels() + oc];
2266                 }
2267               }
2268             }
2269           }
2270         }
2271       } else {
2272         std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
2273       }
2274       for (size_t i = 0; i < next_batch_size(); i++) {
2275         for (size_t oy = 0; oy < next_output_height(); oy++) {
2276           for (size_t ox = 0; ox < next_output_width(); ox++) {
2277             for (size_t ky = 0; ky < kernel_height(); ky++) {
2278               const size_t y = oy + padding_top() - ky * dilation_height();
2279               const size_t iy = y / stride_height();
2280               if (iy * stride_height() == y && iy < next_input_height()) {
2281                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2282                   const size_t x = ox + padding_left() - kx * dilation_width();
2283                   const size_t ix = x / stride_width();
2284                   if (ix * stride_width() == x && ix < next_input_width()) {
2285                     for (size_t g = 0; g < groups(); g++) {
2286                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2287                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2288                           next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2289                             input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
2290                             kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
2291                         }
2292                       }
2293                     }
2294                   }
2295                 }
2296               }
2297             }
2298           }
2299         }
2300       }
2301       for (float& value : next_output_ref) {
2302         value = std::max(std::min(value, output_max), output_min);
2303       }
2304 
2305       // Set up and run the Deconvolution operator a second time; the operator is destroyed at the end of the iteration.
2306       ASSERT_EQ(xnn_status_success,
2307         xnn_setup_deconvolution2d_nhwc_f32(
2308           deconvolution_op,
2309           next_batch_size(), next_input_height(), next_input_width(),
2310           adjustment_height(), adjustment_width(),
2311           input.data(), output.data(),
2312           nullptr /* thread pool */));
2313 
2314       ASSERT_EQ(xnn_status_success,
2315         xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
2316 
2317       // Verify results of the second run.
2318       for (size_t i = 0; i < next_batch_size(); i++) {
2319         for (size_t y = 0; y < next_output_height(); y++) {
2320           for (size_t x = 0; x < next_output_width(); x++) {
2321             for (size_t g = 0; g < groups(); g++) {
2322               for (size_t c = 0; c < group_output_channels(); c++) {
2323                 ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
2324                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2325                 ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
2326                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2327                 ASSERT_NEAR(
2328                     next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2329                     output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
2330                     1.0e-4 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
2331                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2332               }
2333             }
2334           }
2335         }
2336       }
2337     }
2338   }
2339 
2340  private:
2341   uint32_t padding_top_{0};
2342   uint32_t padding_right_{0};
2343   uint32_t padding_bottom_{0};
2344   uint32_t padding_left_{0};
2345   size_t input_height_{1};
2346   size_t input_width_{1};
2347   uint32_t groups_{1};
2348   size_t group_input_channels_{1};
2349   size_t input_pixel_stride_{0};
2350   size_t group_output_channels_{1};
2351   size_t output_pixel_stride_{0};
2352   size_t batch_size_{1};
2353   uint32_t kernel_height_{1};
2354   uint32_t kernel_width_{1};
2355   uint32_t adjustment_height_{0};
2356   uint32_t adjustment_width_{0};
2357   uint32_t dilation_height_{1};
2358   uint32_t dilation_width_{1};
2359   uint32_t stride_height_{1};
2360   uint32_t stride_width_{1};
2361   size_t next_input_height_{0};
2362   size_t next_input_width_{0};
2363   size_t next_batch_size_{0};
2364   uint8_t qmin_{0};
2365   uint8_t qmax_{255};
2366   bool has_bias_{true};
2367   WeightsType weights_type_{WeightsType::Default};
2368   bool use_weights_cache_{false};
2369   bool stress_weights_cache_{false};
2370   size_t iterations_{1};
2371 };
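
// Illustrative usage sketch (a hypothetical test; setter names are assumed from the member fields
// above and from how the Test* methods read them; the real tests in test/deconvolution-nhwc.cc
// may differ):
//
//   TEST(DECONVOLUTION_NHWC_F32, setup_changing_batch) {
//     DeconvolutionOperatorTester()
//       .batch_size(2)
//       .next_batch_size(4)
//       .input_height(5)
//       .input_width(7)
//       .kernel_height(3)
//       .kernel_width(3)
//       .group_input_channels(15)
//       .group_output_channels(17)
//       .TestSetupF32();
//   }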
2372