// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/cache.h>

namespace {

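// doz(a, b) is the saturating "difference or zero" helper: it returns a - b
// when a > b and 0 otherwise, avoiding unsigned wrap-around in size/padding math.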
template<class T>
inline T doz(T a, T b) {
  return a > b ? a - b : T(0);
}

}  // namespace

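// Builder-style helper used by the deconvolution operator tests: each setter
// returns *this so parameters can be chained, and TestF32()/TestF16()/TestQS8()/
// TestQU8() run the operator against a naive reference implementation.
// A minimal usage sketch (the parameter values below are illustrative only):
//
//   DeconvolutionOperatorTester()
//     .input_size(10, 9)
//     .kernel_size(3, 3)
//     .stride(2)
//     .group_input_channels(15)
//     .group_output_channels(17)
//     .iterations(3)
//     .TestF32();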
class DeconvolutionOperatorTester {
 public:
  enum class WeightsType {
    Default,
    FP32,
  };

  inline DeconvolutionOperatorTester& padding(uint32_t padding) {
    this->padding_top_ = padding;
    this->padding_right_ = padding;
    this->padding_bottom_ = padding;
    this->padding_left_ = padding;
    return *this;
  }

  inline DeconvolutionOperatorTester& padding_height(uint32_t padding_height) {
    this->padding_top_ = padding_height;
    this->padding_bottom_ = padding_height;
    return *this;
  }

  inline uint32_t padding_height() const {
    return this->padding_top_ + this->padding_bottom_;
  }

  inline DeconvolutionOperatorTester& padding_width(uint32_t padding_width) {
    this->padding_right_ = padding_width;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline uint32_t padding_width() const {
    return this->padding_left_ + this->padding_right_;
  }

  inline DeconvolutionOperatorTester& padding_top(uint32_t padding_top) {
    this->padding_top_ = padding_top;
    return *this;
  }

  inline uint32_t padding_top() const { return this->padding_top_; }

  inline DeconvolutionOperatorTester& padding_right(uint32_t padding_right) {
    this->padding_right_ = padding_right;
    return *this;
  }

  inline uint32_t padding_right() const { return this->padding_right_; }

  inline DeconvolutionOperatorTester& padding_bottom(uint32_t padding_bottom) {
    this->padding_bottom_ = padding_bottom;
    return *this;
  }

  inline uint32_t padding_bottom() const { return this->padding_bottom_; }

  inline DeconvolutionOperatorTester& padding_left(uint32_t padding_left) {
    this->padding_left_ = padding_left;
    return *this;
  }

  inline uint32_t padding_left() const { return this->padding_left_; }

  inline DeconvolutionOperatorTester& adjustment_height(uint32_t adjustment_height) {
    this->adjustment_height_ = adjustment_height;
    return *this;
  }

  inline uint32_t adjustment_height() const {
    return this->adjustment_height_;
  }

  inline DeconvolutionOperatorTester& adjustment_width(uint32_t adjustment_width) {
    this->adjustment_width_ = adjustment_width;
    return *this;
  }

  inline uint32_t adjustment_width() const {
    return this->adjustment_width_;
  }

  inline DeconvolutionOperatorTester& input_size(uint32_t input_height, uint32_t input_width) {
    assert(input_height >= 1);
    assert(input_width >= 1);
    this->input_height_ = input_height;
    this->input_width_ = input_width;
    return *this;
  }

  inline DeconvolutionOperatorTester& input_height(uint32_t input_height) {
    assert(input_height >= 1);
    this->input_height_ = input_height;
    return *this;
  }

  inline uint32_t input_height() const {
    return this->input_height_;
  }

  inline DeconvolutionOperatorTester& input_width(uint32_t input_width) {
    assert(input_width >= 1);
    this->input_width_ = input_width;
    return *this;
  }

  inline uint32_t input_width() const {
    return this->input_width_;
  }

  inline DeconvolutionOperatorTester& groups(uint32_t groups) {
    assert(groups >= 1);
    this->groups_ = groups;
    return *this;
  }

  inline uint32_t groups() const {
    return this->groups_;
  }

  inline DeconvolutionOperatorTester& group_input_channels(size_t group_input_channels) {
    assert(group_input_channels >= 1);
    this->group_input_channels_ = group_input_channels;
    return *this;
  }

  inline size_t group_input_channels() const {
    return this->group_input_channels_;
  }

  inline DeconvolutionOperatorTester& group_output_channels(size_t group_output_channels) {
    assert(group_output_channels >= 1);
    this->group_output_channels_ = group_output_channels;
    return *this;
  }

  inline size_t group_output_channels() const {
    return this->group_output_channels_;
  }

  inline DeconvolutionOperatorTester& batch_size(size_t batch_size) {
    assert(batch_size >= 1);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline DeconvolutionOperatorTester& kernel_size(uint32_t kernel_size) {
    assert(kernel_size >= 1);
    this->kernel_height_ = kernel_size;
    this->kernel_width_ = kernel_size;
    return *this;
  }

  inline DeconvolutionOperatorTester& kernel_size(uint32_t kernel_height, uint32_t kernel_width) {
    assert(kernel_height >= 1);
    assert(kernel_width >= 1);
    this->kernel_height_ = kernel_height;
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline DeconvolutionOperatorTester& kernel_height(uint32_t kernel_height) {
    assert(kernel_height >= 1);
    this->kernel_height_ = kernel_height;
    return *this;
  }

  inline uint32_t kernel_height() const {
    return this->kernel_height_;
  }

  inline DeconvolutionOperatorTester& kernel_width(uint32_t kernel_width) {
    assert(kernel_width >= 1);
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline uint32_t kernel_width() const {
    return this->kernel_width_;
  }

  inline DeconvolutionOperatorTester& dilation(uint32_t dilation) {
    assert(dilation >= 1);
    this->dilation_height_ = dilation;
    this->dilation_width_ = dilation;
    return *this;
  }

  inline DeconvolutionOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
    assert(dilation_height >= 1);
    assert(dilation_width >= 1);
    this->dilation_height_ = dilation_height;
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline DeconvolutionOperatorTester& dilation_height(uint32_t dilation_height) {
    assert(dilation_height >= 1);
    this->dilation_height_ = dilation_height;
    return *this;
  }

  inline uint32_t dilation_height() const {
    return this->dilation_height_;
  }

  inline DeconvolutionOperatorTester& dilation_width(uint32_t dilation_width) {
    assert(dilation_width >= 1);
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline uint32_t dilation_width() const {
    return this->dilation_width_;
  }

  inline DeconvolutionOperatorTester& stride(uint32_t stride) {
    assert(stride >= 1);
    this->stride_height_ = stride;
    this->stride_width_ = stride;
    return *this;
  }

  inline DeconvolutionOperatorTester& stride(uint32_t stride_height, uint32_t stride_width) {
    assert(stride_height >= 1);
    assert(stride_width >= 1);
    this->stride_height_ = stride_height;
    this->stride_width_ = stride_width;
    return *this;
  }

  inline DeconvolutionOperatorTester& stride_height(uint32_t stride_height) {
    assert(stride_height >= 1);
    this->stride_height_ = stride_height;
    return *this;
  }

  inline uint32_t stride_height() const {
    return this->stride_height_;
  }

  inline DeconvolutionOperatorTester& stride_width(uint32_t stride_width) {
    assert(stride_width >= 1);
    this->stride_width_ = stride_width;
    return *this;
  }

  inline uint32_t stride_width() const {
    return this->stride_width_;
  }

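  // Pixel strides default to the packed channel count when left unset: a stored
  // value of 0 is interpreted below as groups() * group_input_channels() (or
  // group_output_channels()), i.e. densely packed NHWC pixels.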
  inline DeconvolutionOperatorTester& input_pixel_stride(size_t input_pixel_stride) {
    assert(input_pixel_stride >= 1);
    this->input_pixel_stride_ = input_pixel_stride;
    return *this;
  }

  inline size_t input_pixel_stride() const {
    if (this->input_pixel_stride_ == 0) {
      return group_input_channels() * groups();
    } else {
      assert(this->input_pixel_stride_ >= group_input_channels() * groups());
      return this->input_pixel_stride_;
    }
  }

  inline DeconvolutionOperatorTester& output_pixel_stride(size_t output_pixel_stride) {
    assert(output_pixel_stride >= 1);
    this->output_pixel_stride_ = output_pixel_stride;
    return *this;
  }

  inline size_t output_pixel_stride() const {
    if (this->output_pixel_stride_ == 0) {
      return group_output_channels() * groups();
    } else {
      assert(this->output_pixel_stride_ >= group_output_channels() * groups());
      return this->output_pixel_stride_;
    }
  }

  inline uint32_t dilated_kernel_height() const {
    return (kernel_height() - 1) * dilation_height() + 1;
  }

  inline uint32_t dilated_kernel_width() const {
    return (kernel_width() - 1) * dilation_width() + 1;
  }

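  // Deconvolution (transposed convolution) output size, per dimension:
  //   output = stride * (input - 1) + adjustment + dilated_kernel - total_padding
  // where dilated_kernel = (kernel - 1) * dilation + 1.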
  inline size_t output_height() const {
    return stride_height() * (input_height() - 1) + adjustment_height() + dilated_kernel_height() - padding_height();
  }

  inline size_t output_width() const {
    return stride_width() * (input_width() - 1) + adjustment_width() + dilated_kernel_width() - padding_width();
  }

  inline DeconvolutionOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
    assert(next_input_height >= 1);
    assert(next_input_width >= 1);
    this->next_input_height_ = next_input_height;
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline DeconvolutionOperatorTester& next_input_height(uint32_t next_input_height) {
    assert(next_input_height >= 1);
    this->next_input_height_ = next_input_height;
    return *this;
  }

  inline uint32_t next_input_height() const {
    if (this->next_input_height_ == 0) {
      return input_height();
    } else {
      return this->next_input_height_;
    }
  }

  inline DeconvolutionOperatorTester& next_input_width(uint32_t next_input_width) {
    assert(next_input_width >= 1);
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline uint32_t next_input_width() const {
    if (this->next_input_width_ == 0) {
      return input_width();
    } else {
      return this->next_input_width_;
    }
  }

  inline size_t next_output_height() const {
    return stride_height() * (next_input_height() - 1) + adjustment_height() + dilated_kernel_height() - padding_height();
  }

  inline size_t next_output_width() const {
    return stride_width() * (next_input_width() - 1) + adjustment_width() + dilated_kernel_width() - padding_width();
  }

  inline DeconvolutionOperatorTester& next_batch_size(size_t next_batch_size) {
    assert(next_batch_size >= 1);
    this->next_batch_size_ = next_batch_size;
    return *this;
  }

  inline size_t next_batch_size() const {
    if (this->next_batch_size_ == 0) {
      return batch_size();
    } else {
      return this->next_batch_size_;
    }
  }

  inline DeconvolutionOperatorTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline DeconvolutionOperatorTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline DeconvolutionOperatorTester& has_bias(bool has_bias) {
    this->has_bias_ = has_bias;
    return *this;
  }

  inline bool has_bias() const {
    return this->has_bias_;
  }

  inline DeconvolutionOperatorTester& weights_type(WeightsType weights_type) {
    this->weights_type_ = weights_type;
    return *this;
  }

  inline WeightsType weights_type() const {
    return this->weights_type_;
  }

  inline DeconvolutionOperatorTester& use_weights_cache(bool use_weights_cache) {
    this->use_weights_cache_ = use_weights_cache;
    return *this;
  }

  inline bool use_weights_cache() const {
    return this->use_weights_cache_;
  }

  inline DeconvolutionOperatorTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  void TestQS8() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
    std::uniform_int_distribution<int32_t> w8dist(
      -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());

    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
    std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<int8_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    const int8_t input_zero_point = 1;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      // Compute reference results, without renormalization.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(accumulators.begin(), accumulators.end(), 0);
      }
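      // Reference transposed convolution: each output pixel (oy, ox) gathers
      // contributions from kernel taps (ky, kx) whose source position maps back
      // to a valid integer input coordinate, i.e. (oy + padding_top - ky * dilation)
      // must be an exact multiple of the stride and land inside the input.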
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t y = oy + padding_top() - ky * dilation_height();
              const size_t iy = y / stride_height();
              if (iy * stride_height() == y && iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t x = ox + padding_left() - kx * dilation_width();
                  const size_t ix = x / stride_width();
                  if (ix * stride_width() == x && ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                            int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

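      // The output scale spreads the observed accumulator range over the 255
      // representable quantized steps; the zero point recenters that range so
      // its midpoint maps near zero, clamped to the int8 domain.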
      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const int8_t output_zero_point = int8_t(std::max(std::min(
        lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
        });

      // Create, setup, run, and destroy Deconvolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t deconvolution_op = nullptr;

      xnn_caches caches = {
        .code_cache = NULL,
        .weights_cache = NULL,
      };
      xnn_weights_cache weights_cache;
      if (use_weights_cache()) {
        xnn_init_weights_cache(&weights_cache);
        caches.weights_cache = &weights_cache;
      }

      ASSERT_EQ(
        xnn_status_success,
        xnn_create_deconvolution2d_nhwc_qs8(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          kernel_height(), kernel_width(), stride_height(), stride_width(),
          dilation_height(), dilation_width(), groups(),
          group_input_channels(), group_output_channels(),
          input_pixel_stride(), output_pixel_stride(), input_zero_point,
          1.0f /* input scale */, 1.0f /* kernel scale */, kernel.data(),
          has_bias() ? bias.data() : nullptr, output_zero_point,
          output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
          /*flags=*/0, &caches, &deconvolution_op));

      if (use_weights_cache()) {
        ASSERT_EQ(xnn_status_success,
                  xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
      }
      // Smart pointer to automatically delete deconvolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_deconvolution2d_nhwc_qs8(
          deconvolution_op,
          batch_size(), input_height(), input_width(),
          adjustment_height(), adjustment_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(deconvolution_op, nullptr /* thread pool */));

      VerifyQS8(output, output_ref, output_zero_point);

      if (use_weights_cache()) {
        xnn_operator_t deconvolution_op2 = nullptr;
        size_t old_weights_cache_size = weights_cache.cache.weights.size;

        ASSERT_EQ(
          xnn_status_success,
          xnn_create_deconvolution2d_nhwc_qs8(
            padding_top(), padding_right(), padding_bottom(), padding_left(),
            kernel_height(), kernel_width(), stride_height(), stride_width(),
            dilation_height(), dilation_width(), groups(),
            group_input_channels(), group_output_channels(),
            input_pixel_stride(), output_pixel_stride(), input_zero_point,
            1.0f /* input scale */, 1.0f /* kernel scale */, kernel.data(),
            has_bias() ? bias.data() : nullptr, output_zero_point,
            output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
            /*flags=*/0, &caches, &deconvolution_op2));

        // Smart pointer to automatically delete deconvolution_op2.
        std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op2, xnn_delete_operator);
        std::vector<int8_t> output2(output.size(), INT8_C(0xA5));

        ASSERT_EQ(xnn_status_success,
          xnn_setup_deconvolution2d_nhwc_qs8(
            deconvolution_op2,
            batch_size(), input_height(), input_width(),
            adjustment_height(), adjustment_width(),
            input.data(), output2.data(),
            nullptr /* thread pool */));

        ASSERT_EQ(xnn_status_success,
          xnn_run_operator(deconvolution_op2, nullptr /* thread pool */));

        VerifyWeightsCache(&weights_cache, old_weights_cache_size);
        VerifyQS8(output2, output_ref, output_zero_point);
        xnn_release_weights_cache(&weights_cache);
      }
    }
  }

  void VerifyQS8(const std::vector<int8_t> &output,
                 const std::vector<double> &output_ref,
                 int8_t output_zero_point) const {
    for (size_t i = 0; i < batch_size(); i++) {
      for (size_t y = 0; y < output_height(); y++) {
        for (size_t x = 0; x < output_width(); x++) {
          for (size_t g = 0; g < groups(); g++) {
            for (size_t c = 0; c < group_output_channels(); c++) {
              ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_NEAR(
                output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                0.9)
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
            }
          }
        }
      }
    }
  }

  void VerifyWeightsCache(xnn_weights_cache* weights_cache, size_t old_size) const {
    ASSERT_EQ(weights_cache->cache.hits, 1);
    // Ensure that we did not write more weights to the cache because it was a cache hit.
    ASSERT_EQ(old_size, weights_cache->cache.weights.size);
  }

  void TestQU8() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
    std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<uint8_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    const uint8_t input_zero_point = 127;
    const uint8_t kernel_zero_point = 127;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(output.begin(), output.end(), UINT8_C(0xA5));

      // Compute reference results, without renormalization.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(accumulators.begin(), accumulators.end(), 0);
      }
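      // Same reference transposed convolution as in TestQS8, except that both
      // the input and the kernel are offset by their zero points before the
      // int32 multiply-accumulate.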
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t y = oy + padding_top() - ky * dilation_height();
              const size_t iy = y / stride_height();
              if (iy * stride_height() == y && iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t x = ox + padding_left() - kx * dilation_width();
                  const size_t ix = x / stride_width();
                  if (ix * stride_width() == x && ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                            (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
        });

      // Create, setup, run, and destroy Deconvolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t deconvolution_op = nullptr;

      xnn_caches caches = {
        .code_cache = NULL,
        .weights_cache = NULL,
      };
      xnn_weights_cache weights_cache;
      if (use_weights_cache()) {
        xnn_init_weights_cache(&weights_cache);
        caches.weights_cache = &weights_cache;
      }

      ASSERT_EQ(
        xnn_status_success,
        xnn_create_deconvolution2d_nhwc_qu8(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          kernel_height(), kernel_width(), stride_height(), stride_width(),
          dilation_height(), dilation_width(), groups(),
          group_input_channels(), group_output_channels(),
          input_pixel_stride(), output_pixel_stride(), input_zero_point,
          1.0f /* input scale */, kernel_zero_point,
          1.0f /* kernel scale */, kernel.data(),
          has_bias() ? bias.data() : nullptr, output_zero_point,
          output_scale, qmin(), qmax(),
          /*flags=*/0, &caches, &deconvolution_op));

      if (use_weights_cache()) {
        ASSERT_EQ(xnn_status_success,
                  xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
      }
      // Smart pointer to automatically delete deconvolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_deconvolution2d_nhwc_qu8(
          deconvolution_op,
          batch_size(), input_height(), input_width(),
          adjustment_height(), adjustment_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(deconvolution_op, nullptr /* thread pool */));

      // Verify results.
      VerifyQU8(output, output_ref, output_zero_point);

      if (use_weights_cache()) {
        xnn_operator_t deconvolution_op2 = nullptr;
        size_t old_weights_cache_size = weights_cache.cache.weights.size;

        ASSERT_EQ(
          xnn_status_success,
          xnn_create_deconvolution2d_nhwc_qu8(
            padding_top(), padding_right(), padding_bottom(), padding_left(),
            kernel_height(), kernel_width(), stride_height(), stride_width(),
            dilation_height(), dilation_width(), groups(),
            group_input_channels(), group_output_channels(),
            input_pixel_stride(), output_pixel_stride(), input_zero_point,
            1.0f /* input scale */, kernel_zero_point,
            1.0f /* kernel scale */, kernel.data(),
            has_bias() ? bias.data() : nullptr, output_zero_point,
            output_scale, qmin(), qmax(),
            /*flags=*/0, &caches, &deconvolution_op2));

        // Smart pointer to automatically delete deconvolution_op2.
        std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op2, xnn_delete_operator);

        ASSERT_EQ(xnn_status_success,
          xnn_setup_deconvolution2d_nhwc_qu8(
            deconvolution_op2,
            batch_size(), input_height(), input_width(),
            adjustment_height(), adjustment_width(),
            input.data(), output.data(),
            nullptr /* thread pool */));

        ASSERT_EQ(xnn_status_success,
          xnn_run_operator(deconvolution_op2, nullptr /* thread pool */));

        VerifyWeightsCache(&weights_cache, old_weights_cache_size);
        VerifyQU8(output, output_ref, output_zero_point);
        xnn_release_weights_cache(&weights_cache);
      }
    }
  }

  void VerifyQU8(const std::vector<uint8_t> &output,
                 const std::vector<double> &output_ref,
                 uint8_t output_zero_point) const {
    for (size_t i = 0; i < batch_size(); i++) {
      for (size_t y = 0; y < output_height(); y++) {
        for (size_t x = 0; x < output_width(); x++) {
          for (size_t g = 0; g < groups(); g++) {
            for (size_t c = 0; c < group_output_channels(); c++) {
              ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_NEAR(
                output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                0.9)
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
            }
          }
        }
      }
    }
  }

  void TestF16() const {
    switch (weights_type()) {
      case WeightsType::Default:
        break;
      case WeightsType::FP32:
        break;
      default:
        GTEST_FAIL() << "unexpected weights type";
    }

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);

    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
    std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<float> kernel_as_float(kernel.size());
    std::vector<uint16_t> bias(groups() * group_output_channels());
    std::vector<float> bias_as_float(bias.size());
    std::vector<uint16_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::transform(kernel.cbegin(), kernel.cend(), kernel_as_float.begin(), fp16_ieee_to_fp32_value);
      std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::transform(bias.cbegin(), bias.cend(), bias_as_float.begin(), fp16_ieee_to_fp32_value);
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias_as_float[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      }
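      // FP16 reference: inputs and weights are stored as IEEE half-precision
      // bit patterns (uint16_t) and converted to fp32 for the accumulation.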
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t y = oy + padding_top() - ky * dilation_height();
              const size_t iy = y / stride_height();
              if (iy * stride_height() == y && iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t x = ox + padding_left() - kx * dilation_width();
                  const size_t ix = x / stride_width();
                  if (ix * stride_width() == x && ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) *
                            kernel_as_float[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

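      // qmin()/qmax() are specified on a 0-255 scale; below they are turned into
      // fractions of the accumulated range to derive the float clamping bounds,
      // rounded to fp16, with 0 and 255 meaning "no clamping" on that side.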
      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
      float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
      output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
      output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
      if (accumulated_range == 0.0f) {
        output_min = -std::numeric_limits<float>::infinity();
        output_max = +std::numeric_limits<float>::infinity();
      }
      if (qmin() == std::numeric_limits<uint8_t>::min()) {
        output_min = -std::numeric_limits<float>::infinity();
      }
      if (qmax() == std::numeric_limits<uint8_t>::max()) {
        output_max = +std::numeric_limits<float>::infinity();
      }

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Deconvolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t deconvolution_op = nullptr;

      xnn_caches caches = {
        .code_cache = NULL,
        .weights_cache = NULL,
      };
      xnn_weights_cache weights_cache;
      if (use_weights_cache()) {
        xnn_init_weights_cache(&weights_cache);
        caches.weights_cache = &weights_cache;
      }

      const void* kernel_data = kernel.data();
      const void* bias_data = bias.data();
      if (weights_type() == WeightsType::FP32) {
        kernel_data = kernel_as_float.data();
        bias_data = bias_as_float.data();
      }
      uint32_t flags = 0;
      if (weights_type() == WeightsType::FP32) {
        flags |= XNN_FLAG_FP32_STATIC_WEIGHTS;
      }
      const xnn_status status = xnn_create_deconvolution2d_nhwc_f16(
        padding_top(), padding_right(), padding_bottom(), padding_left(),
        kernel_height(), kernel_width(), stride_height(), stride_width(),
        dilation_height(), dilation_width(), groups(),
        group_input_channels(), group_output_channels(),
        input_pixel_stride(), output_pixel_stride(),
        kernel_data, has_bias() ? bias_data : nullptr,
        output_min, output_max,
        flags, &caches, &deconvolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, deconvolution_op);
      if (use_weights_cache()) {
        ASSERT_EQ(xnn_status_success,
                  xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
      }

      // Smart pointer to automatically delete deconvolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_deconvolution2d_nhwc_f16(
          deconvolution_op,
          batch_size(), input_height(), input_width(),
          adjustment_height(), adjustment_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(deconvolution_op, nullptr /* thread pool */));

      VerifyF16(output, output_ref, output_max, output_min);

      if (use_weights_cache()) {
        xnn_operator_t deconvolution_op2 = nullptr;
        size_t old_weights_cache_size = weights_cache.cache.weights.size;

        ASSERT_EQ(xnn_status_success,
          xnn_create_deconvolution2d_nhwc_f16(
            padding_top(), padding_right(), padding_bottom(), padding_left(),
            kernel_height(), kernel_width(), stride_height(), stride_width(),
            dilation_height(), dilation_width(), groups(),
            group_input_channels(), group_output_channels(),
            input_pixel_stride(), output_pixel_stride(),
            kernel_data, has_bias() ? bias_data : nullptr,
            output_min, output_max,
            flags, &caches, &deconvolution_op2));
        ASSERT_NE(nullptr, deconvolution_op2);

        // Smart pointer to automatically delete deconvolution_op2.
        std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op2, xnn_delete_operator);
        std::vector<uint16_t> output2(output.size(), UINT16_C(0x7E00) /* NaN */);

        ASSERT_EQ(xnn_status_success,
          xnn_setup_deconvolution2d_nhwc_f16(
            deconvolution_op2,
            batch_size(), input_height(), input_width(),
            adjustment_height(), adjustment_width(),
            input.data(), output2.data(),
            nullptr /* thread pool */));

        ASSERT_EQ(xnn_status_success,
          xnn_run_operator(deconvolution_op2, nullptr /* thread pool */));

        VerifyWeightsCache(&weights_cache, old_weights_cache_size);
        VerifyF16(output2, output_ref, output_max, output_min);
        xnn_release_weights_cache(&weights_cache);
      }
    }
  }

  void VerifyF16(const std::vector<uint16_t> &output,
                 const std::vector<float> &output_ref,
                 float output_max,
                 float output_min) const {
    for (size_t i = 0; i < batch_size(); i++) {
      for (size_t y = 0; y < output_height(); y++) {
        for (size_t x = 0; x < output_width(); x++) {
          for (size_t g = 0; g < groups(); g++) {
            for (size_t c = 0; c < group_output_channels(); c++) {
              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_min)
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_max)
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_NEAR(
                fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]),
                output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                1.0e-2f * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
            }
          }
        }
      }
    }
  }

  void TestF32() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);

    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
    std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<float> bias(groups() * group_output_channels());
    std::vector<float> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results, without clamping.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      }
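      // Reference transposed convolution in fp32, accumulating directly into
      // output_ref (which already holds the bias, or zero).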
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t y = oy + padding_top() - ky * dilation_height();
              const size_t iy = y / stride_height();
              if (iy * stride_height() == y && iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t x = ox + padding_left() - kx * dilation_width();
                  const size_t ix = x / stride_width();
                  if (ix * stride_width() == x && ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
                            kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());

      const float output_min = qmin() == 0 ? -std::numeric_limits<float>::infinity() :
        accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float output_max = qmax() == 255 ? std::numeric_limits<float>::infinity() :
        accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Deconvolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t deconvolution_op = nullptr;

      xnn_caches caches = {
        .code_cache = NULL,
        .weights_cache = NULL,
      };
      xnn_weights_cache weights_cache;
      if (use_weights_cache()) {
        xnn_init_weights_cache(&weights_cache);
        caches.weights_cache = &weights_cache;
      }

      ASSERT_EQ(
        xnn_status_success,
        xnn_create_deconvolution2d_nhwc_f32(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          kernel_height(), kernel_width(), stride_height(), stride_width(),
          dilation_height(), dilation_width(), groups(),
          group_input_channels(), group_output_channels(),
          input_pixel_stride(), output_pixel_stride(), kernel.data(),
          has_bias() ? bias.data() : nullptr, output_min, output_max,
          /*flags=*/0, &caches, &deconvolution_op));
      if (use_weights_cache()) {
        ASSERT_EQ(xnn_status_success,
                  xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
      }

      // Smart pointer to automatically delete deconvolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_deconvolution2d_nhwc_f32(
          deconvolution_op,
          batch_size(), input_height(), input_width(),
          adjustment_height(), adjustment_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(deconvolution_op, nullptr /* thread pool */));

      VerifyF32(output, output_ref, output_max, output_min);

      if (use_weights_cache()) {
        xnn_operator_t deconvolution_op2 = nullptr;
        size_t old_weights_cache_size = weights_cache.cache.weights.size;

        ASSERT_EQ(
          xnn_status_success,
          xnn_create_deconvolution2d_nhwc_f32(
            padding_top(), padding_right(), padding_bottom(), padding_left(),
            kernel_height(), kernel_width(), stride_height(), stride_width(),
            dilation_height(), dilation_width(), groups(),
            group_input_channels(), group_output_channels(),
            input_pixel_stride(), output_pixel_stride(), kernel.data(),
            has_bias() ? bias.data() : nullptr, output_min, output_max,
            /*flags=*/0, &caches, &deconvolution_op2));

        // Smart pointer to automatically delete deconvolution_op2.
        std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op2, xnn_delete_operator);
        std::vector<float> output2(output.size(), nanf(""));

        ASSERT_EQ(xnn_status_success,
          xnn_setup_deconvolution2d_nhwc_f32(
            deconvolution_op2,
            batch_size(), input_height(), input_width(),
            adjustment_height(), adjustment_width(),
            input.data(), output2.data(),
            nullptr /* thread pool */));

        ASSERT_EQ(xnn_status_success,
          xnn_run_operator(deconvolution_op2, nullptr /* thread pool */));

        VerifyWeightsCache(&weights_cache, old_weights_cache_size);
        VerifyF32(output2, output_ref, output_max, output_min);
        xnn_release_weights_cache(&weights_cache);
      }
    }
  }

  // A variation of TestF32 that stresses the weights cache. All the operator
  // creation needs to happen before finalization and setup.
  void StressWeightsCacheTestF32() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);

    xnn_caches caches = {
      .code_cache = NULL,
      .weights_cache = NULL,
    };
    xnn_weights_cache weights_cache;
    xnn_init_weights_cache(&weights_cache);
    caches.weights_cache = &weights_cache;
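    // Record the initial cache location and size so that, after creating many
    // operators with distinct random weights, the test can assert the cache
    // grew and was reallocated (see the checks at the end of this function).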
1250 void* old_weights_cache_start = weights_cache.cache.weights.start;
1251 size_t old_weights_cache_size = weights_cache.cache.weights.size;
1252
1253 std::vector<xnn_operator_t> operators;
1254 operators.reserve(iterations());
1255 std::vector<std::vector<float>> inputs;
1256 inputs.reserve(iterations());
1257 std::vector<std::vector<float>> outputs;
1258 outputs.reserve(iterations());
1259 std::vector<std::vector<float>> output_refs;
1260 output_refs.reserve(iterations());
1261 std::vector<float> output_mins;
1262 output_mins.reserve(iterations());
1263 std::vector<float> output_maxs;
1264 output_maxs.reserve(iterations());
1265
1266 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1267 std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
1268 (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels());
1269 std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1270 std::vector<float> bias(groups() * group_output_channels());
1271 std::vector<float> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels());
1272 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1273
1274 std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
1275 std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
1276 std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
1277 std::fill(output.begin(), output.end(), nanf(""));
1278
1279 // Compute reference results, without clamping.
1280 if (has_bias()) {
1281 for (size_t i = 0; i < batch_size(); i++) {
1282 for (size_t oy = 0; oy < output_height(); oy++) {
1283 for (size_t ox = 0; ox < output_width(); ox++) {
1284 for (size_t g = 0; g < groups(); g++) {
1285 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1286 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1287 bias[g * group_output_channels() + oc];
1288 }
1289 }
1290 }
1291 }
1292 }
1293 } else {
1294 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1295 }
1296 for (size_t i = 0; i < batch_size(); i++) {
1297 for (size_t oy = 0; oy < output_height(); oy++) {
1298 for (size_t ox = 0; ox < output_width(); ox++) {
1299 for (size_t ky = 0; ky < kernel_height(); ky++) {
1300 const size_t y = oy + padding_top() - ky * dilation_height();
1301 const size_t iy = y / stride_height();
1302 if (iy * stride_height() == y && iy < input_height()) {
1303 for (size_t kx = 0; kx < kernel_width(); kx++) {
1304 const size_t x = ox + padding_left() - kx * dilation_width();
1305 const size_t ix = x / stride_width();
1306 if (ix * stride_width() == x && ix < input_width()) {
1307 for (size_t g = 0; g < groups(); g++) {
1308 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1309 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1310 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1311 input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
1312 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1313 }
1314 }
1315 }
1316 }
1317 }
1318 }
1319 }
1320 }
1321 }
1322 }
1323
1324 // Compute clamping parameters.
1325 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1326 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1327
1328 const float output_min = qmin() == 0 ? -std::numeric_limits<float>::infinity() :
1329 accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
1330 const float output_max = qmax() == 255 ? std::numeric_limits<float>::infinity() :
1331 accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
1332 output_mins.push_back(output_min);
1333 output_maxs.push_back(output_max);
1334
1335 // Clamp reference results.
1336 for (float& value : output_ref) {
1337 value = std::max(std::min(value, output_max), output_min);
1338 }
1339
1340 // Create, setup, run, and destroy Deconvolution operator.
1341 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1342 xnn_operator_t deconvolution_op = nullptr;
1343
1344 ASSERT_EQ(
1345 xnn_status_success,
1346 xnn_create_deconvolution2d_nhwc_f32(
1347 padding_top(), padding_right(), padding_bottom(), padding_left(),
1348 kernel_height(), kernel_width(), stride_height(), stride_width(),
1349 dilation_height(), dilation_width(), groups(),
1350 group_input_channels(), group_output_channels(),
1351 input_pixel_stride(), output_pixel_stride(), kernel.data(),
1352 has_bias() ? bias.data() : nullptr, output_min, output_max,
1353 /*flags=*/0, &caches, &deconvolution_op));
1354
1355 operators.push_back(std::move(deconvolution_op));
1356 inputs.push_back(std::move(input));
1357 outputs.push_back(std::move(output));
1358 output_refs.push_back(std::move(output_ref));
1359 }
1360
1361 ASSERT_EQ(xnn_status_success,
1362 xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
1363
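// Run every operator created above; they all share the weights cache that was
// just finalized.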
1364 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1365 xnn_operator_t deconvolution_op = operators[iteration];
1366
1367 ASSERT_EQ(xnn_status_success,
1368 xnn_setup_deconvolution2d_nhwc_f32(
1369 deconvolution_op,
1370 batch_size(), input_height(), input_width(),
1371 adjustment_height(), adjustment_width(),
1372 inputs[iteration].data(), outputs[iteration].data(),
1373 nullptr /* thread pool */));
1374
1375 ASSERT_EQ(xnn_status_success,
1376 xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1377
1378 VerifyF32(outputs[iteration],
1379 output_refs[iteration],
1380 output_maxs[iteration],
1381 output_mins[iteration]);
1382 xnn_delete_operator(deconvolution_op);
1383 }
1384
1385     // Check that the weights cache grew and moved. If these assertions fail, we
1386     // might have to increase the number of test iterations.
1387 ASSERT_NE(old_weights_cache_start, weights_cache.cache.weights.start);
1388 ASSERT_LT(old_weights_cache_size, weights_cache.cache.weights.size);
1389     // Since the weights are randomized, cache hits are very unlikely.
1390 ASSERT_EQ(iterations(), weights_cache.cache.misses);
1391 ASSERT_EQ(0, weights_cache.cache.hits);
1392 ASSERT_EQ(iterations(), weights_cache.cache.num_entries);
1393 xnn_release_weights_cache(&weights_cache);
1394 }
1395
1396   void VerifyF32(const std::vector<float> &output,
1397 const std::vector<float> &output_ref,
1398 float output_max,
1399 float output_min) const {
1400 for (size_t i = 0; i < batch_size(); i++) {
1401 for (size_t y = 0; y < output_height(); y++) {
1402 for (size_t x = 0; x < output_width(); x++) {
1403 for (size_t g = 0; g < groups(); g++) {
1404 for (size_t c = 0; c < group_output_channels(); c++) {
1405 ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
1406 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1407 ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
1408 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1409 ASSERT_NEAR(
1410 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1411 output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
1412 1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
1413 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1414 }
1415 }
1416 }
1417 }
1418 }
1419 }
1420
1421   void TestSetupQS8() const {
1422 ASSERT_EQ(weights_type(), WeightsType::Default);
1423
1424 std::random_device random_device;
1425 auto rng = std::mt19937(random_device());
1426 std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
1427 std::uniform_int_distribution<int32_t> i8dist(
1428 std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
1429 std::uniform_int_distribution<int32_t> w8dist(
1430 -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());
1431
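// Buffers are sized for the larger of the first-run and second-run ("next")
// shapes, so the same allocations can be reused for both runs.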
1432 std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max(
1433 (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels(),
1434 (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
1435 std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1436 std::vector<int32_t> bias(groups() * group_output_channels());
1437 std::vector<int8_t> output(std::max(
1438 (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels(),
1439 (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
1440 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1441 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1442 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1443 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1444
1445 const int8_t input_zero_point = 127;
1446
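// Each iteration creates the operator once, runs it with the initial shape, then
// re-does setup with the "next" shape and verifies the results of both runs.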
1447 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1448 std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
1449 std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
1450 std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
1451 std::fill(output.begin(), output.end(), INT8_C(0xA5));
1452
1453 // Compute reference results, without renormalization.
1454 if (has_bias()) {
1455 for (size_t i = 0; i < batch_size(); i++) {
1456 for (size_t oy = 0; oy < output_height(); oy++) {
1457 for (size_t ox = 0; ox < output_width(); ox++) {
1458 for (size_t g = 0; g < groups(); g++) {
1459 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1460 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1461 bias[g * group_output_channels() + oc];
1462 }
1463 }
1464 }
1465 }
1466 }
1467 } else {
1468 std::fill(accumulators.begin(), accumulators.end(), 0);
1469 }
1470 for (size_t i = 0; i < batch_size(); i++) {
1471 for (size_t oy = 0; oy < output_height(); oy++) {
1472 for (size_t ox = 0; ox < output_width(); ox++) {
1473 for (size_t ky = 0; ky < kernel_height(); ky++) {
1474 const size_t y = oy + padding_top() - ky * dilation_height();
1475 const size_t iy = y / stride_height();
1476 if (iy * stride_height() == y && iy < input_height()) {
1477 for (size_t kx = 0; kx < kernel_width(); kx++) {
1478 const size_t x = ox + padding_left() - kx * dilation_width();
1479 const size_t ix = x / stride_width();
1480 if (ix * stride_width() == x && ix < input_width()) {
1481 for (size_t g = 0; g < groups(); g++) {
1482 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1483 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1484 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1485 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1486 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1487 }
1488 }
1489 }
1490 }
1491 }
1492 }
1493 }
1494 }
1495 }
1496 }
1497
1498 // Compute renormalization parameters.
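// The output scale maps the full accumulator range onto the 255 representable
// steps of int8; the zero point shifts the midpoint of that range to roughly
// zero, clamped to the int8 range.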
1499 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1500 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1501
1502 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1503 const int8_t output_zero_point = int8_t(std::max(std::min(
1504 lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1505 long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
1506
1507 // Renormalize reference results.
1508 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1509 [this, output_scale, output_zero_point](int32_t x) -> double {
1510 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
1511 });
1512
1513 // Create, setup, and run Deconvolution operator once.
1514 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1515 xnn_operator_t deconvolution_op = nullptr;
1516
1517 ASSERT_EQ(xnn_status_success,
1518 xnn_create_deconvolution2d_nhwc_qs8(
1519 padding_top(), padding_right(), padding_bottom(), padding_left(),
1520 kernel_height(), kernel_width(),
1521 stride_height(), stride_width(),
1522 dilation_height(), dilation_width(),
1523 groups(), group_input_channels(), group_output_channels(),
1524 input_pixel_stride(), output_pixel_stride(),
1525 input_zero_point, 1.0f /* input scale */,
1526 1.0f /* kernel scale */,
1527 kernel.data(), has_bias() ? bias.data() : nullptr,
1528 output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
1529 0, NULL, &deconvolution_op));
1530
1531 // Smart pointer to automatically delete deconvolution_op.
1532 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
1533
1534 ASSERT_EQ(xnn_status_success,
1535 xnn_setup_deconvolution2d_nhwc_qs8(
1536 deconvolution_op,
1537 batch_size(), input_height(), input_width(),
1538 adjustment_height(), adjustment_width(),
1539 input.data(), output.data(),
1540 nullptr /* thread pool */));
1541
1542 ASSERT_EQ(xnn_status_success,
1543 xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1544
1545 // Verify results of the first run.
1546 for (size_t i = 0; i < batch_size(); i++) {
1547 for (size_t y = 0; y < output_height(); y++) {
1548 for (size_t x = 0; x < output_width(); x++) {
1549 for (size_t g = 0; g < groups(); g++) {
1550 for (size_t c = 0; c < group_output_channels(); c++) {
1551 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1552 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1553 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1554 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1555 ASSERT_NEAR(
1556 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1557 double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1558 0.9)
1559 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1560 }
1561 }
1562 }
1563 }
1564 }
1565
1566 // Re-generate data for the second run.
1567 std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
1568 std::fill(output.begin(), output.end(), INT8_C(0xA5));
1569
1570 // Compute reference results for the second run, including renormalization.
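// The output scale and zero point computed for the first run are reused, since
// the operator was created with those quantization parameters.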
1571 if (has_bias()) {
1572 for (size_t i = 0; i < next_batch_size(); i++) {
1573 for (size_t oy = 0; oy < next_output_height(); oy++) {
1574 for (size_t ox = 0; ox < next_output_width(); ox++) {
1575 for (size_t g = 0; g < groups(); g++) {
1576 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1577 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1578 bias[g * group_output_channels() + oc];
1579 }
1580 }
1581 }
1582 }
1583 }
1584 } else {
1585 std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
1586 }
1587 for (size_t i = 0; i < next_batch_size(); i++) {
1588 for (size_t oy = 0; oy < next_output_height(); oy++) {
1589 for (size_t ox = 0; ox < next_output_width(); ox++) {
1590 for (size_t ky = 0; ky < kernel_height(); ky++) {
1591 const size_t y = oy + padding_top() - ky * dilation_height();
1592 const size_t iy = y / stride_height();
1593 if (iy * stride_height() == y && iy < next_input_height()) {
1594 for (size_t kx = 0; kx < kernel_width(); kx++) {
1595 const size_t x = ox + padding_left() - kx * dilation_width();
1596 const size_t ix = x / stride_width();
1597 if (ix * stride_width() == x && ix < next_input_width()) {
1598 for (size_t g = 0; g < groups(); g++) {
1599 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1600 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1601 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1602 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1603 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1604 }
1605 }
1606 }
1607 }
1608 }
1609 }
1610 }
1611 }
1612 }
1613 }
1614 std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
1615 [this, output_scale, output_zero_point](int32_t x) -> double {
1616 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
1617 });
1618
1619 // Setup and run Deconvolution operator the second time, and destroy the operator.
1620 ASSERT_EQ(xnn_status_success,
1621 xnn_setup_deconvolution2d_nhwc_qs8(
1622 deconvolution_op,
1623 next_batch_size(), next_input_height(), next_input_width(),
1624 adjustment_height(), adjustment_width(),
1625 input.data(), output.data(),
1626 nullptr /* thread pool */));
1627
1628 ASSERT_EQ(xnn_status_success,
1629 xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1630
1631 // Verify results of the second run.
1632 for (size_t i = 0; i < next_batch_size(); i++) {
1633 for (size_t y = 0; y < next_output_height(); y++) {
1634 for (size_t x = 0; x < next_output_width(); x++) {
1635 for (size_t g = 0; g < groups(); g++) {
1636 for (size_t c = 0; c < group_output_channels(); c++) {
1637 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1638 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1639 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1640 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1641 ASSERT_NEAR(
1642 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
1643 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1644 0.9)
1645 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1646 }
1647 }
1648 }
1649 }
1650 }
1651 }
1652 }
1653
1654   void TestSetupQU8() const {
1655 ASSERT_EQ(weights_type(), WeightsType::Default);
1656
1657 std::random_device random_device;
1658 auto rng = std::mt19937(random_device());
1659 std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
1660 std::uniform_int_distribution<int32_t> u8dist(
1661 std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
1662
1663 std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max(
1664 (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels(),
1665 (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
1666 std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1667 std::vector<int32_t> bias(groups() * group_output_channels());
1668 std::vector<uint8_t> output(std::max(
1669 (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels(),
1670 (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
1671 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1672 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1673 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1674 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1675
1676 const uint8_t input_zero_point = 127;
1677 const uint8_t kernel_zero_point = 127;
1678
1679 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1680 std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
1681 std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); });
1682 std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
1683 std::fill(output.begin(), output.end(), UINT8_C(0xA5));
1684
1685 // Compute reference results, without renormalization.
1686 if (has_bias()) {
1687 for (size_t i = 0; i < batch_size(); i++) {
1688 for (size_t oy = 0; oy < output_height(); oy++) {
1689 for (size_t ox = 0; ox < output_width(); ox++) {
1690 for (size_t g = 0; g < groups(); g++) {
1691 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1692 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1693 bias[g * group_output_channels() + oc];
1694 }
1695 }
1696 }
1697 }
1698 }
1699 } else {
1700 std::fill(accumulators.begin(), accumulators.end(), 0);
1701 }
1702 for (size_t i = 0; i < batch_size(); i++) {
1703 for (size_t oy = 0; oy < output_height(); oy++) {
1704 for (size_t ox = 0; ox < output_width(); ox++) {
1705 for (size_t ky = 0; ky < kernel_height(); ky++) {
1706 const size_t y = oy + padding_top() - ky * dilation_height();
1707 const size_t iy = y / stride_height();
1708 if (iy * stride_height() == y && iy < input_height()) {
1709 for (size_t kx = 0; kx < kernel_width(); kx++) {
1710 const size_t x = ox + padding_left() - kx * dilation_width();
1711 const size_t ix = x / stride_width();
1712 if (ix * stride_width() == x && ix < input_width()) {
1713 for (size_t g = 0; g < groups(); g++) {
1714 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1715 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1716 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1717 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1718 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
1719 }
1720 }
1721 }
1722 }
1723 }
1724 }
1725 }
1726 }
1727 }
1728 }
1729
1730 // Compute renormalization parameters.
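// As in the QS8 path, the output scale maps the accumulator range onto 255
// quantization steps; the zero point places the midpoint of that range near the
// center of the uint8 range, clamped to [0, 255].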
1731 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1732 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1733
1734 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1735 const uint8_t output_zero_point = uint8_t(std::max(std::min(
1736 lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1737 long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
1738
1739 // Renormalize reference results.
1740 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1741 [this, output_scale, output_zero_point](int32_t x) -> double {
1742 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
1743 });
1744
1745 // Create, setup, and run Deconvolution operator once.
1746 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1747 xnn_operator_t deconvolution_op = nullptr;
1748
1749 ASSERT_EQ(xnn_status_success,
1750 xnn_create_deconvolution2d_nhwc_qu8(
1751 padding_top(), padding_right(), padding_bottom(), padding_left(),
1752 kernel_height(), kernel_width(),
1753 stride_height(), stride_width(),
1754 dilation_height(), dilation_width(),
1755 groups(), group_input_channels(), group_output_channels(),
1756 input_pixel_stride(), output_pixel_stride(),
1757 input_zero_point, 1.0f /* input scale */,
1758 kernel_zero_point, 1.0f /* kernel scale */,
1759 kernel.data(), has_bias() ? bias.data() : nullptr,
1760 output_zero_point, output_scale, qmin(), qmax(),
1761 0, NULL, &deconvolution_op));
1762
1763 // Smart pointer to automatically delete deconvolution_op.
1764 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
1765
1766 ASSERT_EQ(xnn_status_success,
1767 xnn_setup_deconvolution2d_nhwc_qu8(
1768 deconvolution_op,
1769 batch_size(), input_height(), input_width(),
1770 adjustment_height(), adjustment_width(),
1771 input.data(), output.data(),
1772 nullptr /* thread pool */));
1773
1774 ASSERT_EQ(xnn_status_success,
1775 xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1776
1777 // Verify results of the first run.
1778 for (size_t i = 0; i < batch_size(); i++) {
1779 for (size_t y = 0; y < output_height(); y++) {
1780 for (size_t x = 0; x < output_width(); x++) {
1781 for (size_t g = 0; g < groups(); g++) {
1782 for (size_t c = 0; c < group_output_channels(); c++) {
1783 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
1784 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1785 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
1786 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1787 ASSERT_NEAR(
1788 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1789 double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1790 0.9)
1791 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1792 }
1793 }
1794 }
1795 }
1796 }
1797
1798 // Re-generate data for the second run.
1799 std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
1800 std::fill(output.begin(), output.end(), 0xA5);
1801
1802 // Compute reference results for the second run, including renormalization.
1803 if (has_bias()) {
1804 for (size_t i = 0; i < next_batch_size(); i++) {
1805 for (size_t oy = 0; oy < next_output_height(); oy++) {
1806 for (size_t ox = 0; ox < next_output_width(); ox++) {
1807 for (size_t g = 0; g < groups(); g++) {
1808 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1809 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1810 bias[g * group_output_channels() + oc];
1811 }
1812 }
1813 }
1814 }
1815 }
1816 } else {
1817 std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
1818 }
1819 for (size_t i = 0; i < next_batch_size(); i++) {
1820 for (size_t oy = 0; oy < next_output_height(); oy++) {
1821 for (size_t ox = 0; ox < next_output_width(); ox++) {
1822 for (size_t ky = 0; ky < kernel_height(); ky++) {
1823 const size_t y = oy + padding_top() - ky * dilation_height();
1824 const size_t iy = y / stride_height();
1825 if (iy * stride_height() == y && iy < next_input_height()) {
1826 for (size_t kx = 0; kx < kernel_width(); kx++) {
1827 const size_t x = ox + padding_left() - kx * dilation_width();
1828 const size_t ix = x / stride_width();
1829 if (ix * stride_width() == x && ix < next_input_width()) {
1830 for (size_t g = 0; g < groups(); g++) {
1831 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1832 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1833 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1834 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1835 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
1836 }
1837 }
1838 }
1839 }
1840 }
1841 }
1842 }
1843 }
1844 }
1845 }
1846 std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
1847 [this, output_scale, output_zero_point](int32_t x) -> double {
1848 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
1849 });
1850
1851 // Setup and run Deconvolution operator the second time, and destroy the operator.
1852 ASSERT_EQ(xnn_status_success,
1853 xnn_setup_deconvolution2d_nhwc_qu8(
1854 deconvolution_op,
1855 next_batch_size(), next_input_height(), next_input_width(),
1856 adjustment_height(), adjustment_width(),
1857 input.data(), output.data(),
1858 nullptr /* thread pool */));
1859
1860 ASSERT_EQ(xnn_status_success,
1861 xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
1862
1863 // Verify results of the second run.
1864 for (size_t i = 0; i < next_batch_size(); i++) {
1865 for (size_t y = 0; y < next_output_height(); y++) {
1866 for (size_t x = 0; x < next_output_width(); x++) {
1867 for (size_t g = 0; g < groups(); g++) {
1868 for (size_t c = 0; c < group_output_channels(); c++) {
1869 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
1870 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1871 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
1872 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1873 ASSERT_NEAR(
1874 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
1875 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1876 0.9)
1877 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1878 }
1879 }
1880 }
1881 }
1882 }
1883 }
1884 }
1885
1886   void TestSetupF16() const {
1887 ASSERT_EQ(weights_type(), WeightsType::Default);
1888
1889 std::random_device random_device;
1890 auto rng = std::mt19937(random_device());
1891 std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
1892
1893 std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max(
1894 (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels(),
1895 (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
1896 std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1897 std::vector<uint16_t> bias(groups() * group_output_channels());
1898 std::vector<uint16_t> output(std::max(
1899 (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels(),
1900 (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
1901 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1902 std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1903
1904 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1905 std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1906 std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1907 std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1908 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1909
1910 // Compute reference results, without clamping.
1911 if (has_bias()) {
1912 for (size_t i = 0; i < batch_size(); i++) {
1913 for (size_t oy = 0; oy < output_height(); oy++) {
1914 for (size_t ox = 0; ox < output_width(); ox++) {
1915 for (size_t g = 0; g < groups(); g++) {
1916 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1917 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1918 fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
1919 }
1920 }
1921 }
1922 }
1923 }
1924 } else {
1925 std::fill(output_ref.begin(), output_ref.end(), 0);
1926 }
1927 for (size_t i = 0; i < batch_size(); i++) {
1928 for (size_t oy = 0; oy < output_height(); oy++) {
1929 for (size_t ox = 0; ox < output_width(); ox++) {
1930 for (size_t ky = 0; ky < kernel_height(); ky++) {
1931 const size_t y = oy + padding_top() - ky * dilation_height();
1932 const size_t iy = y / stride_height();
1933 if (iy * stride_height() == y && iy < input_height()) {
1934 for (size_t kx = 0; kx < kernel_width(); kx++) {
1935 const size_t x = ox + padding_left() - kx * dilation_width();
1936 const size_t ix = x / stride_width();
1937 if (ix * stride_width() == x && ix < input_width()) {
1938 for (size_t g = 0; g < groups(); g++) {
1939 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1940 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1941 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1942 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) *
1943 fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1944 }
1945 }
1946 }
1947 }
1948 }
1949 }
1950 }
1951 }
1952 }
1953 }
1954
1955 // Compute clamping parameters.
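// The bounds are positioned within the accumulated range according to qmin() and
// qmax(), then rounded through fp16 so the reference clamp uses the same
// representable values that are passed to the operator. A bound is disabled
// (set to infinity) when the accumulated range is degenerate or when
// qmin()/qmax() is at its extreme.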
1956 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1957 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1958 const float accumulated_range = accumulated_max - accumulated_min;
1959 float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
1960 float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
1961 output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
1962 output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
1963 if (accumulated_range == 0.0f) {
1964 output_min = -std::numeric_limits<float>::infinity();
1965 output_max = +std::numeric_limits<float>::infinity();
1966 }
1967 if (qmin() == std::numeric_limits<uint8_t>::min()) {
1968 output_min = -std::numeric_limits<float>::infinity();
1969 }
1970 if (qmax() == std::numeric_limits<uint8_t>::max()) {
1971 output_max = +std::numeric_limits<float>::infinity();
1972 }
1973
1974 // Clamp reference results.
1975 for (float& value : output_ref) {
1976 value = std::max(std::min(value, output_max), output_min);
1977 }
1978
1979 // Create, setup, and run Deconvolution operator once.
1980 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1981 xnn_operator_t deconvolution_op = nullptr;
1982
1983 const xnn_status status = xnn_create_deconvolution2d_nhwc_f16(
1984 padding_top(), padding_right(), padding_bottom(), padding_left(),
1985 kernel_height(), kernel_width(),
1986 stride_height(), stride_width(),
1987 dilation_height(), dilation_width(),
1988 groups(), group_input_channels(), group_output_channels(),
1989 input_pixel_stride(), output_pixel_stride(),
1990 kernel.data(), has_bias() ? bias.data() : nullptr,
1991 output_min, output_max,
1992 0, NULL, &deconvolution_op);
1993 if (status == xnn_status_unsupported_hardware) {
1994 GTEST_SKIP();
1995 }
1996 ASSERT_EQ(xnn_status_success, status);
1997 ASSERT_NE(nullptr, deconvolution_op);
1998
1999 // Smart pointer to automatically delete deconvolution_op.
2000 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
2001
2002 ASSERT_EQ(xnn_status_success,
2003 xnn_setup_deconvolution2d_nhwc_f16(
2004 deconvolution_op,
2005 batch_size(), input_height(), input_width(),
2006 adjustment_height(), adjustment_width(),
2007 input.data(), output.data(),
2008 nullptr /* thread pool */));
2009
2010 ASSERT_EQ(xnn_status_success,
2011 xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
2012
2013 // Verify results of the first run.
2014 for (size_t i = 0; i < batch_size(); i++) {
2015 for (size_t y = 0; y < output_height(); y++) {
2016 for (size_t x = 0; x < output_width(); x++) {
2017 for (size_t g = 0; g < groups(); g++) {
2018 for (size_t c = 0; c < group_output_channels(); c++) {
2019 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_min)
2020 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2021 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_max)
2022 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2023 ASSERT_NEAR(
2024 fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]),
2025 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
2026 1.0e-2f * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
2027 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2028 }
2029 }
2030 }
2031 }
2032 }
2033
2034 // Re-generate data for the second run.
2035 std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
2036 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
2037
2038 // Compute reference results for the second run, including clamping.
2039 if (has_bias()) {
2040 for (size_t i = 0; i < next_batch_size(); i++) {
2041 for (size_t oy = 0; oy < next_output_height(); oy++) {
2042 for (size_t ox = 0; ox < next_output_width(); ox++) {
2043 for (size_t g = 0; g < groups(); g++) {
2044 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2045 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2046 fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
2047 }
2048 }
2049 }
2050 }
2051 }
2052 } else {
2053 std::fill(next_output_ref.begin(), next_output_ref.end(), 0);
2054 }
2055 for (size_t i = 0; i < next_batch_size(); i++) {
2056 for (size_t oy = 0; oy < next_output_height(); oy++) {
2057 for (size_t ox = 0; ox < next_output_width(); ox++) {
2058 for (size_t ky = 0; ky < kernel_height(); ky++) {
2059 const size_t y = oy + padding_top() - ky * dilation_height();
2060 const size_t iy = y / stride_height();
2061 if (iy * stride_height() == y && iy < next_input_height()) {
2062 for (size_t kx = 0; kx < kernel_width(); kx++) {
2063 const size_t x = ox + padding_left() - kx * dilation_width();
2064 const size_t ix = x / stride_width();
2065 if (ix * stride_width() == x && ix < next_input_width()) {
2066 for (size_t g = 0; g < groups(); g++) {
2067 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2068 for (size_t ic = 0; ic < group_input_channels(); ic++) {
2069 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2070 fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) *
2071 fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
2072 }
2073 }
2074 }
2075 }
2076 }
2077 }
2078 }
2079 }
2080 }
2081 }
2082 for (float& value : next_output_ref) {
2083 value = std::max(std::min(value, output_max), output_min);
2084 }
2085
2086 // Setup and run Deconvolution operator the second time, and destroy the operator.
2087 ASSERT_EQ(xnn_status_success,
2088 xnn_setup_deconvolution2d_nhwc_f16(
2089 deconvolution_op,
2090 next_batch_size(), next_input_height(), next_input_width(),
2091 adjustment_height(), adjustment_width(),
2092 input.data(), output.data(),
2093 nullptr /* thread pool */));
2094
2095 ASSERT_EQ(xnn_status_success,
2096 xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
2097
2098 // Verify results of the second run.
2099 for (size_t i = 0; i < next_batch_size(); i++) {
2100 for (size_t y = 0; y < next_output_height(); y++) {
2101 for (size_t x = 0; x < next_output_width(); x++) {
2102 for (size_t g = 0; g < groups(); g++) {
2103 for (size_t c = 0; c < group_output_channels(); c++) {
2104 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_min)
2105 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2106 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), output_max)
2107 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2108 ASSERT_NEAR(
2109 fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]),
2110 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2111 1.0e-2f * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
2112 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2113 }
2114 }
2115 }
2116 }
2117 }
2118 }
2119 }
2120
2121   void TestSetupF32() const {
2122 ASSERT_EQ(weights_type(), WeightsType::Default);
2123
2124 std::random_device random_device;
2125 auto rng = std::mt19937(random_device());
2126 std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
2127
2128 std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max(
2129 (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels(),
2130 (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
2131 std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2132 std::vector<float> bias(groups() * group_output_channels());
2133 std::vector<float> output(std::max(
2134 (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels(),
2135 (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
2136 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2137 std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2138
2139 for (size_t iteration = 0; iteration < iterations(); iteration++) {
2140 std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
2141 std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
2142 std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
2143 std::fill(output.begin(), output.end(), nanf(""));
2144
2145 // Compute reference results, without clamping.
2146 if (has_bias()) {
2147 for (size_t i = 0; i < batch_size(); i++) {
2148 for (size_t oy = 0; oy < output_height(); oy++) {
2149 for (size_t ox = 0; ox < output_width(); ox++) {
2150 for (size_t g = 0; g < groups(); g++) {
2151 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2152 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2153 bias[g * group_output_channels() + oc];
2154 }
2155 }
2156 }
2157 }
2158 }
2159 } else {
2160 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
2161 }
2162 for (size_t i = 0; i < batch_size(); i++) {
2163 for (size_t oy = 0; oy < output_height(); oy++) {
2164 for (size_t ox = 0; ox < output_width(); ox++) {
2165 for (size_t ky = 0; ky < kernel_height(); ky++) {
2166 const size_t y = oy + padding_top() - ky * dilation_height();
2167 const size_t iy = y / stride_height();
2168 if (iy * stride_height() == y && iy < input_height()) {
2169 for (size_t kx = 0; kx < kernel_width(); kx++) {
2170 const size_t x = ox + padding_left() - kx * dilation_width();
2171 const size_t ix = x / stride_width();
2172 if (ix * stride_width() == x && ix < input_width()) {
2173 for (size_t g = 0; g < groups(); g++) {
2174 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2175 for (size_t ic = 0; ic < group_input_channels(); ic++) {
2176 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2177 input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
2178 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
2179 }
2180 }
2181 }
2182 }
2183 }
2184 }
2185 }
2186 }
2187 }
2188 }
2189
2190 // Compute clamping parameters.
2191 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
2192 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
2193
2194 const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
2195 const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
2196
2197 // Clamp reference results.
2198 for (float& value : output_ref) {
2199 value = std::max(std::min(value, output_max), output_min);
2200 }
2201
2202 // Create, setup, and run Deconvolution operator once.
2203 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
2204 xnn_operator_t deconvolution_op = nullptr;
2205
2206 ASSERT_EQ(xnn_status_success,
2207 xnn_create_deconvolution2d_nhwc_f32(
2208 padding_top(), padding_right(), padding_bottom(), padding_left(),
2209 kernel_height(), kernel_width(),
2210 stride_height(), stride_width(),
2211 dilation_height(), dilation_width(),
2212 groups(), group_input_channels(), group_output_channels(),
2213 input_pixel_stride(), output_pixel_stride(),
2214 kernel.data(), has_bias() ? bias.data() : nullptr,
2215 output_min, output_max,
2216 0, NULL, &deconvolution_op));
2217
2218 // Smart pointer to automatically delete deconvolution_op.
2219 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_deconvolution_op(deconvolution_op, xnn_delete_operator);
2220
2221 ASSERT_EQ(xnn_status_success,
2222 xnn_setup_deconvolution2d_nhwc_f32(
2223 deconvolution_op,
2224 batch_size(), input_height(), input_width(),
2225 adjustment_height(), adjustment_width(),
2226 input.data(), output.data(),
2227 nullptr /* thread pool */));
2228
2229 ASSERT_EQ(xnn_status_success,
2230 xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
2231
2232 // Verify results of the first run.
2233 for (size_t i = 0; i < batch_size(); i++) {
2234 for (size_t y = 0; y < output_height(); y++) {
2235 for (size_t x = 0; x < output_width(); x++) {
2236 for (size_t g = 0; g < groups(); g++) {
2237 for (size_t c = 0; c < group_output_channels(); c++) {
2238 ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
2239 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2240 ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
2241 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2242 ASSERT_NEAR(
2243 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
2244 output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
2245 1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
2246 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2247 }
2248 }
2249 }
2250 }
2251 }
2252
2253 // Re-generate data for the second run.
2254 std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
2255 std::fill(output.begin(), output.end(), nanf(""));
2256
2257 // Compute reference results for the second run, including clamping.
2258 if (has_bias()) {
2259 for (size_t i = 0; i < next_batch_size(); i++) {
2260 for (size_t oy = 0; oy < next_output_height(); oy++) {
2261 for (size_t ox = 0; ox < next_output_width(); ox++) {
2262 for (size_t g = 0; g < groups(); g++) {
2263 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2264 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2265 bias[g * group_output_channels() + oc];
2266 }
2267 }
2268 }
2269 }
2270 }
2271 } else {
2272 std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
2273 }
2274 for (size_t i = 0; i < next_batch_size(); i++) {
2275 for (size_t oy = 0; oy < next_output_height(); oy++) {
2276 for (size_t ox = 0; ox < next_output_width(); ox++) {
2277 for (size_t ky = 0; ky < kernel_height(); ky++) {
2278 const size_t y = oy + padding_top() - ky * dilation_height();
2279 const size_t iy = y / stride_height();
2280 if (iy * stride_height() == y && iy < next_input_height()) {
2281 for (size_t kx = 0; kx < kernel_width(); kx++) {
2282 const size_t x = ox + padding_left() - kx * dilation_width();
2283 const size_t ix = x / stride_width();
2284 if (ix * stride_width() == x && ix < next_input_width()) {
2285 for (size_t g = 0; g < groups(); g++) {
2286 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2287 for (size_t ic = 0; ic < group_input_channels(); ic++) {
2288 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2289 input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
2290 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
2291 }
2292 }
2293 }
2294 }
2295 }
2296 }
2297 }
2298 }
2299 }
2300 }
2301 for (float& value : next_output_ref) {
2302 value = std::max(std::min(value, output_max), output_min);
2303 }
2304
2305 // Setup and run Deconvolution operator the second time, and destroy the operator.
2306 ASSERT_EQ(xnn_status_success,
2307 xnn_setup_deconvolution2d_nhwc_f32(
2308 deconvolution_op,
2309 next_batch_size(), next_input_height(), next_input_width(),
2310 adjustment_height(), adjustment_width(),
2311 input.data(), output.data(),
2312 nullptr /* thread pool */));
2313
2314 ASSERT_EQ(xnn_status_success,
2315 xnn_run_operator(deconvolution_op, nullptr /* thread pool */));
2316
2317 // Verify results of the second run.
2318 for (size_t i = 0; i < next_batch_size(); i++) {
2319 for (size_t y = 0; y < next_output_height(); y++) {
2320 for (size_t x = 0; x < next_output_width(); x++) {
2321 for (size_t g = 0; g < groups(); g++) {
2322 for (size_t c = 0; c < group_output_channels(); c++) {
2323 ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
2324 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2325 ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
2326 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2327 ASSERT_NEAR(
2328 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2329 output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
2330 1.0e-4 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
2331 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2332 }
2333 }
2334 }
2335 }
2336 }
2337 }
2338 }
2339
2340 private:
2341 uint32_t padding_top_{0};
2342 uint32_t padding_right_{0};
2343 uint32_t padding_bottom_{0};
2344 uint32_t padding_left_{0};
2345 size_t input_height_{1};
2346 size_t input_width_{1};
2347 uint32_t groups_{1};
2348 size_t group_input_channels_{1};
2349 size_t input_pixel_stride_{0};
2350 size_t group_output_channels_{1};
2351 size_t output_pixel_stride_{0};
2352 size_t batch_size_{1};
2353 uint32_t kernel_height_{1};
2354 uint32_t kernel_width_{1};
2355 uint32_t adjustment_height_{0};
2356 uint32_t adjustment_width_{0};
2357 uint32_t dilation_height_{1};
2358 uint32_t dilation_width_{1};
2359 uint32_t stride_height_{1};
2360 uint32_t stride_width_{1};
2361 size_t next_input_height_{0};
2362 size_t next_input_width_{0};
2363 size_t next_batch_size_{0};
2364 uint8_t qmin_{0};
2365 uint8_t qmax_{255};
2366 bool has_bias_{true};
2367 WeightsType weights_type_{WeightsType::Default};
2368 bool use_weights_cache_{false};
2369 bool stress_weights_cache_{false};
2370 size_t iterations_{1};
2371 };
2372