#ifdef USE_VULKAN_API

// @lint-ignore-every CLANGTIDY

#include <gtest/gtest.h>
#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/native/vulkan/api/api.h>
#include <c10/util/irange.h>
#include <c10/util/ArrayRef.h>

// TODO: These functions should move to a common place.

namespace {

#ifdef USE_VULKAN_FP16_INFERENCE
  constexpr float kTolerance = 1e-2;
#else
  constexpr float kTolerance = 1e-5;
#endif
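// With USE_VULKAN_FP16_INFERENCE the Vulkan backend computes in half precision,
// so comparisons against the FP32 CPU reference use a looser tolerance.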

bool checkRtol(const at::Tensor& diff, float maxTolerance) {
  if (diff.numel() == 0) {
    return true;
  }
  return diff.abs().max().item<float>() <= maxTolerance;
}

bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor>& inputs) {
  if (diff.numel() == 0) {
    return true;
  }
  float maxValue = 0.0f;

  for (const auto& tensor : inputs) {
    maxValue = fmax(tensor.abs().max().item<float>(), maxValue);
  }

  return checkRtol(diff, kTolerance * maxValue);
}

bool almostEqual(const at::Tensor& a, const at::Tensor& b) {
  return checkRtol(a - b, {a, b});
}
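// e.g., for a tensor of ones, almostEqual(a, a + 1e-7f) holds under the FP32
// tolerance: the max absolute difference (1e-7) is below kTolerance * max(|a|, |b|).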

// Element-wise check for hardshrink results: values whose magnitude is below
// the shrink threshold must be zeroed, values near the threshold are treated
// as ambiguous, and everything else must match the reference within tolerance.
bool checkHardShrink(
    const at::Tensor& ref, const at::Tensor& out, const float clamp_thresh) {
  float* ref_ptr = ref.data_ptr<float>();
  float* out_ptr = out.data_ptr<float>();
  float ref_max = ref.abs().max().item<float>();
  float out_max = out.abs().max().item<float>();
  float max_val = std::fmax(ref_max, out_max);

  float abs_clamp_thresh = std::abs(clamp_thresh);

  for (int i = 0; i < ref.numel(); ++i) {
    float ref_val = ref_ptr[i];
    float out_val = out_ptr[i];

    float abs_diff = std::abs(ref_val - out_val);

    // For values near the clamp threshold, results may be ambiguous.
    float distance_from_thresh = std::abs(std::abs(ref_val) - abs_clamp_thresh);
    if (distance_from_thresh < kTolerance * abs_clamp_thresh) {
      if (out_val != 0.0f) {
        if (abs_diff >= kTolerance * max_val) {
          return false;
        }
      }
    }
    else if (std::abs(ref_val) < std::abs(abs_clamp_thresh)) {
      if (out_val != 0.0f) {
        return false;
      }
    }
    else if (abs_diff >= kTolerance * max_val) {
      return false;
    }
  }
  return true;
}

// Element-wise check for threshold results: values whose magnitude is below
// the threshold must equal `value`, values near the threshold are treated as
// ambiguous, and everything else must match the reference within tolerance.
bool checkThreshold(
    const at::Tensor& ref,
    const at::Tensor& out,
    const float clamp_thresh,
    const float value) {
  float* ref_ptr = ref.data_ptr<float>();
  float* out_ptr = out.data_ptr<float>();
  float ref_max = ref.abs().max().item<float>();
  float out_max = out.abs().max().item<float>();
  float max_val = std::fmax(ref_max, out_max);

  for (int i = 0; i < ref.numel(); ++i) {
    float ref_val = ref_ptr[i];
    float out_val = out_ptr[i];

    float abs_diff = std::abs(ref_val - out_val);
    float val_diff = std::abs(out_val - value);

    // For values near the clamp threshold, results may be ambiguous.
    float distance_from_thresh = std::abs(std::abs(ref_val) - clamp_thresh);
    if (distance_from_thresh < kTolerance * clamp_thresh) {
      if (val_diff >= kTolerance * value) {
        if (abs_diff >= kTolerance * max_val) {
          return false;
        }
      }
    }
    else if (std::abs(ref_val) < std::abs(clamp_thresh)) {
      if (val_diff >= kTolerance * value) {
        return false;
      }
    }
    else if (abs_diff >= kTolerance * max_val) {
      return false;
    }
  }
  return true;
}

void showRtol(const at::Tensor& a, const at::Tensor& b) {
  const auto diff = (a - b).abs();

  float maxValue = a.abs().max().item<float>();
  maxValue = fmax(b.abs().max().item<float>(), maxValue);

  const float maxDiff = maxValue * kTolerance;
  std::cout << "Max Diff allowed: " << maxDiff << std::endl;
  if (diff.sizes().size() == 2) {
    for (const auto y : c10::irange(diff.sizes()[0])) {
      std::cout << y << ":";
      for (const auto x : c10::irange(diff.sizes()[1])) {
        float diff_xy = diff[y][x].item<float>();
        if (diff_xy > maxDiff) {
          std::cout << std::setw(5) << x;
        }
        else {
          std::cout << std::setw(5) << " ";
        }
      }
      std::cout << std::endl;
    }
  }
}


static void gen_allpermutations(std::vector<std::vector<int64_t>>& out, std::vector<int64_t> in, unsigned i) {
  // generate all permutations of the given dims
  if (i == in.size()) {
    out.push_back(in);
  }
  else {
    for (const auto j : c10::irange(i, in.size())) {
      std::swap(in[i], in[j]);
      gen_allpermutations(out, in, i + 1);
    }
  }
}
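// e.g., starting from {0, 1, 2} this yields all 3! = 6 orderings of the dims.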

static void gen_all_subsets(
    std::vector<std::vector<int64_t>>& out,
    int64_t n,
    unsigned i,
    std::vector<int64_t> curr) {
  // generate all subsets of set {0,...,n - 1} through backtracking
  if (i == n) {
    out.push_back(curr);
  } else {
    curr.push_back(i);
    gen_all_subsets(out, n, i + 1, curr);
    curr.pop_back();
    gen_all_subsets(out, n, i + 1, curr);
  }
}
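// e.g., n = 2 yields {0, 1}, {0}, {1}, and the empty set (2^n subsets in all).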

static void slice_test(
    const std::vector<int64_t>& size,
    int64_t dim,
    std::optional<int64_t> start,
    std::optional<int64_t> end,
    int64_t step) {
  // Arrange
  const auto in_cpu = at::rand(size, at::device(at::kCPU).dtype(at::kFloat));
  const auto in_vulkan = in_cpu.vulkan();

  // Act
  const auto out_cpu = at::slice(in_cpu, dim, start, end, step);
  const auto out_vulkan = at::slice(in_vulkan, dim, start, end, step);

  // Assert
  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

static void slice_tests(const std::unordered_map<int64_t, std::vector<int64_t>>& dim2sizes) {
  for (const auto& dim2size : dim2sizes) {
    slice_test(dim2size.second, dim2size.first, 10, 30, 1);         // i.e., 4D tensor's equivalent indexing = [:,:,:,10:30:1]
    slice_test(dim2size.second, dim2size.first, 10, 30, 7);         // i.e., 4D tensor's equivalent indexing = [:,:,:,10:30:7]
    slice_test(dim2size.second, dim2size.first, 10, 50, 2);         // i.e., 4D tensor's equivalent indexing = [:,:,:,10:50:2] with end=out of range
    slice_test(dim2size.second, dim2size.first, -60, 60, 2);        // i.e., 4D tensor's equivalent indexing = [:,:,:,-60:60:2] with start/end=out of range
    slice_test(dim2size.second, dim2size.first, -30, -10, 1);       // i.e., 4D tensor's equivalent indexing = [:,:,:,-30:-10:1] with negative start/end
    slice_test(dim2size.second, dim2size.first, 0, INT64_MAX, 1);   // i.e., 4D tensor's equivalent indexing = [:,:,:,0:9223372036854775807:1] with end=INT64_MAX
    slice_test(dim2size.second, dim2size.first, -10, INT64_MAX, 1); // i.e., 4D tensor's equivalent indexing = [:,:,:,-10:9223372036854775807:1] with negative start and end=INT64_MAX
    // This triggers a SymInt assert since the [-2^63, -2^62-1] range is reserved for packed symints
    //slice_test(dim2size.second, dim2size.first, INT64_MIN, INT64_MAX, 1); // i.e., 4D tensor's equivalent indexing = [:,:,:,-9223372036854775808:9223372036854775807:1] with start=INT64_MIN and end=INT64_MAX
    slice_test(dim2size.second, dim2size.first, {}, {}, 1);         // i.e., 4D tensor's equivalent indexing = [:,:,:,::1] with empty start/end
  }
}

static void clone_test(const std::vector<int64_t>& size, std::optional<at::MemoryFormat> optional_memory_format) {
  // Arrange
  const auto in_cpu = at::rand(size, at::device(at::kCPU).dtype(at::kFloat));
  const auto in_vulkan = in_cpu.vulkan();

  // Act
  const auto out_cpu = at::clone(in_cpu, optional_memory_format);
  const auto out_vulkan = at::clone(in_vulkan, optional_memory_format);

  // Assert
  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

template <class... Inputs>
inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
  return {std::forward<Inputs>(inputs)...};
}

template <class... Args>
inline std::vector<c10::IValue> callOpByHandle(
    const c10::OperatorHandle& op,
    Args... args) {
  auto stack = makeStack(std::forward<Args>(args)...);
  c10::Dispatcher::singleton().callBoxed(op, &stack);
  return stack;
}

template <class... Args>
inline std::vector<c10::IValue> callOpByName(
    const char* func_name,
    const char* overload_name,
    Args... args) {
  const std::optional<c10::OperatorHandle> op_handle =
      c10::Dispatcher::singleton().findSchema({func_name, overload_name});
  assert(op_handle.has_value());
  return callOpByHandle(op_handle.value(), std::forward<Args>(args)...);
}
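// callOpByName looks up an operator schema by name and overload in the
// dispatcher and invokes it boxed; e.g. the conv2d tests below use it to call
// "vulkan_prepack::create_conv2d_context" and "vulkan_prepack::run_conv2d_context".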

} // namespace

namespace {

class VulkanAPITest : public ::testing::Test {
 public:
  void SetUp() {
    if (!at::is_vulkan_available()) {
      GTEST_SKIP() << "Vulkan is not available";
    }
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
    if (at::native::vulkan::api::context()->op_profiling_enabled()) {
      at::native::vulkan::api::context()->reset_querypool();
    }
#endif
  }

  void TearDown() {
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
    if (at::native::vulkan::api::context()->op_profiling_enabled()) {
      try {
        at::native::vulkan::api::context()->querypool().extract_results();
        at::native::vulkan::api::context()->querypool().print_results();
      } catch (const std::exception& e) {
        std::cout << "Could not get querypool results!"
                  << " Reason: " << e.what() << std::endl;
      }
    }
#endif
  }
};

TEST_F(VulkanAPITest, zero_size_tensor) {
  auto cpu = at::rand({1, 0, 0}, at::device(at::kCPU).dtype(at::kFloat));
  auto vk = cpu.vulkan();
  auto out_vk = vk.cpu();
  ASSERT_TRUE(at::equal(out_vk, cpu));
}

TEST_F(VulkanAPITest, zero_size_tensor_numel) {
  auto vk = at::rand({18, 0, 5}, at::device(at::kVulkan).dtype(at::kFloat));
  ASSERT_TRUE(vk.numel() == 0);
}

TEST_F(VulkanAPITest, zero_dim_tensor_1) {
  auto cpu = at::rand({}, at::device(at::kCPU).dtype(at::kFloat));
  auto vv = cpu.item<float>();

  auto vk = cpu.vulkan();
  auto out_vk = vk.cpu();
  ASSERT_TRUE(almostEqual(cpu, out_vk));

  auto vk_vv = out_vk.item<float>();
  EXPECT_NEAR(vv, vk_vv, kTolerance);
}

TEST_F(VulkanAPITest, zero_dim_tensor_2) {
  float v = 3.14f;
  auto cpu = at::zeros({}, at::device(at::kCPU).dtype(at::kFloat)) + v;
  auto vk = at::zeros({}, at::device(at::kVulkan).dtype(at::kFloat)) + v;

  ASSERT_TRUE(almostEqual(cpu, vk.cpu()));
}

TEST_F(VulkanAPITest, zero_dim_tensor_3) {
  auto vk = at::zeros({}, at::device(at::kVulkan).dtype(at::kFloat));

  ASSERT_TRUE(vk.cpu().item<float>() == 0.0f);
}

TEST_F(VulkanAPITest, local_scalar_dense) {
  float v = 8.31f;
  // Force the zero-dim tensor to a non-zero constant v.
  auto vk = at::zeros({}, at::device(at::kVulkan).dtype(at::kFloat)) + v;
  c10::Scalar scalar = at::_local_scalar_dense(vk);
  EXPECT_NEAR(v, scalar.toFloat(), kTolerance);
}

TEST_F(VulkanAPITest, copy_to_texture) {
  using namespace at::native::vulkan;
  at::Tensor test_tensors[] = {
    // 4D
    at::rand({7, 17, 134, 213}, at::TensorOptions(at::kCPU).dtype(at::kFloat)),
    // 3D
    at::rand({67, 134, 213}, at::TensorOptions(at::kCPU).dtype(at::kFloat)),
    // 2D
    at::rand({229, 213}, at::TensorOptions(at::kCPU).dtype(at::kFloat)),
    // 1D
    at::rand({1902}, at::TensorOptions(at::kCPU).dtype(at::kFloat)),
  };

  for (auto in_cpu : test_tensors) {
    at::Tensor in_vk_copied = in_cpu.vulkan();
    at::Tensor out_copied = in_vk_copied.cpu();

    const auto check_copy = almostEqual(out_copied, in_cpu);

    if (!check_copy) {
      std::cout << "Copy failed on size " << in_cpu.sizes()
                << " with dtype " << in_cpu.dtype() << std::endl;
    }

    ASSERT_TRUE(check_copy);
  }
}

void test_copy_to_texture_bool(const at::IntArrayRef input_shape) {
  using namespace at::native::vulkan;
  auto cpu = at::randint(0, 2, input_shape, at::TensorOptions(at::kCPU).dtype(at::kBool));
  auto in_vulkan = cpu.vulkan();

  auto out_vulkan = in_vulkan.cpu();
  auto check = at::equal(cpu, out_vulkan.cpu());

  if (!check) {
    std::cout << "Copy texture to bool failed on input_shape " << input_shape << std::endl;
  }
  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, copy_to_texture_bool_mul4_hw) {
  // Uses the shader: image_to_nchw_quantized_mul4 ((H * W) % 4 == 0)
  // ch % 4 != 0,  ch < 4
  test_copy_to_texture_bool({5, 1, 2, 2});
  test_copy_to_texture_bool({17, 2, 4, 2});
  test_copy_to_texture_bool({9, 3, 3, 8});

  // ch % 4 != 0, ch > 5
  test_copy_to_texture_bool({7, 17, 4, 8});
  test_copy_to_texture_bool({8, 6, 2, 4});
  test_copy_to_texture_bool({13, 31, 4, 57});

  // 3d, 2d, 1d
  test_copy_to_texture_bool({17, 31, 4});
  test_copy_to_texture_bool({64, 16});
  test_copy_to_texture_bool({8});
}

TEST_F(VulkanAPITest, copy_to_texture_bool_mul4_chw) {
  // Uses the shader: image_to_nchw_quantized_mul4 ((H * W) % 4 == 0)
  // ch % 4 == 0
  test_copy_to_texture_bool({5, 16, 2, 16});
  test_copy_to_texture_bool({8, 8, 2, 2});
  test_copy_to_texture_bool({16, 31, 4});
}

TEST_F(VulkanAPITest, copy_to_texture_bool) {
  // Uses the shader: image_to_nchw_uint ((H * W) % 4 != 0)
  test_copy_to_texture_bool({13, 1, 3, 5});
  test_copy_to_texture_bool({13, 7, 1, 5});
  test_copy_to_texture_bool({13, 8, 2, 5});
  test_copy_to_texture_bool({13, 31, 2, 57});

  test_copy_to_texture_bool({67, 19, 7});
  test_copy_to_texture_bool({229, 213});
  test_copy_to_texture_bool({1902});
}

TEST_F(VulkanAPITest, adaptive_avg_pool2d) {
  c10::InferenceMode mode;

  const auto in_cpu = at::rand({5, 7, 47, 31}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
  const auto out_cpu = at::adaptive_avg_pool2d(in_cpu, {3, 3});
  const auto out_vulkan = at::adaptive_avg_pool2d(in_cpu.vulkan(), {3, 3});

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

void test_add(const at::IntArrayRef input_shape, const at::IntArrayRef other_shape, float alpha) {
  const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
  const auto other_cpu = at::rand(other_shape, at::device(at::kCPU).dtype(at::kFloat));

  const auto in_vulkan = in_cpu.vulkan();
  const auto other_vulkan = other_cpu.vulkan();

  const auto out_cpu = at::add(in_cpu, other_cpu, alpha);
  const auto out_vulkan = at::add(in_vulkan, other_vulkan, alpha);

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_invalid_inputs) {
  // Incompatible dimensions for broadcasting for binary elementwise op
  auto in_cpu = at::rand({2, 3, 4, 5}, at::device(at::kCPU).dtype(at::kFloat));
  auto other_cpu = at::rand({2, 4, 4, 5}, at::device(at::kCPU).dtype(at::kFloat));

  EXPECT_THROW(at::add(in_cpu.vulkan(), other_cpu.vulkan(), 1.0f), ::std::exception);
}

TEST_F(VulkanAPITest, add) {
  test_add({2, 3}, {2, 3}, 1.0f);
  test_add({11, 7, 139, 109}, {11, 7, 139, 109}, 2.1f);
}

TEST_F(VulkanAPITest, add_broadcast0) {
  test_add({3, 5, 179, 221}, {3, 5, 1, 1}, 1.8f);
}

TEST_F(VulkanAPITest, add_broadcast1) {
  test_add({3, 5, 179, 221}, {3, 5, 1, 221}, 1.8f);
}

TEST_F(VulkanAPITest, add_broadcast2) {
  test_add({3, 4, 179, 221}, {4, 1, 1}, 2.5f);
}

TEST_F(VulkanAPITest, add_broadcast3) {
  test_add({3, 4, 41, 53}, {1, 1, 41, 53}, 2.5f);
}

TEST_F(VulkanAPITest, add_broadcast4) {
  test_add({3, 4, 41, 1}, {1, 41, 53}, 2.5f);
}

TEST_F(VulkanAPITest, add_broadcast5) {
  test_add({2, 1, 7, 1}, {1, 5, 1, 4}, 1.2f);
}

TEST_F(VulkanAPITest, add_broadcast6) {
  test_add({1, 15, 5, 4}, {21, 1, 5, 4}, 1.8f);
}

TEST_F(VulkanAPITest, add_zero_dim) {
  test_add({2, 6, 5, 6}, {}, 1.5f);
}

void test_add_other_cpu_int(
    const at::IntArrayRef input_shape,
    const at::IntArrayRef other_shape,
    float alpha) {
  const auto in_cpu =
      at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
  const auto other_cpu =
      (at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat)) * 100)
          .to(at::kInt);

  const auto in_vulkan = in_cpu.vulkan();

  const auto out_cpu = at::add(in_cpu, other_cpu, alpha);
  const auto out_vulkan = at::add(in_vulkan, other_cpu, alpha);

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_other_cpu_int) {
  test_add_other_cpu_int({2, 3}, {2, 3}, 1.0f);
  test_add_other_cpu_int({11, 7, 139, 109}, {11, 7, 139, 109}, 2.1f);
}

TEST_F(VulkanAPITest, add_broadcast0_other_cpu_int) {
  test_add_other_cpu_int({3, 5, 179, 221}, {3, 5, 1, 1}, 1.8f);
}

TEST_F(VulkanAPITest, add_other_cpu_unsupported_type_should_fail) {
  const auto in_cpu = at::rand({2, 2, 2}, at::device(at::kCPU).dtype(at::kFloat));

  const auto other_cpu =
    at::zeros({2, 2, 2}, at::device(at::kCPU).dtype(at::kComplexFloat));

  EXPECT_THROW(at::add(in_cpu.vulkan(), other_cpu.vulkan(), 1.0f), ::std::exception);
}

TEST_F(VulkanAPITest, add_) {
  auto a_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
  auto a_vulkan = a_cpu.vulkan();

  const auto b_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
  const auto b_vulkan = b_cpu.vulkan();

  a_cpu.add_(b_cpu, 2.1f);
  a_vulkan.add_(b_vulkan, 2.1f);

  const auto check = almostEqual(a_cpu, a_vulkan.cpu());
  if (!check) {
    showRtol(a_cpu, a_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_broadcast0_) {
  auto a_cpu = at::rand({16, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
  auto a_vulkan = a_cpu.vulkan();

  const auto b_cpu = at::rand({16, 17, 29, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto b_vulkan = b_cpu.vulkan();

  a_cpu.add_(b_cpu, 2.1f);
  a_vulkan.add_(b_vulkan, 2.1f);

  const auto check = almostEqual(a_cpu, a_vulkan.cpu());
  if (!check) {
    showRtol(a_cpu, a_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_other_cpu_int_) {
  std::vector<int64_t> input_shape{12, 17, 29, 33};
  const auto in_cpu =
      at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
  const auto other_cpu =
      (at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat)) * 100)
          .to(at::kInt);

  const auto in_vulkan = in_cpu.vulkan();

  float alpha = -8.31f;
  in_cpu.add_(other_cpu, alpha);
  in_vulkan.add_(other_cpu, alpha);

  const auto check = almostEqual(in_cpu, in_vulkan.cpu());
  if (!check) {
    showRtol(in_cpu, in_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_broadcast1_) {
  auto a_cpu = at::rand({3, 8, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
  auto a_vulkan = a_cpu.vulkan();

  const auto b_cpu = at::rand({3, 8, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto b_vulkan = b_cpu.vulkan();

  a_cpu.add_(b_cpu, 2.1f);
  a_vulkan.add_(b_vulkan, 2.1f);

  const auto check = almostEqual(a_cpu, a_vulkan.cpu());
  if (!check) {
    showRtol(a_cpu, a_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_scalar) {
  const auto a_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat));
  const auto a_vulkan = a_cpu.vulkan();

  const float b_scalar = 3.1415f;

  const auto c_cpu = at::add(a_cpu, b_scalar, 2.1f);
  const auto c_vulkan = at::add(a_vulkan, b_scalar, 2.1f);

  const auto check = almostEqual(c_cpu, c_vulkan.cpu());
  if (!check) {
    showRtol(c_cpu, c_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_scalar_) {
  auto a_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat));
  auto a_vulkan = a_cpu.vulkan();

  const float b_scalar = 3.1415f;

  a_cpu.add_(b_scalar, 2.1f);
  a_vulkan.add_(b_scalar, 2.1f);

  const auto check = almostEqual(a_cpu, a_vulkan.cpu());
  if (!check) {
    showRtol(a_cpu, a_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_scalar_wrapped) {
  if (!at::is_vulkan_available()) {
    return;
  }

  const auto a_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat));
  const auto a_vulkan = a_cpu.vulkan();

  const auto b_scalar = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));

  const auto c_cpu = at::add(a_cpu, b_scalar, 2.1f);
  const auto c_vulkan = at::add(a_vulkan, b_scalar, 2.1f);

  const auto check = almostEqual(c_cpu, c_vulkan.cpu());
  if (!check) {
    showRtol(c_cpu, c_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_scalar_wrapped_) {
  if (!at::is_vulkan_available()) {
    return;
  }

  auto a_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat));
  auto a_vulkan = a_cpu.vulkan();

  const auto b_scalar = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));

  a_cpu.add_(b_scalar, 2.1f);
  a_vulkan.add_(b_scalar, 2.1f);

  const auto check = almostEqual(a_cpu, a_vulkan.cpu());
  if (!check) {
    showRtol(a_cpu, a_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, add_to_scalar_wrapped) {
  if (!at::is_vulkan_available()) {
    return;
  }

  const auto a = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));

  const auto b_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
  const auto b_vulkan = b_cpu.vulkan();

  const auto c_cpu = at::add(a, b_cpu, 2.1f);
  const auto c_vulkan = at::add(a, b_vulkan, 2.1f);

  const auto check = almostEqual(c_cpu, c_vulkan.cpu());
  if (!check) {
    showRtol(c_cpu, c_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, addmm) {
  constexpr float alpha = 2.1f;
  constexpr float beta = 103.24;

  const auto bias_cpu = at::rand({179, 163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu = at::rand({179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu = at::rand({67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto out_cpu = at::addmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);

  const auto m1_vulkan = m1_cpu.vulkan();
  const auto out_vulkan = at::addmm(bias_cpu, m1_vulkan, m2_cpu, beta, alpha);

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, addmm_expand) {
  constexpr float alpha = 2.1f;
  constexpr float beta = 103.24;

  const auto bias_cpu = at::rand({1000}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu = at::rand({1, 1280}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu = at::rand({1280, 1000}, at::device(at::kCPU).dtype(at::kFloat));
  const auto out_cpu = at::addmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);

  const auto m1_vulkan = m1_cpu.vulkan();
  const auto out_vulkan = at::addmm(bias_cpu, m1_vulkan, m2_cpu, beta, alpha);

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, addmm_expand2) {
  constexpr float alpha = 2.1f;
  constexpr float beta = 103.24;

  const auto bias_cpu = at::rand({9}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu = at::rand({17, 6}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu = at::rand({6, 9}, at::device(at::kCPU).dtype(at::kFloat));
  const auto out_cpu = at::addmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);

  const auto m1_vulkan = m1_cpu.vulkan();
  const auto out_vulkan = at::addmm(bias_cpu, m1_vulkan, m2_cpu, beta, alpha);

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, addmm_error_bias) {
  constexpr float alpha = 2.1f;
  constexpr float beta = 103.24;

  // mismatched bias size (should be 1-dim or {17, 9})
  const auto bias_cpu = at::rand({5, 5}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu = at::rand({17, 6}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu = at::rand({6, 9}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_vulkan = m1_cpu.vulkan();
  EXPECT_THROW(at::addmm(bias_cpu, m1_vulkan, m2_cpu, beta, alpha), ::std::exception);
}

TEST_F(VulkanAPITest, avg_pool2d) {
  const auto in_cpu = at::rand({3, 19, 43, 79}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
  const auto out_cpu = at::avg_pool2d(in_cpu, {5, 3}, {1, 2}, {2, 0}, true);
  const auto out_vulkan = at::avg_pool2d(in_cpu.vulkan(), {5, 3}, {1, 2}, {2, 0}, true);

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, DISABLED_batch_norm_invalid_inputs) {
  c10::InferenceMode mode;

  // Act: Vulkan batchnorm only supports evaluation mode
  EXPECT_THROW({
    at::batch_norm(
      at::rand({3, 8, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      true,
      0.1,
      1e-05,
      false);
  }, ::std::exception);

  // Act: Vulkan batchnorm expects 4-dim input
  EXPECT_THROW({
    at::batch_norm(
      at::rand({3, 8, 5}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      false,
      0.1,
      1e-05,
      false);
  }, ::std::exception);

  // Act: Vulkan batchnorm expects 4-dim input
  EXPECT_THROW({
    at::batch_norm(
      at::rand({2, 8, 3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      false,
      0.1,
      1e-05,
      false);
  }, ::std::exception);

  // Act: Vulkan batchnorm expects channel dim to be multiple of 4
  EXPECT_THROW({
    at::batch_norm(
      at::rand({4, 7, 4, 4}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      false,
      0.1,
      1e-05,
      false);
  }, ::std::exception);

  // Act: weight tensor contains incorrect number of elements
  EXPECT_THROW({
    at::batch_norm(
      at::rand({4, 8, 4, 4}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({12}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      false,
      0.1,
      1e-05,
      false);
  }, ::std::exception);

  // Act: bias tensor contains incorrect number of elements
  EXPECT_THROW({
    at::batch_norm(
      at::rand({4, 8, 4, 4}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({12}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      false,
      0.1,
      1e-05,
      false);
  }, ::std::exception);

  // Act: running mean tensor contains incorrect number of elements
  EXPECT_THROW({
    at::batch_norm(
      at::rand({4, 8, 4, 4}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({12}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      false,
      0.1,
      1e-05,
      false);
  }, ::std::exception);

  // Act: running var tensor contains incorrect number of elements
  EXPECT_THROW({
    at::batch_norm(
      at::rand({4, 8, 4, 4}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      at::rand({12}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
      false,
      0.1,
      1e-05,
      false);
  }, ::std::exception);
}

TEST_F(VulkanAPITest, batch_norm_small) {
  c10::InferenceMode mode;

  const auto input_cpu = at::rand({1, 4, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto input_vulkan = input_cpu.vulkan();

  const auto weight_cpu = at::rand({4}, at::device(at::kCPU).dtype(at::kFloat));
  const auto weight_vulkan = weight_cpu.vulkan();

  const auto bias_cpu = at::rand({4}, at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_vulkan = bias_cpu.vulkan();

  const auto running_mean_cpu = at::rand({4}, at::device(at::kCPU).dtype(at::kFloat));
  const auto running_mean_vulkan = running_mean_cpu.vulkan();

  const auto running_var_cpu = at::rand({4}, at::device(at::kCPU).dtype(at::kFloat));
  const auto running_var_vulkan = running_var_cpu.vulkan();

  const auto output_cpu = at::batch_norm(input_cpu, weight_cpu, bias_cpu, running_mean_cpu, running_var_cpu, false, 0.1, 1e-05, false);
  const auto output_vulkan = at::batch_norm(input_vulkan, weight_vulkan, bias_vulkan, running_mean_vulkan, running_var_vulkan, false, 0.1, 1e-05, false);

  const auto check = almostEqual(output_cpu, output_vulkan.cpu());
  if (!check) {
    showRtol(output_cpu, output_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, batch_norm_medium) {
  c10::InferenceMode mode;

  const auto input_cpu = at::rand({3, 8, 5, 7}, at::device(at::kCPU).dtype(at::kFloat));
  const auto input_vulkan = input_cpu.vulkan();

  const auto weight_cpu = at::rand({8}, at::device(at::kCPU).dtype(at::kFloat));
  const auto weight_vulkan = weight_cpu.vulkan();

  const auto bias_cpu = at::rand({8}, at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_vulkan = bias_cpu.vulkan();

  const auto running_mean_cpu = at::rand({8}, at::device(at::kCPU).dtype(at::kFloat));
  const auto running_mean_vulkan = running_mean_cpu.vulkan();

  const auto running_var_cpu = at::rand({8}, at::device(at::kCPU).dtype(at::kFloat));
  const auto running_var_vulkan = running_var_cpu.vulkan();

  const auto output_cpu = at::batch_norm(input_cpu, weight_cpu, bias_cpu, running_mean_cpu, running_var_cpu, false, 0.1, 1e-05, false);
  const auto output_vulkan = at::batch_norm(input_vulkan, weight_vulkan, bias_vulkan, running_mean_vulkan, running_var_vulkan, false, 0.1, 1e-05, false);

  const auto check = almostEqual(output_cpu, output_vulkan.cpu());
  if (!check) {
    showRtol(output_cpu, output_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, batch_norm_large) {
  c10::InferenceMode mode;


  const auto input_cpu = at::rand({11, 52, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
  const auto input_vulkan = input_cpu.vulkan();

  const auto weight_cpu = at::rand({52}, at::device(at::kCPU).dtype(at::kFloat));
  const auto weight_vulkan = weight_cpu.vulkan();

  const auto bias_cpu = at::rand({52}, at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_vulkan = bias_cpu.vulkan();

  const auto running_mean_cpu = at::rand({52}, at::device(at::kCPU).dtype(at::kFloat));
  const auto running_mean_vulkan = running_mean_cpu.vulkan();

  const auto running_var_cpu = at::rand({52}, at::device(at::kCPU).dtype(at::kFloat));
  const auto running_var_vulkan = running_var_cpu.vulkan();

  const auto output_cpu = at::batch_norm(input_cpu, weight_cpu, bias_cpu, running_mean_cpu, running_var_cpu, false, 0.1, 1e-05, false);
  const auto output_vulkan = at::batch_norm(input_vulkan, weight_vulkan, bias_vulkan, running_mean_vulkan, running_var_vulkan, false, 0.1, 1e-05, false);

  const auto check = almostEqual(output_cpu, output_vulkan.cpu());
  if (!check) {
    showRtol(output_cpu, output_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

void test_baddbmm(
    at::Tensor bias_cpu,
    at::Tensor m1_cpu,
    at::Tensor m2_cpu,
    float beta,
    float alpha) {
  const auto out_cpu = at::baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);

  const auto m1_vulkan = m1_cpu.vulkan();
  const auto out_vulkan =
      at::baddbmm(bias_cpu, m1_vulkan, m2_cpu.vulkan(), beta, alpha);

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, baddbmm) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  int batch = 9;
  int n = 10;
  int p = 41;
  int m = 13;

  const auto bias_cpu =
      at::rand({batch, n, m}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({batch, n, p}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({batch, p, m}, at::device(at::kCPU).dtype(at::kFloat));

  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_small) {
  constexpr float alpha = -1.0f;
  constexpr float beta = 2.0f;
  int batch = 3;
  int n = 3;
  int p = 5;
  int m = 4;

  const auto bias_cpu_0 =
      at::rand({1, n, m}, at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_cpu_1 =
      at::ones({1, n, m}, at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_cpu_2 =
      at::rand({1, n, m}, at::device(at::kCPU).dtype(at::kFloat)) * -1;
  const auto bias_cpu = at::cat({bias_cpu_0, bias_cpu_1, bias_cpu_2}, 0);

  const auto m1_cpu =
      at::rand({batch, n, p}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({batch, p, m}, at::device(at::kCPU).dtype(at::kFloat));

  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_one) {
  constexpr float alpha = 2.1f;
  constexpr float beta = 103.24;

  const auto bias_cpu =
      at::rand({1, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({1, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({1, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));

  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bais_error) {
  constexpr float alpha = 2.1f;
  constexpr float beta = 103.24;

  // mismatched dimensions of batch sizes.
  const auto bias_cpu =
      at::rand({200, 179, 163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_vulkan = m1_cpu.vulkan();
  EXPECT_THROW(
      at::baddbmm(bias_cpu, m1_vulkan, m2_cpu, beta, alpha), ::std::exception);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_batch) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu =
      at::rand({1, 179, 163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_height) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu =
      at::rand({150, 1, 163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_width) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu =
      at::rand({150, 179, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_batch_width) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu =
      at::rand({1, 179, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_batch_height) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu =
      at::rand({1, 1, 163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_one) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu =
      at::rand({1, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_reduce_batch) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu =
      at::rand({179, 163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_reduce_batch1) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu =
      at::rand({179, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_reduce_batch2) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu =
      at::rand({1, 163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_reduce_batch_height) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu = at::rand({163}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

TEST_F(VulkanAPITest, baddbmm_bias_boardcast_reduce_all) {
  constexpr float alpha = 1.5f;
  constexpr float beta = 2.0f;
  const auto bias_cpu = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_cpu =
      at::rand({150, 179, 67}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({150, 67, 163}, at::device(at::kCPU).dtype(at::kFloat));
  test_baddbmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha);
}

void test_matmul(
    at::Tensor m1_cpu,
    at::Tensor m2_cpu,
    bool m2_use_vulkan = false) {
  c10::InferenceMode mode;
  const auto out_cpu = at::matmul(m1_cpu, m2_cpu);
  auto out_vk =
      at::matmul(m1_cpu.vulkan(), m2_use_vulkan ? m2_cpu.vulkan() : m2_cpu);

  const auto check = almostEqual(out_cpu, out_vk.cpu());
  if (!check) {
    showRtol(out_cpu, out_vk.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, DISABLED_matmul_3d_weight_vulkan) {
  // This will call at::bmm. Will crash for an unknown reason.
  const auto m1_cpu =
      at::rand({13, 23, 45}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({13, 45, 26}, at::device(at::kCPU).dtype(at::kFloat));
  test_matmul(m1_cpu, m2_cpu, true);
}

TEST_F(VulkanAPITest, DISABLED_matmul_3d_weight_cpu) {
  // This will call at::bmm. Will crash for an unknown reason.
  const auto m1_cpu =
      at::rand({13, 23, 45}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({13, 45, 26}, at::device(at::kCPU).dtype(at::kFloat));
  test_matmul(m1_cpu, m2_cpu);
}

TEST_F(VulkanAPITest, matmul_2d_weight_vulkan) {
  // This will call at::mm
  const auto m1_cpu = at::rand({7, 42}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu = at::rand({42, 9}, at::device(at::kCPU).dtype(at::kFloat));
  test_matmul(m1_cpu, m2_cpu, true);
}

TEST_F(VulkanAPITest, matmul_2d_weight_cpu) {
  // This will call at::mm
  const auto m1_cpu =
      at::rand({23, 45}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({45, 26}, at::device(at::kCPU).dtype(at::kFloat));
  test_matmul(m1_cpu, m2_cpu);
}

void test_bmm(
    at::Tensor m1_cpu,
    at::Tensor m2_cpu,
    bool m2_use_vulkan = false) {
  const auto out_cpu = m1_cpu.bmm(m2_cpu);

  const auto m1_vulkan = m1_cpu.vulkan();
  const auto out_vulkan =
      m1_vulkan.bmm(m2_use_vulkan ? m2_cpu.vulkan() : m2_cpu);

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, bmm_vulkan_small) {
  const auto m1_cpu =
      at::rand({5, 2, 3}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({5, 3, 4}, at::device(at::kCPU).dtype(at::kFloat));
  test_bmm(m1_cpu, m2_cpu, true);
}

TEST_F(VulkanAPITest, bmm_vulkan_small_width) {
  const auto m1_cpu =
      at::rand({9, 32, 5}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({9, 5, 13}, at::device(at::kCPU).dtype(at::kFloat));
  test_bmm(m1_cpu, m2_cpu, true);
}

TEST_F(VulkanAPITest, bmm_vulkan_large_width) {
  const auto m1_cpu =
      at::rand({9, 7, 45}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({9, 45, 6}, at::device(at::kCPU).dtype(at::kFloat));
  test_bmm(m1_cpu, m2_cpu, true);
}

TEST_F(VulkanAPITest, bmm_cpu) {
  const auto m1_cpu =
      at::rand({13, 23, 45}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({13, 45, 26}, at::device(at::kCPU).dtype(at::kFloat));
  test_bmm(m1_cpu, m2_cpu);
}

TEST_F(VulkanAPITest, bmm_small) {
  const auto m1_cpu =
      at::rand({2, 6, 5}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({2, 5, 3}, at::device(at::kCPU).dtype(at::kFloat));
  test_bmm(m1_cpu, m2_cpu);
}

TEST_F(VulkanAPITest, bmm_one) {
  const auto m1_cpu =
      at::rand({1, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({1, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
  test_bmm(m1_cpu, m2_cpu);
}

TEST_F(VulkanAPITest, bmm_error) {
  // mismatched dimensions of batch sizes.
  const auto m1_cpu =
      at::rand({100, 235, 546}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m2_cpu =
      at::rand({200, 546, 267}, at::device(at::kCPU).dtype(at::kFloat));
  const auto m1_vulkan = m1_cpu.vulkan();
  EXPECT_THROW(m1_vulkan.bmm(m2_cpu), ::std::exception);
}

TEST_F(VulkanAPITest, clamp) {
  const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
  const auto in_vulkan = in_cpu.vulkan();

  const float min_value = 0.2f;
  const float max_value = 0.8f;

  const auto out_cpu = at::clamp(in_cpu, min_value, max_value);
  const auto out_vulkan = at::clamp(in_vulkan, min_value, max_value);

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, clamp_) {
  const auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
  const auto vulkan = cpu.vulkan();

  const float min_value = 0.2f;
  const float max_value = 0.8f;

  cpu.clamp_(min_value, max_value);
  vulkan.clamp_(min_value, max_value);

  const auto check = almostEqual(cpu, vulkan.cpu());
  if (!check) {
    showRtol(cpu, vulkan.cpu());
  }

  ASSERT_TRUE(check);
}

TEST_F(VulkanAPITest, conv1d_simple) {
  // This is a simple case using arange for input, ones for weights, and arange
  // for bias. This makes debugging easier.
  int64_t kernel_size = 3;
  int64_t channels = 5;
  int64_t lengths = 9;

  c10::InferenceMode mode;

  const auto input_cpu = at::arange(lengths * channels, at::kFloat).reshape({1, channels, lengths});
  const auto weights_cpu = at::ones({channels, 1, kernel_size}, at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_cpu = at::arange(channels, at::kFloat);

  const auto input_vk = input_cpu.vulkan();
  const auto weights_vk = weights_cpu.vulkan();
  const auto bias_vk = bias_cpu.vulkan();

  int64_t stride = 1;
  int64_t padding = 0;
  int64_t dilation = 1;

  const auto output_cpu = at::conv1d(
      input_cpu, weights_cpu, bias_cpu, stride, padding, dilation, channels);

  const auto output_vk = at::conv1d(
      input_vk, weights_vk, bias_vk, stride, padding, dilation, channels);
  const auto output_vk_cpu = output_vk.cpu();

  const bool check = almostEqual(output_cpu, output_vk_cpu);
  if (!check) {
    showRtol(output_cpu, output_vk_cpu);
  }

  ASSERT_TRUE(check);
}

test_conv1d(int64_t kernel_size,int64_t groups,int64_t lengths,int64_t stride=1,int64_t padding=0,int64_t dilation=1,int64_t in_group_size=1,int64_t out_group_size=1,int64_t batch_size=1)1416 void test_conv1d(
1417     int64_t kernel_size,
1418     int64_t groups,
1419     int64_t lengths,
1420     int64_t stride = 1,
1421     int64_t padding = 0,
1422     int64_t dilation = 1,
1423     int64_t in_group_size = 1,
1424     int64_t out_group_size = 1,
1425     int64_t batch_size = 1) {
1426   c10::InferenceMode mode;
1427 
1428   int64_t in_channels = in_group_size * groups;
1429   int64_t out_channels = out_group_size * groups;
1430 
1431   const auto input_cpu = at::rand({batch_size, in_channels, lengths}, at::kFloat);
1432   const auto weights_cpu = at::rand({out_channels, in_group_size, kernel_size}, at::kFloat);
1433   const auto bias_cpu = at::rand({out_channels,}, at::kFloat);
1434 
1435   const auto input_vk = input_cpu.vulkan();
1436   const auto weights_vk = weights_cpu.vulkan();
1437   const auto bias_vk = bias_cpu.vulkan();
1438 
1439   const auto output_cpu = at::conv1d(
1440       input_cpu, weights_cpu, bias_cpu, stride, padding, dilation, groups);
1441 
1442   const auto output_vk = at::conv1d(
1443       input_vk, weights_vk, bias_vk, stride, padding, dilation, groups);
1444   const auto output_vk_cpu = output_vk.cpu();
1445 
1446   const bool check = almostEqual(output_cpu, output_vk_cpu);
1447   if (!check) {
1448     showRtol(output_cpu, output_vk_cpu);
1449   }
1450 
1451   ASSERT_TRUE(check);
1452 }
1453 
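// Shape bookkeeping for test_conv1d (derived from the helper above): a call
// such as test_conv1d(6, 5, 30, 5, 5, 3, 4, 2, 2) builds
//   input   {batch_size, in_group_size * groups, lengths}         = {2, 20, 30}
//   weights {out_group_size * groups, in_group_size, kernel_size} = {10, 4, 6}
//   bias    {out_group_size * groups}                             = {10}
// and runs at::conv1d with stride 5, padding 5, dilation 3 and groups 5.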
1454 TEST_F(VulkanAPITest, conv1d) {
1455   test_conv1d(3, 5, 8);
1456   test_conv1d(9, 5, 9);
1457   test_conv1d(1, 12, 3);
1458   test_conv1d(1, 12, 1);
1459   test_conv1d(10, 12, 20);
1460   test_conv1d(3, 5, 9, 2, 0, 1);
1461   test_conv1d(3, 5, 9, 2, 1, 1);
1462   test_conv1d(3, 5, 9, 2, 1, 2);
1463   test_conv1d(3, 5, 9, 1, 4, 2);
1464   test_conv1d(6, 22, 30, 5, 5, 3);
1465   test_conv1d(6, 5, 30, 5, 5, 3, 3, 5);
1466   test_conv1d(6, 5, 30, 5, 5, 3, 4, 2);
1467   test_conv1d(6, 5, 30, 5, 5, 3, 4, 2, 2);
1468   test_conv1d(6, 5, 30, 5, 5, 3, 4, 2, 5);
1469   test_conv1d(6, 5, 30, 5, 5, 3, 4, 2, 9);
1470 }
1471 
1472 
1473 
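// The helpers below exercise the prepacked Vulkan path: weight and bias are
// packed once via vulkan_prepack::create_*_context (or, for the backwards-
// compatible path, conv2d_clamp_prepack) and then applied with the matching
// run op. The two trailing std::nullopt arguments are the optional
// output_min / output_max clamp bounds, left unset here.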
1474 void test_conv2d_context(
1475     const at::IntArrayRef input_shape,
1476     const at::IntArrayRef weight_shape,
1477     const at::IntArrayRef bias_shape,
1478     std::vector<int64_t> stride,
1479     std::vector<int64_t> padding,
1480     std::vector<int64_t> dilation,
1481     int64_t groups) {
1482   c10::InferenceMode mode;
1483 
1484   at::Tensor input = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
1485   at::Tensor weight = at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
1486   at::Tensor bias = at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
1487 
1488   // cpu
1489   const auto out_cpu = at::conv2d(
1490     input, weight, bias, stride, padding, dilation, groups);
1491 
1492   // vulkan
1493   const auto prepack_vulkan = callOpByName(
1494       "vulkan_prepack::create_conv2d_context",
1495       "",
1496       weight, bias, stride, padding, dilation, groups, std::nullopt, std::nullopt);
1497 
1498   const auto vulkan_output = callOpByName(
1499       "vulkan_prepack::run_conv2d_context",
1500       "",
1501       input.vulkan(), prepack_vulkan[0]);
1502 
1503   const auto out_vulkan = vulkan_output[0].toTensor();
1504   const auto out_vk_cpu = out_vulkan.cpu();
1505 
1506   // check
1507   const bool check = almostEqual(out_cpu, out_vk_cpu);
1508   if (!check) {
1509     showRtol(out_cpu, out_vk_cpu);
1510   }
1511 
1512   ASSERT_TRUE(check);
1513 }
1514 
1515 void test_backwards_compatible_conv2d_context(
1516     const at::IntArrayRef input_shape,
1517     const at::IntArrayRef weight_shape,
1518     const at::IntArrayRef bias_shape,
1519     std::vector<int64_t> stride,
1520     std::vector<int64_t> padding,
1521     std::vector<int64_t> dilation,
1522     int64_t groups) {
1523   c10::InferenceMode mode;
1524 
1525   at::Tensor input = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
1526   at::Tensor weight = at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
1527   at::Tensor bias = at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
1528 
1529   // cpu
1530   const auto out_cpu = at::conv2d(
1531     input, weight, bias, stride, padding, dilation, groups);
1532 
1533   // vulkan
1534   const auto prepack_vulkan = callOpByName(
1535       "vulkan_prepack::conv2d_clamp_prepack",
1536       "",
1537       weight, bias, stride, padding, dilation, groups, std::nullopt, std::nullopt);
1538 
1539   const auto vulkan_output = callOpByName(
1540       "vulkan_prepack::conv2d_clamp_run",
1541       "",
1542       input.vulkan(), prepack_vulkan[0]);
1543 
1544   const auto out_vulkan = vulkan_output[0].toTensor();
1545   const auto out_vk_cpu = out_vulkan.cpu();
1546 
1547   // check
1548   const bool check = almostEqual(out_cpu, out_vk_cpu);
1549   if (!check) {
1550     showRtol(out_cpu, out_vk_cpu);
1551   }
1552 
1553   ASSERT_TRUE(check);
1554 }
1555 
1556 void test_transposed_conv2d_context(
1557     const at::IntArrayRef input_shape,
1558     const at::IntArrayRef weight_shape,
1559     const at::IntArrayRef bias_shape,
1560     std::vector<int64_t> stride,
1561     std::vector<int64_t> padding,
1562     std::vector<int64_t> output_padding,
1563     std::vector<int64_t> dilation,
1564     int64_t groups) {
1565   c10::InferenceMode mode;
1566 
1567   at::Tensor input = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
1568   at::Tensor weight = at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
1569   at::Tensor bias = at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
1570 
1571   // cpu
1572   const auto out_cpu = at::conv_transpose2d(
1573     input, weight, bias, stride, padding, output_padding, groups, dilation);
1574 
1575   // vulkan
1576   const auto prepack_vulkan = callOpByName(
1577       "vulkan_prepack::create_tconv2d_context",
1578       "",
1579       weight, bias, stride, padding, output_padding, dilation, groups, std::nullopt, std::nullopt);
1580 
1581   const auto vulkan_output = callOpByName(
1582       "vulkan_prepack::run_tconv2d_context",
1583       "",
1584       input.vulkan(), prepack_vulkan[0]);
1585 
1586   const auto out_vulkan = vulkan_output[0].toTensor();
1587   const auto out_vk_cpu = out_vulkan.cpu();
1588 
1589   // check
1590   const bool check = almostEqual(out_cpu, out_vk_cpu);
1591   if (!check) {
1592     showRtol(out_cpu, out_vk_cpu);
1593   }
1594 
1595   ASSERT_TRUE(check);
1596 }
1597 
1598 TEST_F(VulkanAPITest, conv2d) {
1599   constexpr int64_t groups = 1;
1600   constexpr std::array<int64_t, 2u> stride{2, 2};
1601   constexpr std::array<int64_t, 2u> padding{1, 1};
1602   //TODO: Support conv2d with dilation != 1
1603   constexpr std::array<int64_t, 2u> dilation{1, 1};
1604 
1605   constexpr struct {
1606     uint32_t batches;
1607     uint32_t channels;
1608     uint32_t width;
1609     uint32_t height;
1610 
1611     std::array<int64_t, 4u> size() const {
1612       return {
1613         batches,
1614         channels,
1615         width,
1616         height,
1617       };
1618     }
1619   } input {1, 3, 8, 8};
1620 
1621   constexpr struct {
1622     uint32_t output_channels;
1623     uint32_t input_channels;
1624     uint32_t width;
1625     uint32_t height;
1626 
1627     std::array<int64_t, 4u> size() const {
1628       return {
1629         output_channels,
1630         input_channels,
1631         width,
1632         height,
1633       };
1634     }
1635   } weights {1, input.channels, 3, 3};
1636 
1637   const auto input_cpu = at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
1638   const auto weights_cpu = at::randn(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
1639   const auto bias_cpu = at::randn({weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
1640 
1641   const auto output_cpu = at::conv2d(
1642       input_cpu,
1643       weights_cpu,
1644       bias_cpu,
1645       stride,
1646       padding,
1647       dilation,
1648       groups);
1649 
1650   const auto output_vulkan = at::conv2d(
1651       input_cpu.vulkan(),
1652       weights_cpu,
1653       bias_cpu,
1654       stride,
1655       padding,
1656       dilation,
1657       groups).cpu();
1658 
1659   const bool check = almostEqual(output_cpu, output_vulkan);
1660   if (!check) {
1661     showRtol(output_cpu, output_vulkan);
1662   }
1663 
1664   ASSERT_TRUE(check);
1665 }
1666 
1667 TEST_F(VulkanAPITest, conv2d_prepack) {
1668   test_conv2d_context(
1669     {1, 3, 8, 8}, // input_shape
1670     {1, 3, 3, 3}, // weight_shape
1671     {1},          // bias_shape
1672     {2, 2},       // stride
1673     {1, 1},       // padding
1674     {1, 1},       // dilation
1675     1);           // groups
1676 }
1677 
1678 TEST_F(VulkanAPITest, conv2d_prepack_bc) {
1679   test_backwards_compatible_conv2d_context(
1680     {1, 3, 8, 8}, // input_shape
1681     {1, 3, 3, 3}, // weight_shape
1682     {1},          // bias_shape
1683     {2, 2},       // stride
1684     {1, 1},       // padding
1685     {1, 1},       // dilation
1686     1);           // groups
1687 }
1688 
1689 TEST_F(VulkanAPITest, conv2d_dw_3x3) {
1690   constexpr int64_t groups = 7;
1691   constexpr std::array<int64_t, 2u> stride{2, 3};
1692   constexpr std::array<int64_t, 2u> padding{0, 4};
1693   constexpr std::array<int64_t, 2u> dilation{3, 1};
1694 
1695   constexpr struct {
1696     uint32_t batches;
1697     uint32_t channels;
1698     uint32_t width;
1699     uint32_t height;
1700 
1701     std::array<int64_t, 4u> size() const {
1702       return {
1703           batches,
1704           channels,
1705           width,
1706           height,
1707       };
1708     }
1709   } input{1, groups, 137, 199};
1710 
1711   constexpr struct {
1712     uint32_t output_channels;
1713     uint32_t input_channels;
1714     uint32_t width;
1715     uint32_t height;
1716 
1717     std::array<int64_t, 4u> size() const {
1718       return {
1719           output_channels,
1720           input_channels,
1721           width,
1722           height,
1723       };
1724     }
1725   } weights{groups, 1, 3, 3};
1726 
1727   const auto input_cpu =
1728       at::rand(input.size(), at::device(at::kCPU).dtype(at::kFloat));
1729   const auto weights_cpu =
1730       at::rand(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
1731   const auto bias_cpu = at::rand(
1732       {weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
1733 
1734   const auto output_cpu = at::conv2d(
1735       input_cpu, weights_cpu, bias_cpu, stride, padding, dilation, groups);
1736 
1737   const auto output_vulkan = at::conv2d(
1738       input_cpu.vulkan(),
1739       weights_cpu,
1740       bias_cpu,
1741       stride,
1742       padding,
1743       dilation,
1744       groups);
1745 
1746   const bool check = almostEqual(output_cpu, output_vulkan.cpu());
1747   if (!check) {
1748     showRtol(output_cpu, output_vulkan.cpu());
1749   }
1750 
1751   ASSERT_TRUE(check);
1752 }
1753 
1754 TEST_F(VulkanAPITest, conv2d_dw_5x5) {
1755   constexpr int64_t groups = 7;
1756   constexpr std::array<int64_t, 2u> stride{2, 3};
1757   constexpr std::array<int64_t, 2u> padding{0, 4};
1758   constexpr std::array<int64_t, 2u> dilation{3, 1};
1759 
1760   constexpr struct {
1761     uint32_t batches;
1762     uint32_t channels;
1763     uint32_t width;
1764     uint32_t height;
1765 
1766     std::array<int64_t, 4u> size() const {
1767       return {
1768           batches,
1769           channels,
1770           width,
1771           height,
1772       };
1773     }
1774   } input{1, groups, 137, 199};
1775 
1776   constexpr struct {
1777     uint32_t output_channels;
1778     uint32_t input_channels;
1779     uint32_t width;
1780     uint32_t height;
1781 
1782     std::array<int64_t, 4u> size() const {
1783       return {
1784           output_channels,
1785           input_channels,
1786           width,
1787           height,
1788       };
1789     }
1790   } weights{groups, 1, 5, 5};
1791 
1792   const auto input_cpu =
1793       at::rand(input.size(), at::device(at::kCPU).dtype(at::kFloat));
1794   const auto weights_cpu =
1795       at::rand(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
1796   const auto bias_cpu = at::rand(
1797       {weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
1798 
1799   const auto output_cpu = at::conv2d(
1800       input_cpu, weights_cpu, bias_cpu, stride, padding, dilation, groups);
1801 
1802   const auto output_vulkan = at::conv2d(
1803       input_cpu.vulkan(),
1804       weights_cpu,
1805       bias_cpu,
1806       stride,
1807       padding,
1808       dilation,
1809       groups);
1810 
1811   const bool check = almostEqual(output_cpu, output_vulkan.cpu());
1812   if (!check) {
1813     showRtol(output_cpu, output_vulkan.cpu());
1814   }
1815 
1816   ASSERT_TRUE(check);
1817 }
1818 
1819 TEST_F(VulkanAPITest, conv2d_dw) {
1820   constexpr int64_t groups = 7;
1821   constexpr std::array<int64_t, 2u> stride{2, 3};
1822   constexpr std::array<int64_t, 2u> padding{0, 4};
1823   constexpr std::array<int64_t, 2u> dilation{3, 1};
1824 
1825   constexpr struct {
1826     uint32_t batches;
1827     uint32_t channels;
1828     uint32_t width;
1829     uint32_t height;
1830 
1831     std::array<int64_t, 4u> size() const {
1832       return {
1833         batches,
1834         channels,
1835         width,
1836         height,
1837       };
1838     }
1839   } input {1, groups, 137, 199};
1840 
1841   constexpr struct {
1842     uint32_t output_channels;
1843     uint32_t input_channels;
1844     uint32_t width;
1845     uint32_t height;
1846 
1847     std::array<int64_t, 4u> size() const {
1848       return {
1849         output_channels,
1850         input_channels,
1851         width,
1852         height,
1853       };
1854     }
1855   } weights {groups, 1, 17, 7};
1856 
1857   const auto input_cpu = at::rand(input.size(), at::device(at::kCPU).dtype(at::kFloat));
1858   const auto weights_cpu = at::rand(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
1859   const auto bias_cpu = at::rand({weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
1860 
1861   const auto output_cpu = at::conv2d(
1862       input_cpu,
1863       weights_cpu,
1864       bias_cpu,
1865       stride,
1866       padding,
1867       dilation,
1868       groups);
1869 
1870   const auto output_vulkan = at::conv2d(
1871       input_cpu.vulkan(),
1872       weights_cpu,
1873       bias_cpu,
1874       stride,
1875       padding,
1876       dilation,
1877       groups);
1878 
1879   const bool check = almostEqual(output_cpu, output_vulkan.cpu());
1880   if (!check) {
1881     showRtol(output_cpu, output_vulkan.cpu());
1882   }
1883 
1884   ASSERT_TRUE(check);
1885 }
1886 
1887 TEST_F(VulkanAPITest, conv2d_dw_prepack) {
1888   test_conv2d_context(
1889     {1, 7, 137, 199}, // input_shape
1890     {7, 1, 17, 7},    // weight_shape
1891     {7},              // bias_shape
1892     {2, 3},           // stride
1893     {0, 4},           // padding
1894     {3, 1},           // dilation
1895     7);               // groups
1896 }
1897 
1898 TEST_F(VulkanAPITest, conv2d_dw_prepack_bc) {
1899   test_backwards_compatible_conv2d_context(
1900     {1, 7, 137, 199}, // input_shape
1901     {7, 1, 17, 7},    // weight_shape
1902     {7},              // bias_shape
1903     {2, 3},           // stride
1904     {0, 4},           // padding
1905     {3, 1},           // dilation
1906     7);               // groups
1907 }
1908 
1909 TEST_F(VulkanAPITest, conv2d_pw) {
1910   constexpr int64_t groups = 1;
1911   constexpr std::array<int64_t, 2u> stride{1, 1};
1912   constexpr std::array<int64_t, 2u> padding{0, 0};
1913   constexpr std::array<int64_t, 2u> dilation{1, 1};
1914 
1915   constexpr struct {
1916     uint32_t batches;
1917     uint32_t channels;
1918     uint32_t width;
1919     uint32_t height;
1920 
1921     std::array<int64_t, 4u> size() const {
1922       return {
1923         batches,
1924         channels,
1925         width,
1926         height,
1927       };
1928     }
1929   } input {1, 17, 127, 397};
1930 
1931   constexpr struct {
1932     uint32_t output_channels;
1933     uint32_t input_channels;
1934     uint32_t width;
1935     uint32_t height;
1936 
1937     std::array<int64_t, 4u> size() const {
1938       return {
1939         output_channels,
1940         input_channels,
1941         width,
1942         height,
1943       };
1944     }
1945   } weights {29, input.channels, 1, 1};
1946 
1947   const auto input_cpu = at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
1948   const auto weights_cpu = at::randn(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
1949   const auto bias_cpu = at::randn({weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
1950 
1951   const auto output_cpu = at::conv2d(
1952       input_cpu,
1953       weights_cpu,
1954       bias_cpu,
1955       stride,
1956       padding,
1957       dilation,
1958       groups);
1959 
1960   const auto output_vulkan = at::conv2d(
1961       input_cpu.vulkan(),
1962       weights_cpu,
1963       bias_cpu,
1964       stride,
1965       padding,
1966       dilation,
1967       groups);
1968 
1969   const bool check = almostEqual(output_cpu, output_vulkan.cpu());
1970   if (!check) {
1971     showRtol(output_cpu, output_vulkan.cpu());
1972   }
1973 
1974   ASSERT_TRUE(check);
1975 }
1976 
1977 TEST_F(VulkanAPITest, conv2d_pw_prepack_medium) {
1978   int in_channels = 17;
1979   int out_channels = 29;
1980   int height = 27;
1981   int width = 39;
1982   test_conv2d_context(
1983     {1, in_channels, height, width},  // input_shape
1984     {out_channels, in_channels, 1, 1},     // weight_shape
1985     {out_channels},               // bias_shape
1986     {1, 1},             // stride
1987     {0, 0},             // padding
1988     {1, 1},             // dilation
1989     1);                 // groups
1990 }
1991 
1992 TEST_F(VulkanAPITest, conv2d_pw_prepack_bc_medium) {
1993   int in_channels = 17;
1994   int out_channels = 29;
1995   int height = 27;
1996   int width = 39;
1997   test_backwards_compatible_conv2d_context(
1998     {1, in_channels, height, width},  // input_shape
1999     {out_channels, in_channels, 1, 1},     // weight_shape
2000     {out_channels},               // bias_shape
2001     {1, 1},             // stride
2002     {0, 0},             // padding
2003     {1, 1},             // dilation
2004     1);                 // groups
2005 }
2006 
2007 // The following 2 tests failed on Meta's CI when all tests are executed: the
2008 // output contains many NaN values and the cause is unknown.
2009 // When either test is run alone (with gtest_filter), it passes.
2010 // The tests also pass with smaller planes, see "conv2d_pw_prepack_medium".
2011 TEST_F(VulkanAPITest, DISABLED_conv2d_pw_prepack) {
2012   test_conv2d_context(
2013     {1, 17, 127, 397},  // input_shape
2014     {29, 17, 1, 1},     // weight_shape
2015     {29},               // bias_shape
2016     {1, 1},             // stride
2017     {0, 0},             // padding
2018     {1, 1},             // dilation
2019     1);                 // groups
2020 }
2021 
2022 TEST_F(VulkanAPITest, DISABLED_conv2d_pw_prepack_bc) {
2023   test_backwards_compatible_conv2d_context(
2024     {1, 17, 127, 397},  // input_shape
2025     {29, 17, 1, 1},     // weight_shape
2026     {29},               // bias_shape
2027     {1, 1},             // stride
2028     {0, 0},             // padding
2029     {1, 1},             // dilation
2030     1);                 // groups
2031 }
2032 
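// For reference, conv_transpose2d produces, per spatial dimension,
//   out = (in - 1) * stride - 2 * padding + dilation * (kernel - 1) + output_padding + 1,
// so the 7x19 input below maps to a 6x40 output with 47 channels.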
2033 TEST_F(VulkanAPITest, conv2d_transposed) {
2034   // Arrange
2035   constexpr int64_t groups = 1;
2036   constexpr std::array<int64_t, 2u> stride{1, 2};
2037   constexpr std::array<int64_t, 2u> padding{1, 0};
2038   constexpr std::array<int64_t, 2u> output_padding{0, 1};
2039   //TODO: Support conv_transpose2d with dilation != 1
2040   constexpr std::array<int64_t, 2u> dilation{1, 1};
2041 
2042   constexpr struct {
2043     uint32_t batches;
2044     uint32_t channels;
2045     uint32_t height;
2046     uint32_t width;
2047 
2048     std::array<int64_t, 4u> size() const {
2049       return {
2050         batches,
2051         channels,
2052         height,
2053         width,
2054       };
2055     }
2056   } input {1, 55, 7, 19};
2057 
2058   constexpr struct {
2059     uint32_t input_channels;
2060     uint32_t output_channels;
2061     uint32_t height;
2062     uint32_t width;
2063 
2064     std::array<int64_t, 4u> size() const {
2065       return {
2066         input_channels,
2067         output_channels,
2068         height,
2069         width,
2070       };
2071     }
2072   } weights {input.channels, 47, 2, 3};
2073 
2074   const auto input_cpu = at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
2075   const auto weights_cpu = at::randn(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
2076   const auto bias_cpu = at::zeros({weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
2077 
2078   // Act
2079   const auto output_cpu = at::conv_transpose2d(
2080       input_cpu,
2081       weights_cpu,
2082       bias_cpu,
2083       stride,
2084       padding,
2085       output_padding,
2086       groups,
2087       dilation);
2088 
2089   const auto output_vk = at::conv_transpose2d(
2090       input_cpu.vulkan(),
2091       weights_cpu,
2092       bias_cpu,
2093       stride,
2094       padding,
2095       output_padding,
2096       groups,
2097       dilation).cpu();
2098 
2099   // Assert
2100   const bool check = almostEqual(output_cpu, output_vk);
2101   if (!check) {
2102     showRtol(output_cpu, output_vk);
2103   }
2104 
2105   ASSERT_TRUE(check);
2106 }
2107 
2108 TEST_F(VulkanAPITest, conv2d_transposed_prepack) {
2109   test_transposed_conv2d_context(
2110     {1, 55, 7, 19}, // input_shape
2111     {55, 47, 2, 3}, // weight_shape
2112     {47},           // bias_shape
2113     {1, 2},         // stride
2114     {1, 0},         // padding
2115     {0, 1},         // output_padding
2116     {1, 1},         // dilation
2117     1);             // groups
2118 }
2119 
2120 TEST_F(VulkanAPITest, conv2d_clamp_after_div) {
2121   c10::InferenceMode mode;
2122 
2123   constexpr std::array<int64_t, 2u> stride{2, 2};
2124   constexpr std::array<int64_t, 2u> padding{1, 1};
2125   constexpr std::array<int64_t, 2u> dilation{1, 1};
2126   constexpr int64_t groups = 1;
2127 
2128   const auto input_numerator = at::rand({1, 3, 64, 64}, at::device(at::kCPU).dtype(at::kFloat));
2129   const auto input_denominator = at::rand({3, 1, 1}, at::device(at::kCPU).dtype(at::kFloat)) + 0.01;
2130   const auto input_cpu = at::div(input_numerator, input_denominator);
2131   const auto input_vk = at::div(input_numerator.vulkan(), input_denominator.vulkan());
2132   at::Tensor weight = at::rand({24, 3, 3, 3}, at::device(at::kCPU).dtype(at::kFloat));
2133   at::Tensor bias = at::rand({24}, at::device(at::kCPU).dtype(at::kFloat));
2134 
2135   // cpu
2136   const auto prepack_cpu = callOpByName(
2137       "prepacked::conv2d_clamp_prepack",
2138       "",
2139       weight, bias, stride, padding, dilation, groups, 0.0f, std::nullopt)[0];
2140 
2141   const auto out_cpu = callOpByName(
2142       "prepacked::conv2d_clamp_run",
2143       "",
2144       input_cpu, prepack_cpu)[0].toTensor();
2145 
2146   // vulkan
2147   const auto prepack_vk = callOpByName(
2148       "vulkan_prepack::create_conv2d_context",
2149       "",
2150       weight, bias, stride, padding, dilation, groups, 0.0f, std::nullopt)[0];
2151 
2152   const auto out_vk = callOpByName(
2153       "vulkan_prepack::run_conv2d_context",
2154       "",
2155       input_vk, prepack_vk)[0].toTensor();
2156 
2157   const auto out_vk_cpu = out_vk.cpu();
2158 
2159   // check
2160   const bool check = almostEqual(out_cpu, out_vk_cpu);
2161   if (!check) {
2162     showRtol(out_cpu, out_vk_cpu);
2163   }
2164 
2165   ASSERT_TRUE(check);
2166 }
2167 
2168 TEST_F(VulkanAPITest, copy) {
2169   const auto cpu = at::rand({13, 17, 37, 19}, at::device(at::kCPU).dtype(at::kFloat));
2170   const auto vulkan = cpu.vulkan();
2171 
2172   const auto check = almostEqual(cpu, vulkan.cpu());
2173   if (!check) {
2174     showRtol(cpu, vulkan.cpu());
2175   }
2176 
2177   ASSERT_TRUE(check);
2178 }
2179 
2180 void test_cumsum(const at::IntArrayRef input_shape, const int64_t dim) {
2181   const auto in_cpu = at::rand(input_shape, at::TensorOptions(at::kCPU).dtype(at::kFloat));
2182 
2183   const auto out_cpu = at::cumsum(in_cpu, dim);
2184   const auto out_vulkan = at::cumsum(in_cpu.vulkan(), dim);
2185   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
2186   if (!check) {
2187     showRtol(out_cpu, out_vulkan.cpu());
2188   }
2189   ASSERT_TRUE(check);
2190 }
2191 
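// The loops below cover every valid dim for each rank, including negative
// indices, which wrap to dim + ndim (so dim == -1 is the last dimension).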
2192 TEST_F(VulkanAPITest, cumsum_1d) {
2193   test_cumsum({37}, 0);
2194   test_cumsum({37}, -1);
2195 }
2196 
2197 TEST_F(VulkanAPITest, cumsum_2d) {
2198   for (int64_t i = -1; i <= 1; i++) {
2199     test_cumsum({17, 37}, i);
2200   }
2201 }
2202 
2203 TEST_F(VulkanAPITest, cumsum_3d) {
2204   for (int64_t i = -2; i <= 2; i++) {
2205     test_cumsum({17, 37, 49}, i);
2206   }
2207 }
2208 
2209 TEST_F(VulkanAPITest, cumsum_4d) {
2210   for (int64_t i = -3; i <= 3; i++) {
2211     test_cumsum({12, 17, 37, 49}, i);
2212   }
2213 }
2214 
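// The divisor is offset by +0.01 so the random denominators stay away from
// zero; otherwise the CPU/Vulkan comparison could be dominated by near-zero
// divisions.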
2215 void test_div(const at::IntArrayRef input_shape, const at::IntArrayRef other_shape) {
2216   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
2217   const auto other_cpu = at::rand(other_shape, at::device(at::kCPU).dtype(at::kFloat)) + 0.01;
2218 
2219   const auto in_vulkan = in_cpu.vulkan();
2220   const auto other_vulkan = other_cpu.vulkan();
2221 
2222   const auto out_cpu = at::div(in_cpu, other_cpu);
2223   const auto out_vulkan = at::div(in_vulkan, other_vulkan);
2224 
2225   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
2226   if (!check) {
2227     showRtol(out_cpu, out_vulkan.cpu());
2228   }
2229 
2230   ASSERT_TRUE(check);
2231 }
2232 
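// Illustrative sketch only (not part of the original suite): the CPU-vs-Vulkan
// comparison pattern used by test_div above generalizes to any elementwise
// binary op. `op` is an assumed callable, e.g. a lambda wrapping at::div.
template <typename BinaryOp>
void test_binary_op(
    const at::IntArrayRef input_shape,
    const at::IntArrayRef other_shape,
    BinaryOp op) {
  const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
  // Offset keeps divisor-like operands away from zero (same trick as test_div).
  const auto other_cpu = at::rand(other_shape, at::device(at::kCPU).dtype(at::kFloat)) + 0.01;

  const auto out_cpu = op(in_cpu, other_cpu);
  const auto out_vulkan = op(in_cpu.vulkan(), other_cpu.vulkan());

  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
  if (!check) {
    showRtol(out_cpu, out_vulkan.cpu());
  }
  ASSERT_TRUE(check);
}

// Example usage:
//   test_binary_op({11, 7, 139, 109}, {11, 7, 139, 109},
//                  [](const at::Tensor& a, const at::Tensor& b) { return at::div(a, b); });
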
2233 TEST_F(VulkanAPITest, div) {
2234   test_div({11, 7, 139, 109}, {11, 7, 139, 109});
2235 }
2236 
2237 TEST_F(VulkanAPITest, div_broadcast0) {
2238   test_div({3, 5, 1, 1}, {3, 5, 179, 221});
2239 }
2240 
2241 TEST_F(VulkanAPITest, div_broadcast1) {
2242   test_div({3, 5, 179, 221}, {3, 5, 1, 221});
2243 }
2244 
2245 TEST_F(VulkanAPITest, div_broadcast2) {
2246   test_div({3, 4, 179, 221}, {4, 1, 1});
2247 }
2248 
2249 TEST_F(VulkanAPITest, div_broadcast3) {
2250   test_div({3, 4, 179, 221}, {1, 1, 179, 221});
2251 }
2252 
2253 TEST_F(VulkanAPITest, div_broadcast4) {
2254   test_div({3, 4, 41, 1}, {1, 41, 53});
2255 }
2256 
2257 TEST_F(VulkanAPITest, div_broadcast5) {
2258   test_div({2, 1, 7, 1}, {1, 5, 1, 4});
2259 }
2260 
2261 TEST_F(VulkanAPITest, div_broadcast6) {
2262   test_div({1, 15, 5, 4}, {21, 1, 5, 4});
2263 }
2264 
2265 TEST_F(VulkanAPITest, div_zero_dim) {
2266   test_div({1, 15, 5, 4}, {});
2267 }
2268 
2269 TEST_F(VulkanAPITest, div_) {
2270   auto a_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
2271   auto a_vulkan = a_cpu.vulkan();
2272 
2273   const auto b_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat)) + 0.01;
2274   const auto b_vulkan = b_cpu.vulkan();
2275 
2276   a_cpu.div_(b_cpu);
2277   a_vulkan.div_(b_vulkan);
2278 
2279   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
2280   if (!check) {
2281     showRtol(a_cpu, a_vulkan.cpu());
2282   }
2283 
2284   ASSERT_TRUE(check);
2285 }
2286 
2287 TEST_F(VulkanAPITest, div_broadcast0_) {
2288   auto a_cpu = at::rand({12, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
2289   auto a_vulkan = a_cpu.vulkan();
2290 
2291   const auto b_cpu = at::rand({12, 17, 29, 1}, at::device(at::kCPU).dtype(at::kFloat)) + 0.01;
2292   const auto b_vulkan = b_cpu.vulkan();
2293 
2294   a_cpu.div_(b_cpu);
2295   a_vulkan.div_(b_vulkan);
2296 
2297   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
2298   if (!check) {
2299     showRtol(a_cpu, a_vulkan.cpu());
2300   }
2301 
2302   ASSERT_TRUE(check);
2303 }
2304 
2305 TEST_F(VulkanAPITest, div_broadcast1_) {
2306   auto a_cpu = at::rand({3, 8, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
2307   auto a_vulkan = a_cpu.vulkan();
2308 
2309   const auto b_cpu = at::rand({8, 1, 1}, at::device(at::kCPU).dtype(at::kFloat)) + 0.01;
2310   const auto b_vulkan = b_cpu.vulkan();
2311 
2312   a_cpu.div_(b_cpu);
2313   a_vulkan.div_(b_vulkan);
2314 
2315   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
2316   if (!check) {
2317     showRtol(a_cpu, a_vulkan.cpu());
2318   }
2319 
2320   ASSERT_TRUE(check);
2321 }
2322 
2323 TEST_F(VulkanAPITest, div_scalar) {
2324 
2325   const auto a_cpu = at::rand({17, 213, 213, 7}, at::device(at::kCPU).dtype(at::kFloat));
2326   const auto a_vulkan = a_cpu.vulkan();
2327 
2328   const float b_scalar = 3.1415f;
2329 
2330   const auto c_cpu = at::div(a_cpu, b_scalar);
2331   const auto c_vulkan = at::div(a_vulkan, b_scalar);
2332 
2333   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
2334   if (!check) {
2335     showRtol(c_cpu, c_vulkan.cpu());
2336   }
2337 
2338   ASSERT_TRUE(check);
2339 }
2340 
2341 TEST_F(VulkanAPITest, div_scalar_) {
2342   auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
2343   auto a_vulkan = a_cpu.vulkan();
2344 
2345   const float b_scalar = 3.1415f;
2346 
2347   a_cpu.div_(b_scalar);
2348   a_vulkan.div_(b_scalar);
2349 
2350   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
2351   if (!check) {
2352     showRtol(a_cpu, a_vulkan.cpu());
2353   }
2354 
2355   ASSERT_TRUE(check);
2356 }
2357 
2358 TEST_F(VulkanAPITest, div_scalar_wrapped) {
2359   if (!at::is_vulkan_available()) {
2360     return;
2361   }
2362 
2363   const auto a_cpu = at::rand({17, 213, 213, 7}, at::device(at::kCPU).dtype(at::kFloat));
2364   const auto a_vulkan = a_cpu.vulkan();
2365 
2366   const auto b_scalar = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat)) + 0.01;
2367 
2368   const auto c_cpu = at::div(a_cpu, b_scalar);
2369   const auto c_vulkan = at::div(a_vulkan, b_scalar);
2370 
2371   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
2372   if (!check) {
2373     showRtol(c_cpu, c_vulkan.cpu());
2374   }
2375 
2376   ASSERT_TRUE(check);
2377 }
2378 
2379 TEST_F(VulkanAPITest, div_scalar_wrapped_) {
2380   if (!at::is_vulkan_available()) {
2381     return;
2382   }
2383 
2384   auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
2385   auto a_vulkan = a_cpu.vulkan();
2386 
2387   const auto b_scalar = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat)) + 0.01;
2388 
2389   a_cpu.div_(b_scalar);
2390   a_vulkan.div_(b_scalar);
2391 
2392   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
2393   if (!check) {
2394     showRtol(a_cpu, a_vulkan.cpu());
2395   }
2396 
2397   ASSERT_TRUE(check);
2398 }
2399 
2400 TEST_F(VulkanAPITest, div_to_scalar_wrapped) {
2401   if (!at::is_vulkan_available()) {
2402     return;
2403   }
2404 
2405   const auto a = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));
2406 
2407   const auto b_cpu = at::rand({2, 3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)) + 0.01;
2408   const auto b_vulkan = b_cpu.vulkan();
2409 
2410   const auto c_cpu = at::div(a, b_cpu);
2411   const auto c_vulkan = at::div(a, b_vulkan);
2412 
2413   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
2414   if (!check) {
2415     showRtol(c_cpu, c_vulkan.cpu());
2416   }
2417 
2418   ASSERT_TRUE(check);
2419 }
2420 
2421 TEST_F(VulkanAPITest, empty) {
2422 
2423   ASSERT_NO_THROW(at::empty({1, 17, 41, 53}, at::device(at::kVulkan).dtype(at::kFloat)));
2424 }
2425 
2426 void test_expand(const at::IntArrayRef input_shape, const at::IntArrayRef output_shape) {
2427   c10::InferenceMode mode;
2428   const auto cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
2429   const auto vulkan = cpu.vulkan();
2430 
2431   const auto out_cpu = cpu.expand(output_shape);
2432   const auto out_vulkan = vulkan.expand(output_shape);
2433 
2434   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
2435   if (!check) {
2436     showRtol(out_cpu, out_vulkan.cpu());
2437   }
2438   ASSERT_TRUE(check);
2439 }
2440 
2441 TEST_F(VulkanAPITest, expand_exceptions) {
2442   // Vulkan expand supports input dims <= 4
2443   auto in_cpu = at::rand({1, 2, 3, 4, 5}, at::device(at::kCPU).dtype(at::kFloat));
2444   EXPECT_THROW(const auto out_vulkan = in_cpu.vulkan().expand({1, 2, 3, 4}), ::std::exception);
2445 
2446   // Vulkan expand supports output_size <= 4
2447   in_cpu = at::rand({1, 2, 3, 4}, at::device(at::kCPU).dtype(at::kFloat));
2448   EXPECT_THROW(const auto out_vulkan = in_cpu.vulkan().expand({1, 1, 2, 3, 4}), ::std::exception);
2449 
2450   // Vulkan expand expects output size >= input
2451   in_cpu = at::rand({1, 2, 3}, at::device(at::kCPU).dtype(at::kFloat));
2452   EXPECT_THROW(const auto out_vulkan = in_cpu.vulkan().expand({2, 3}), ::std::exception);
2453 
2454   // Non-singleton dimensions must match
2455   in_cpu = at::rand({3, 1}, at::device(at::kCPU).dtype(at::kFloat));
2456   EXPECT_THROW(const auto out_vulkan = in_cpu.vulkan().expand({1, 1}), ::std::exception);
2457 
2458   // -1 not allowed in leading, non-existing dimension
2459   in_cpu = at::rand({3, 1}, at::device(at::kCPU).dtype(at::kFloat));
2460   EXPECT_THROW(const auto out_vulkan = in_cpu.vulkan().expand({-1, 3, 1}), ::std::exception);
2461 }
2462 
2463 TEST_F(VulkanAPITest, expand_1d) {
2464   test_expand({1}, {3});
2465 
2466   test_expand({1}, {9, 3});       // 1d->2d
2467   test_expand({1}, {8, 9, 3});    // 1d->3d
2468   test_expand({1}, {7, 8, 9, 3}); // 1d->4d
2469 }
2470 
2471 TEST_F(VulkanAPITest, expand_2d) {
2472   test_expand({5, 1}, {-1, 5}); // W
2473   test_expand({1, 5}, {5, 5});  // H
2474 
2475   test_expand({5, 1}, {2, -1, 5});    // 2d->3d
2476   test_expand({1, 5}, {2, 5, 3, -1}); // 2d->4d
2477 }
2478 
2479 TEST_F(VulkanAPITest, expand_3d) {
2480   test_expand({3, 4, 1}, {3, 4, -1}); // W
2481   test_expand({3, 1, 5}, {-1, 4, 5}); // H
2482   test_expand({1, 4, 5}, {3, -1, 5}); // C
2483 
2484   test_expand({5, 4, 3}, {2, -1, -1, -1}); // 3d->4d
2485 }
2486 
2487 TEST_F(VulkanAPITest, expand_4d) {
2488   test_expand({5, 4, 3, 1}, {5, 4, 3, 9}); // W
2489   test_expand({5, 4, 1, 2}, {5, 4, 9, 2}); // H
2490   test_expand({5, 1, 3, 2}, {5, 9, 3, 2}); // C
2491   test_expand({1, 4, 3, 2}, {9, 4, 3, 2}); // N
2492 }
2493 
2494 TEST_F(VulkanAPITest, expand_as) {
2495   // expand_as calls into expand; aside from negative sizes, the expand tests above already provide sufficient coverage.
2496   c10::InferenceMode mode;
2497   const auto cpu = at::rand({1, 1, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
2498   const auto vulkan = cpu.vulkan();
2499   const auto other = at::rand({9, 11, 33, 22}, at::device(at::kCPU).dtype(at::kFloat));
2500 
2501   const auto out_cpu = cpu.expand_as(other);
2502   const auto out_vulkan = vulkan.expand_as(other);
2503 
2504   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
2505   if (!check) {
2506     showRtol(out_cpu, out_vulkan.cpu());
2507   }
2508   ASSERT_TRUE(check);
2509 }
2510 
2511 void test_flip(const at::IntArrayRef input_shape, const at::IntArrayRef dim_list) {
2512   c10::InferenceMode mode;
2513   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
2514   const auto in_vulkan = in_cpu.vulkan();
2515 
2516   const auto out_cpu = at::flip(in_cpu, dim_list);
2517   const auto out_vulkan = at::flip(in_vulkan, dim_list);
2518 
2519   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
2520   if (!check) {
2521     showRtol(out_cpu, out_vulkan.cpu());
2522     std::cout << "test flip failed with input_shape: " << input_shape
2523               << " and dim_list: " << dim_list << std::endl;
2524   }
2525 
2526   ASSERT_TRUE(check);
2527 }
2528 
2529 TEST_F(VulkanAPITest, flip_1d) {
2530   test_flip({5}, {0});
2531   test_flip({5}, {-1});
2532 }
2533 
2534 TEST_F(VulkanAPITest, flip_2d) {
2535   test_flip({5, 5}, {-1});
2536   test_flip({2, 7}, {-2});
2537 
2538   test_flip({5, 5}, {0, 1});
2539 }
2540 
2541 TEST_F(VulkanAPITest, flip_3d) {
2542   test_flip({5, 7, 5}, {-1});
2543   test_flip({2, 9, 7}, {-2});
2544   test_flip({9, 7, 5}, {-3});
2545 
2546   test_flip({10, 7, 5}, {0, 1});
2547   test_flip({10, 7, 5}, {0, 2});
2548   test_flip({10, 7, 5}, {1, 2});
2549 
2550   test_flip({10, 7, 5}, {2, 1, 0});
2551 }
2552 
2553 TEST_F(VulkanAPITest, flip_4d) {
2554   test_flip({2, 9, 1, 1}, {-1});
2555   test_flip({7, 5, 9, 3}, {-2});
2556   test_flip({3, 8, 5, 2}, {-3});
2557   test_flip({7, 9, 5, 3}, {-4});
2558 
2559   test_flip({10, 7, 5, 6}, {0, 1});
2560   test_flip({10, 7, 5, 6}, {0, 2});
2561   test_flip({10, 7, 5, 6}, {0, 3});
2562   test_flip({10, 7, 5, 6}, {1, 2});
2563   test_flip({10, 7, 5, 6}, {1, 3});
2564   test_flip({10, 7, 5, 6}, {2, 3});
2565 
2566   test_flip({10, 7, 5, 6}, {0, 1, 2});
2567   test_flip({10, 7, 5, 6}, {0, 1, 3});
2568   test_flip({10, 7, 5, 6}, {0, 2, 3});
2569   test_flip({10, 7, 5, 6}, {3, 2, 1});
2570 
2571   test_flip({10, 7, 5, 6}, {3, 2, 1, 0});
2572 }
2573 
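// Both gelu tests request the "tanh" approximation,
//   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).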
2574 TEST_F(VulkanAPITest, gelu) {
2575   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
2576   const auto in_vulkan = in_cpu.vulkan();
2577 
2578   auto out_cpu = at::gelu(in_cpu, "tanh");
2579   auto out_vulkan = at::gelu(in_vulkan, "tanh");
2580 
2581   auto check = almostEqual(out_cpu, out_vulkan.cpu());
2582 
2583   if (!check) {
2584     showRtol(out_cpu, out_vulkan.cpu());
2585   }
2586 
2587   ASSERT_TRUE(check);
2588 }
2589 
2590 TEST_F(VulkanAPITest, gelu_) {
2591   auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
2592   auto vulkan = cpu.vulkan();
2593 
2594   at::gelu_(cpu, "tanh");
2595   at::gelu_(vulkan, "tanh");
2596 
2597   auto check = almostEqual(cpu, vulkan.cpu());
2598   if (!check) {
2599     showRtol(cpu, vulkan.cpu());
2600   }
2601 
2602   ASSERT_TRUE(check);
2603 }
2604 
2605 void test_glu(const at::IntArrayRef input_shape) {
2606   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
2607   const auto in_vulkan = in_cpu.vulkan();
2608 
2609   const auto out_cpu = at::glu(in_cpu, 1);
2610   const auto out_vulkan = at::glu(in_vulkan, 1);
2611 
2612   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
2613   if (!check) {
2614     showRtol(out_cpu, out_vulkan.cpu());
2615   }
2616 
2617   ASSERT_TRUE(check);
2618 }
2619 
2620 TEST_F(VulkanAPITest, glu_ch_200) {
2621   test_glu({17, 200, 302, 5});
2622 }
2623 
2624 TEST_F(VulkanAPITest, glu_ch_64) {
2625   test_glu({1, 64, 100, 8});
2626 }
2627 
2628 TEST_F(VulkanAPITest, glu_ch_32) {
2629   test_glu({1, 32, 100, 19});
2630 }
2631 
2632 // Re-enable once glu_channel shader is fixed
2633 TEST_F(VulkanAPITest, DISABLED_glu_ch_10) {
2634   test_glu({17, 10, 57, 41});
2635 }
2636 
2637 // Re-enable once glu_channel shader is fixed
2638 TEST_F(VulkanAPITest, DISABLED_glu_ch_2) {
2639   test_glu({1, 2, 100, 40});
2640 }
2641 
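// hardsigmoid(x) = clamp(x / 6 + 0.5, 0, 1); scaling the random input to
// [-6, 6] exercises both saturated regions as well as the linear segment.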
2642 TEST_F(VulkanAPITest, hardsigmoid) {
2643   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat))*12 - 6;
2644   const auto in_vulkan = in_cpu.vulkan();
2645 
2646   const auto out_cpu = at::hardsigmoid(in_cpu);
2647   const auto out_vulkan = at::hardsigmoid(in_vulkan);
2648 
2649   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
2650   if (!check) {
2651     showRtol(out_cpu, out_vulkan.cpu());
2652   }
2653 
2654   ASSERT_TRUE(check);
2655 }
2656 
2657 TEST_F(VulkanAPITest, hardsigmoid_) {
2658   auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat))*12 - 6;
2659   auto vulkan = cpu.vulkan();
2660 
2661   at::hardsigmoid_(cpu);
2662   at::hardsigmoid_(vulkan);
2663 
2664   const auto check = almostEqual(cpu, vulkan.cpu());
2665   if (!check) {
2666     showRtol(cpu, vulkan.cpu());
2667   }
2668 
2669   ASSERT_TRUE(check);
2670 }
2671 
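// hardshrink zeroes elements with |x| <= lambd and passes the rest through
// unchanged; a negative lambd therefore leaves every element unchanged.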
2672 TEST_F(VulkanAPITest, hardshrink) {
2673   for (const auto lambd_value : {-4.2, -1.0, 0.42, 1.0, 4.2, 13.7}) {
2674     // Generate values between -10 and +10
2675     const auto in_cpu = (at::rand({3, 63, 79, 17}, at::device(at::kCPU).dtype(at::kFloat)) - 0.5) * 20;
2676     const auto in_vulkan = in_cpu.vulkan();
2677 
2678     const auto out_vulkan = at::hardshrink(in_vulkan, lambd_value);
2679 
2680     const auto check = checkHardShrink(in_cpu, out_vulkan.cpu(), lambd_value);
2681     ASSERT_TRUE(check);
2682   }
2683 }
2684 
2685 TEST_F(VulkanAPITest, hardshrink_) {
2686   for (const auto lambd_value : {0.42, 1.0, 4.2, 13.7}) {
2687     // Generate values between -10 and +10
2688     const auto in_cpu = (at::rand({3, 63, 79, 17}, at::device(at::kCPU).dtype(at::kFloat)) - 0.5) * 20;
2689     const auto in_vulkan = in_cpu.vulkan();
2690 
2691     const auto out_cpu = in_cpu.hardshrink(lambd_value);
2692     const auto out_vulkan = in_vulkan.hardshrink(lambd_value).cpu();
2693 
2694     const auto check = checkHardShrink(out_cpu, out_vulkan, lambd_value);
2695     ASSERT_TRUE(check);
2696   }
2697 }
2698 
2699 TEST_F(VulkanAPITest, hardtanh) {
2700   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 10;
2701   const auto in_vulkan = in_cpu.vulkan();
2702 
2703   const auto out_cpu = at::hardtanh(in_cpu, 3, 7);
2704   const auto out_vulkan = at::hardtanh(in_vulkan, 3, 7);
2705 
2706   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
2707   if (!check) {
2708     showRtol(out_cpu, out_vulkan.cpu());
2709   }
2710 
2711   ASSERT_TRUE(check);
2712 }
2713 
2714 TEST_F(VulkanAPITest, hardtanh_) {
2715   auto a_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 10;
2716   auto a_vulkan = a_cpu.vulkan();
2717 
2718   at::hardtanh_(a_cpu, 3, 7);
2719   at::hardtanh_(a_vulkan, 3, 7);
2720 
2721   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
2722   if (!check) {
2723     showRtol(a_cpu, a_vulkan.cpu());
2724   }
2725 
2726   ASSERT_TRUE(check);
2727 }
2728 
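// test_packed_layer_norm routes through the prepacked Vulkan ops
// (create_layernorm_context / run_layernorm_context) and compares against
// at::layer_norm on CPU; test_layer_norm further below calls at::layer_norm
// directly on Vulkan tensors. Both compute
//   y = (x - mean(x)) / sqrt(var(x) + eps) * weight + bias
// over the trailing normalized_shape dimensions.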
2729 void test_packed_layer_norm(
2730     const at::IntArrayRef input_shape,
2731     const at::IntArrayRef normalized_shape,
2732     const at::IntArrayRef weight_shape,
2733     const at::IntArrayRef bias_shape,
2734     const float eps) {
2735   c10::InferenceMode mode;
2736 
2737   const auto input_cpu =
2738       at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
2739   const auto input_vulkan = input_cpu.vulkan();
2740 
2741   const auto weight_cpu =
2742       at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
2743 
2744   const auto bias_cpu =
2745       at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
2746 
2747   const auto output_cpu = at::layer_norm(
2748       input_cpu, normalized_shape, weight_cpu, bias_cpu, eps, false);
2749 
2750   auto prepack = callOpByName(
2751       "vulkan_prepack::create_layernorm_context",
2752       "",
2753       weight_cpu, bias_cpu, eps);
2754 
2755   auto vulkan_output = callOpByName(
2756       "vulkan_prepack::run_layernorm_context",
2757       "",
2758       input_cpu.vulkan(), normalized_shape, prepack[0]);
2759 
2760   auto output_vulkan = vulkan_output[0].toTensor();
2761 
2762   const auto check = almostEqual(output_cpu, output_vulkan.cpu());
2763   if (!check) {
2764     showRtol(output_cpu, output_vulkan.cpu());
2765   }
2766 
2767   ASSERT_TRUE(check);
2768 }
2769 
2770 TEST_F(VulkanAPITest, packed_layer_norm_2d) {
2771   test_packed_layer_norm({5, 7}, {7}, {7}, {7}, 1e-05);
2772   test_packed_layer_norm({5, 7}, {5, 7}, {5, 7}, {5, 7}, 1e-05);
2773 }
2774 
2775 TEST_F(VulkanAPITest, packed_layer_norm_3d) {
2776   test_packed_layer_norm({11, 5, 7}, {7}, {7}, {7}, 1e-05);
2777   test_packed_layer_norm({11, 5, 7}, {5, 7}, {5, 7}, {5, 7}, 1e-05);
2778   test_packed_layer_norm({11, 5, 7}, {11, 5, 7}, {11, 5, 7}, {11, 5, 7}, 1e-05);
2779 }
2780 
2781 TEST_F(VulkanAPITest, packed_layer_norm_4d) {
2782   test_packed_layer_norm({3, 11, 5, 7}, {7}, {7}, {7}, 1e-05);
2783   test_packed_layer_norm({3, 11, 5, 7}, {5, 7}, {5, 7}, {5, 7}, 1e-05);
2784   test_packed_layer_norm({3, 11, 5, 7}, {11, 5, 7}, {11, 5, 7}, {11, 5, 7}, 1e-05);
2785   test_packed_layer_norm(
2786       {3, 11, 5, 7}, {3, 11, 5, 7}, {3, 11, 5, 7}, {3, 11, 5, 7}, 1e-05);
2787 }
2788 
2789 TEST_F(VulkanAPITest, layer_norm_invalid_inputs) {
2790   c10::InferenceMode mode;
2791 
2792   // Act: incorrect normalized shape
2793   EXPECT_THROW({
2794     at::layer_norm(
2795       at::rand({3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2796       {8, 5},
2797       at::rand({8, 5}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2798       at::rand({8, 5}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2799       1e-05,
2800       false);
2801   }, ::std::exception);
2802 
2803   // Act: incorrect weight dimensions
2804   EXPECT_THROW({
2805     at::layer_norm(
2806       at::rand({3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2807       {3, 5, 7},
2808       at::rand({3, 5}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2809       at::rand({3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2810       1e-05,
2811       false);
2812   }, ::std::exception);
2813 
2814   // Act: incorrect bias dimensions
2815   EXPECT_THROW({
2816     at::layer_norm(
2817       at::rand({3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2818       {3, 5, 7},
2819       at::rand({3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2820       at::rand({5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2821       1e-05,
2822       false);
2823   }, ::std::exception);
2824 
2825   // Act: input has too many dimensions
2826   EXPECT_THROW({
2827     at::layer_norm(
2828       at::rand({1, 2, 3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2829       {3, 5, 7},
2830       at::rand({3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2831       at::rand({3, 5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
2832       1e-05,
2833       false);
2834   }, ::std::exception);
2835 }
2836 
2837 void test_layer_norm(
2838     const at::IntArrayRef input_shape,
2839     const at::IntArrayRef normalized_shape,
2840     const at::IntArrayRef weight_shape,
2841     const at::IntArrayRef bias_shape,
2842     const float eps) {
2843   c10::InferenceMode mode;
2844 
2845   const auto input_cpu =
2846       at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
2847   const auto input_vulkan = input_cpu.vulkan();
2848 
2849   const auto weight_cpu =
2850       at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
2851   const auto weight_vulkan = weight_cpu.vulkan();
2852 
2853   const auto bias_cpu =
2854       at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
2855   const auto bias_vulkan = bias_cpu.vulkan();
2856 
2857   const auto output_cpu = at::layer_norm(
2858       input_cpu, normalized_shape, weight_cpu, bias_cpu, eps, false);
2859   const auto output_vulkan = at::layer_norm(
2860       input_vulkan, normalized_shape, weight_vulkan, bias_vulkan, eps, false);
2861 
2862   const auto check = almostEqual(output_cpu, output_vulkan.cpu());
2863   if (!check) {
2864     showRtol(output_cpu, output_vulkan.cpu());
2865   }
2866 
2867   ASSERT_TRUE(check);
2868 }
2869 
2870 TEST_F(VulkanAPITest, layer_norm_2d) {
2871   test_layer_norm({5, 7}, {7}, {7}, {7}, 1e-05);
2872   test_layer_norm({5, 7}, {5, 7}, {5, 7}, {5, 7}, 1e-05);
2873 }
2874 
2875 TEST_F(VulkanAPITest, layer_norm_3d) {
2876   test_layer_norm({11, 5, 7}, {7}, {7}, {7}, 1e-05);
2877   test_layer_norm({11, 5, 7}, {5, 7}, {5, 7}, {5, 7}, 1e-05);
2878   test_layer_norm({11, 5, 7}, {11, 5, 7}, {11, 5, 7}, {11, 5, 7}, 1e-05);
2879 }
2880 
2881 TEST_F(VulkanAPITest, layer_norm_4d) {
2882   test_layer_norm({3, 11, 5, 7}, {7}, {7}, {7}, 1e-05);
2883   test_layer_norm({3, 11, 5, 7}, {5, 7}, {5, 7}, {5, 7}, 1e-05);
2884   test_layer_norm({3, 11, 5, 7}, {11, 5, 7}, {11, 5, 7}, {11, 5, 7}, 1e-05);
2885   test_layer_norm(
2886       {3, 11, 5, 7}, {3, 11, 5, 7}, {3, 11, 5, 7}, {3, 11, 5, 7}, 1e-05);
2887 }
2888 
2889 void test_native_layer_norm(
2890     const at::IntArrayRef input_shape,
2891     const at::IntArrayRef normalized_shape,
2892     const at::IntArrayRef weight_shape,
2893     const at::IntArrayRef bias_shape,
2894     const float eps) {
2895   c10::InferenceMode mode;
2896 
2897   const auto input_cpu =
2898       at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
2899   const auto input_vulkan = input_cpu.vulkan();
2900 
2901   const auto weight_cpu =
2902       at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
2903   const auto weight_vulkan = weight_cpu.vulkan();
2904 
2905   const auto bias_cpu =
2906       at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
2907   const auto bias_vulkan = bias_cpu.vulkan();
2908 
2909   const auto output_cpu = at::native_layer_norm(
2910       input_cpu, normalized_shape, weight_cpu, bias_cpu, eps);
2911   const auto output_vulkan = at::native_layer_norm(
2912       input_vulkan, normalized_shape, weight_vulkan, bias_vulkan, eps);
2913 
2914   const auto check0 =
2915       almostEqual(std::get<0>(output_cpu), std::get<0>(output_vulkan).cpu());
2916   const auto check1 =
2917       almostEqual(std::get<1>(output_cpu), std::get<1>(output_vulkan).cpu());
2918   const auto check2 =
2919       almostEqual(std::get<2>(output_cpu), std::get<2>(output_vulkan).cpu());
2920 
2921   if (!check0) {
2922     std::cout
2923         << "the first output of native_layer_norm: layer_norm is incorrect"
2924         << std::endl;
2925     showRtol(std::get<0>(output_cpu), std::get<0>(output_vulkan).cpu());
2926   }
2927   if (!check1) {
2928     std::cout << "the second output of native_layer_norm: mean is incorrect"
2929               << std::endl;
2930     showRtol(std::get<1>(output_cpu), std::get<1>(output_vulkan).cpu());
2931   }
2932   if (!check2) {
2933     std::cout
2934         << "the third output of native_layer_norm: 1/sqrt(var+eps) is incorrect"
2935         << std::endl;
2936     showRtol(std::get<2>(output_cpu), std::get<2>(output_vulkan).cpu());
2937   }
2938 
2939   ASSERT_TRUE(check0 && check1 && check2);
2940 }
2941 
2942 TEST_F(VulkanAPITest, native_layer_norm_2d) {
2943   test_native_layer_norm({5, 7}, {7}, {7}, {7}, 1e-05);
2944   test_native_layer_norm({5, 7}, {5, 7}, {5, 7}, {5, 7}, 1e-05);
2945 }
2946 
2947 TEST_F(VulkanAPITest, native_layer_norm_3d) {
2948   test_native_layer_norm({11, 5, 7}, {7}, {7}, {7}, 1e-05);
2949   test_native_layer_norm({11, 5, 7}, {5, 7}, {5, 7}, {5, 7}, 1e-05);
2950   test_native_layer_norm({11, 5, 7}, {11, 5, 7}, {11, 5, 7}, {11, 5, 7}, 1e-05);
2951 }
2952 
2953 TEST_F(VulkanAPITest, native_layer_norm_4d) {
2954   test_native_layer_norm({3, 11, 5, 7}, {7}, {7}, {7}, 1e-05);
2955   test_native_layer_norm({3, 11, 5, 7}, {5, 7}, {5, 7}, {5, 7}, 1e-05);
2956   test_native_layer_norm(
2957       {3, 11, 5, 7}, {11, 5, 7}, {11, 5, 7}, {11, 5, 7}, 1e-05);
2958   test_native_layer_norm(
2959       {3, 11, 5, 7}, {3, 11, 5, 7}, {3, 11, 5, 7}, {3, 11, 5, 7}, 1e-05);
2960 }
2961 
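// leaky_relu(x) = x for x >= 0 and negative_slope * x otherwise; the loop also
// covers negative_slope == 1.0 (identity) and a negative slope.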
2962 TEST_F(VulkanAPITest, leaky_relu) {
2963   for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
2964     const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
2965     const auto in_vulkan = in_cpu.vulkan();
2966 
2967     const auto out_cpu = at::leaky_relu(in_cpu, negative_slope);
2968     const auto out_vulkan = at::leaky_relu(in_vulkan, negative_slope);
2969 
2970     const auto check = almostEqual(out_cpu, out_vulkan.cpu());
2971 
2972     if (!check) {
2973       showRtol(out_cpu, out_vulkan.cpu());
2974     }
2975 
2976     ASSERT_TRUE(check);
2977   }
2978 }
2979 
2980 TEST_F(VulkanAPITest, leaky_relu_) {
2981   for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
2982     auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
2983     auto vulkan = cpu.vulkan();
2984 
2985     at::leaky_relu_(cpu, negative_slope);
2986     at::leaky_relu_(vulkan, negative_slope);
2987 
2988     const auto check = almostEqual(cpu, vulkan.cpu());
2989     if (!check) {
2990       showRtol(cpu, vulkan.cpu());
2991     }
2992 
2993     ASSERT_TRUE(check);
2994   }
2995 }
2996 
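// at::lerp(a, b, w) computes a + w * (b - a) elementwise; the broadcast
// variants follow the usual broadcasting rules.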
2997 TEST_F(VulkanAPITest, lerp) {
2998   const auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
2999   const auto a_vulkan = a_cpu.vulkan();
3000 
3001   const auto b_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
3002   const auto b_vulkan = b_cpu.vulkan();
3003 
3004   const auto w_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
3005   const auto w_vulkan = w_cpu.vulkan();
3006 
3007   const auto c_cpu = at::lerp(a_cpu, b_cpu, w_cpu);
3008   const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_vulkan);
3009 
3010   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
3011   if (!check) {
3012     showRtol(c_cpu, c_vulkan.cpu());
3013   }
3014 
3015   ASSERT_TRUE(check);
3016 }
3017 
TEST_F(VulkanAPITest,lerp_broadcast0)3018 TEST_F(VulkanAPITest, lerp_broadcast0) {
3019   const auto a_cpu = at::rand({3, 5, 179, 221}, at::device(at::kCPU).dtype(at::kFloat));
3020   const auto a_vulkan = a_cpu.vulkan();
3021 
3022   const auto b_cpu = at::rand({3, 5, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
3023   const auto b_vulkan = b_cpu.vulkan();
3024 
3025   const auto w_cpu = at::rand({3, 5, 1, 221}, at::device(at::kCPU).dtype(at::kFloat));
3026   const auto w_vulkan = w_cpu.vulkan();
3027 
3028   const auto c_cpu = at::lerp(a_cpu, b_cpu, w_cpu);
3029   const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_vulkan);
3030 
3031   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
3032   if (!check) {
3033     showRtol(c_cpu, c_vulkan.cpu());
3034   }
3035 
3036   ASSERT_TRUE(check);
3037 }
3038 
TEST_F(VulkanAPITest,lerp_broadcast1)3039 TEST_F(VulkanAPITest, lerp_broadcast1) {
3040   const auto a_cpu = at::rand({3, 4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat));
3041   const auto a_vulkan = a_cpu.vulkan();
3042 
3043   const auto b_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat));
3044   const auto b_vulkan = b_cpu.vulkan();
3045 
3046   const auto w_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat));
3047   const auto w_vulkan = w_cpu.vulkan();
3048 
3049   const auto c_cpu = at::lerp(a_cpu, b_cpu, w_cpu);
3050   const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_vulkan);
3051 
3052   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
3053   if (!check) {
3054     showRtol(c_cpu, c_vulkan.cpu());
3055   }
3056 
3057   ASSERT_TRUE(check);
3058 }
3059 
TEST_F(VulkanAPITest,lerp_)3060 TEST_F(VulkanAPITest, lerp_) {
3061   auto a_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
3062   auto a_vulkan = a_cpu.vulkan();
3063 
3064   const auto b_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
3065   const auto b_vulkan = b_cpu.vulkan();
3066 
3067   const auto w_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
3068   const auto w_vulkan = w_cpu.vulkan();
3069 
3070   a_cpu.lerp_(b_cpu, w_cpu);
3071   a_vulkan.lerp_(b_vulkan, w_vulkan);
3072 
3073   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
3074   if (!check) {
3075     showRtol(a_cpu, a_vulkan.cpu());
3076   }
3077 
3078   ASSERT_TRUE(check);
3079 }
3080 
TEST_F(VulkanAPITest,lerp_broadcast0_)3081 TEST_F(VulkanAPITest, lerp_broadcast0_) {
3082   auto a_cpu = at::rand({3, 5, 179, 221}, at::device(at::kCPU).dtype(at::kFloat));
3083   auto a_vulkan = a_cpu.vulkan();
3084 
3085   const auto b_cpu = at::rand({3, 5, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
3086   const auto b_vulkan = b_cpu.vulkan();
3087 
3088   const auto w_cpu = at::rand({3, 5, 1, 221}, at::device(at::kCPU).dtype(at::kFloat));
3089   const auto w_vulkan = w_cpu.vulkan();
3090 
3091   a_cpu.lerp_(b_cpu, w_cpu);
3092   a_vulkan.lerp_(b_vulkan, w_vulkan);
3093 
3094   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
3095   if (!check) {
3096     showRtol(a_cpu, a_vulkan.cpu());
3097   }
3098 
3099   ASSERT_TRUE(check);
3100 }
3101 
TEST_F(VulkanAPITest,lerp_broadcast1_)3102 TEST_F(VulkanAPITest, lerp_broadcast1_) {
3103   auto a_cpu = at::rand({3, 4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat));
3104   auto a_vulkan = a_cpu.vulkan();
3105 
3106   const auto b_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat));
3107   const auto b_vulkan = b_cpu.vulkan();
3108 
3109   const auto w_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat));
3110   const auto w_vulkan = w_cpu.vulkan();
3111 
3112   a_cpu.lerp_(b_cpu, w_cpu);
3113   a_vulkan.lerp_(b_vulkan, w_vulkan);
3114 
3115   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
3116   if (!check) {
3117     showRtol(a_cpu, a_vulkan.cpu());
3118   }
3119 
3120   ASSERT_TRUE(check);
3121 }
3122 
TEST_F(VulkanAPITest,lerp_scalar)3123 TEST_F(VulkanAPITest, lerp_scalar) {
3124   const auto a_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat));
3125   const auto a_vulkan = a_cpu.vulkan();
3126 
3127   const auto b_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat));
3128   const auto b_vulkan = b_cpu.vulkan();
3129 
3130   const float w_scalar = 3.1415f;
3131 
3132   const auto c_cpu = at::lerp(a_cpu, b_cpu, w_scalar);
3133   const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_scalar);
3134 
3135   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
3136   if (!check) {
3137     showRtol(c_cpu, c_vulkan.cpu());
3138   }
3139 
3140   ASSERT_TRUE(check);
3141 }
3142 
TEST_F(VulkanAPITest,lerp_scalar_)3143 TEST_F(VulkanAPITest, lerp_scalar_) {
3144   auto a_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat));
3145   auto a_vulkan = a_cpu.vulkan();
3146 
3147   const auto b_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat));
3148   const auto b_vulkan = b_cpu.vulkan();
3149 
3150   const float w_scalar = 3.1415f;
3151 
3152   a_cpu.lerp_(b_cpu, w_scalar);
3153   a_vulkan.lerp_(b_vulkan, w_scalar);
3154 
3155   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
3156   if (!check) {
3157     showRtol(a_cpu, a_vulkan.cpu());
3158   }
3159 
3160   ASSERT_TRUE(check);
3161 }
3162 
TEST_F(VulkanAPITest,hardswish)3163 TEST_F(VulkanAPITest, hardswish) {
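  // Scale the inputs from [0, 1) to [-6, 6) so both saturation regions of
  // hardswish (output 0 for x <= -3, identity for x >= 3) are exercised, not
  // just the quadratic middle section.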
3164   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat))*12 - 6;
3165   const auto in_vulkan = in_cpu.vulkan();
3166 
3167   const auto out_cpu = at::hardswish(in_cpu);
3168   const auto out_vulkan = at::hardswish(in_vulkan);
3169 
3170   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3171   if (!check) {
3172     showRtol(out_cpu, out_vulkan.cpu());
3173   }
3174 
3175   ASSERT_TRUE(check);
3176 }
3177 
TEST_F(VulkanAPITest,threshold)3178 TEST_F(VulkanAPITest, threshold) {
3179   const auto in_cpu = at::rand({2, 11, 57, 23}, at::device(at::kCPU).dtype(at::kFloat))*12 - 6;
3180   const auto in_vulkan = in_cpu.vulkan();
3181 
3182   const float threshold = 2.0f;
3183   const float value = 5.0f;
3184 
3185   const auto out_cpu = at::threshold(in_cpu, threshold, value);
3186   const auto out_vulkan = at::threshold(in_vulkan, threshold, value);
3187 
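  // Compare with checkThreshold rather than almostEqual: inputs that land
  // near the threshold are ambiguous, since fp16 rounding can flip which side
  // of the cutoff a value falls on.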
3188   const auto check = checkThreshold(out_cpu, out_vulkan.cpu(), threshold, value);
3189   ASSERT_TRUE(check);
3190 }
3191 
TEST_F(VulkanAPITest,hardswish_)3192 TEST_F(VulkanAPITest, hardswish_) {
3193   auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat))*12 - 6;
3194   auto vulkan = cpu.vulkan();
3195 
3196   at::hardswish_(cpu);
3197   at::hardswish_(vulkan);
3198 
3199   const auto check = almostEqual(cpu, vulkan.cpu());
3200   if (!check) {
3201     showRtol(cpu, vulkan.cpu());
3202   }
3203 
3204   ASSERT_TRUE(check);
3205 }
3206 
TEST_F(VulkanAPITest,masked_fill_invalidinputs_exceptions)3207 TEST_F(VulkanAPITest, masked_fill_invalidinputs_exceptions) {
3208   // Arrange: Vulkan masked_fill expects inputs of dim <= 4
3209   {
3210     const auto in_cpu =
3211         at::rand({3, 5, 2, 3, 2}, at::device(at::kCPU).dtype(at::kFloat));
3212     const auto mask_cpu =
3213         at::randint(0, 2, {2, 3, 2}, at::device(at::kCPU).dtype(at::kBool));
3214 
3215     // Act
3216     EXPECT_THROW(
3217         {
3218           const auto out_vulkan =
3219               in_cpu.vulkan().masked_fill(mask_cpu.vulkan(), -7.0f);
3220           ;
3221         },
3222         ::std::exception);
3223   }
3224 
3225   // Arrange: Vulkan masked_fill expects mask of dim <= 4
3226   {
3227     const auto in_cpu =
3228         at::rand({2, 3, 2}, at::device(at::kCPU).dtype(at::kFloat));
3229     const auto mask_cpu = at::randint(
3230         0, 2, {3, 5, 2, 3, 2}, at::device(at::kCPU).dtype(at::kBool));
3231 
3232     // Act
3233     EXPECT_THROW(
3234         {
3235           const auto out_vulkan =
3236               in_cpu.vulkan().masked_fill(mask_cpu.vulkan(), -7.0f);
3237           ;
3238         },
3239         ::std::exception);
3240   }
3241 
3242   // Arrange: shapes of input tensor and mask tensor should be broadcastable
3243   {
3244     const auto in_cpu =
3245         at::rand({2, 3, 2}, at::device(at::kCPU).dtype(at::kFloat));
3246     const auto mask_cpu =
3247         at::randint(0, 2, {3, 3, 2}, at::device(at::kCPU).dtype(at::kBool));
3248 
3249     // Act
3250     EXPECT_THROW(
3251         {
3252           const auto out_vulkan =
3253               in_cpu.vulkan().masked_fill(mask_cpu.vulkan(), -7.0f);
3254           ;
3255         },
3256         ::std::exception);
3257   }
3258 
3259   // Arrange: value should be a 0-dimensional value tensor or a scalar
3260   {
3261     const auto in_cpu =
3262         at::rand({2, 3, 2}, at::device(at::kCPU).dtype(at::kFloat));
3263     const auto mask_cpu =
3264         at::randint(0, 2, {2, 3, 2}, at::device(at::kCPU).dtype(at::kBool));
3265 
3266     // Act
3267     EXPECT_THROW(
3268         {
3269           const auto out_vulkan =
3270               in_cpu.vulkan().masked_fill(mask_cpu.vulkan(), at::rand({1, 2}));
3271           ;
3272         },
3273         ::std::exception);
3274   }
3275 }
3276 
print_shape(const std::vector<int64_t> & shape)3277 void print_shape(const std::vector<int64_t>& shape) {
3278   for (const auto& num : shape) {
3279     std::cout << num << " ";
3280   }
3281 }
3282 
test_masked_fill_scalar(const at::IntArrayRef input_shape,const at::IntArrayRef mask_shape)3283 void test_masked_fill_scalar(
3284     const at::IntArrayRef input_shape,
3285     const at::IntArrayRef mask_shape) {
3286   c10::InferenceMode mode;
3287 
3288   /**
3289    * We test masked_fill by considering all possible broadcasting cases of
3290    * input_shape and mask_shape. The given input_shape and mask_shape are
3291    * identical, e.g. both are equal to [3, 5, 2, 3]. First we truncate all
3292    * possible leading dimensions of input_shape and mask_shape, respectively.
3293    * Denote the results as curr_input_shape and curr_mask_shape, e.g.
3294    * curr_input_shape = [5, 2, 3] and curr_mask_shape = [2, 3]. Then for both
3295    * curr_input_shape and curr_mask_shape we generate all possible subsets of
3296    * the indices and set the corresponding elements to 1 for each subset. For
3297    * example, for curr_input_shape = [5, 2, 3], a possible input_idx_subset =
3298    * [0, 2]. We set the 0th and 2nd elements of curr_input_shape to be 1, then
3299    * curr_input_shape = [1, 2, 1]. Similarly for curr_mask_shape = [2, 3], a
3300    * possible mask_idx_subset = [0], then the updated curr_mask_shape = [1, 3].
3301    * In the end, we test masked_fill with the combinations of curr_input_shape
3302    * and curr_mask_shape. In the example above, an output tensor of shape [1, 2,
3303    * 3] will be generated.
3304    */
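  // Worked example (illustrative): with input_shape = [3, 5, 2, 3] and
  // input_shape_id = 2, curr_input_shape = [2, 3]. The subsets of {0, 1} are
  // {}, {0}, {1} and {0, 1}, yielding tmp_curr_input_shape values of [2, 3],
  // [1, 3], [2, 1] and [1, 1]. Each of these is then paired with every mask
  // shape generated the same way, covering all broadcastable combinations of
  // trailing dimensions.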
3305   const size_t input_dim = input_shape.size();
3306   const size_t mask_dim = mask_shape.size();
3307   for (int input_shape_id = input_dim - 1; input_shape_id >= 0;
3308        --input_shape_id) {
3309     // truncate input_shape by dropping its leading dimensions
3310     auto curr_input_shape =
3311         input_shape.slice(input_shape_id, input_dim - input_shape_id);
3312 
3313     // generate all possible subsets of numbers between 0 and input_dim -
3314     // input_shape_id - 1 (inclusive)
3315     std::vector<std::vector<int64_t>> input_indices_subsets;
3316     std::vector<int64_t> curr_input_indices;
3317     gen_all_subsets(
3318         input_indices_subsets,
3319         input_dim - input_shape_id,
3320         0,
3321         curr_input_indices);
3322 
3323     for (auto input_idx_subset : input_indices_subsets) {
3324       // set the elements at indices of the subset of curr_input_shape to 1
3325       auto tmp_curr_input_shape = curr_input_shape.vec();
3326       for (auto input_idx : input_idx_subset) {
3327         tmp_curr_input_shape[input_idx] = 1;
3328       }
3329 
3330       for (int mask_shape_id = mask_dim - 1; mask_shape_id >= 0;
3331            --mask_shape_id) {
3332         // truncate mask_shape by dropping its leading dimensions
3333         auto curr_mask_shape =
3334             mask_shape.slice(mask_shape_id, mask_dim - mask_shape_id);
3335 
3336         // generate all possible subsets of numbers between 0 and mask_dim -
3337         // mask_shape_id - 1 (inclusive)
3338         std::vector<std::vector<int64_t>> mask_indices_subsets;
3339         std::vector<int64_t> curr_mask_indices;
3340         gen_all_subsets(
3341             mask_indices_subsets,
3342             mask_dim - mask_shape_id,
3343             0,
3344             curr_mask_indices);
3345 
3346         for (auto mask_idx_subset : mask_indices_subsets) {
3347           // set the elements at indices of the subset of curr_mask_shape to 1
3348           auto tmp_curr_mask_shape = curr_mask_shape.vec();
3349           for (auto mask_idx : mask_idx_subset) {
3350             tmp_curr_mask_shape[mask_idx] = 1;
3351           }
3352 
3353           at::Tensor in_cpu = at::rand(
3354               tmp_curr_input_shape, at::device(at::kCPU).dtype(at::kFloat));
3355           at::Tensor mask_cpu = at::randint(
3356               0, 2, tmp_curr_mask_shape, at::device(at::kCPU).dtype(at::kBool));
3357           at::Tensor out_cpu = in_cpu.masked_fill(mask_cpu, -7.0f);
3358 
3359           at::Tensor in_vulkan = in_cpu.vulkan();
3360           at::Tensor mask_vulkan = mask_cpu.vulkan();
3361           at::Tensor out_vulkan = in_vulkan.masked_fill(mask_vulkan, -7.0f);
3362           const bool check = almostEqual(out_cpu, out_vulkan.cpu());
3363 
3364           if (!check) {
3365             showRtol(out_cpu, out_vulkan.cpu());
3366             std::cout << "Masked_fill test failed when input is of shape [";
3367             print_shape(tmp_curr_input_shape);
3368             std::cout << "], and mask of shape [";
3369             print_shape(tmp_curr_mask_shape);
3370             std::cout << "]" << std::endl;
3371           }
3372 
3373           ASSERT_TRUE(check);
3374         }
3375       }
3376     }
3377   }
3378 }
3379 
TEST_F(VulkanAPITest,masked_fill_scalar_mult4ch)3380 TEST_F(VulkanAPITest, masked_fill_scalar_mult4ch) {
3381   test_masked_fill_scalar({3, 4, 5, 7}, {3, 4, 5, 7});
3382 }
3383 
TEST_F(VulkanAPITest,masked_fill_scalar_nonmult4ch)3384 TEST_F(VulkanAPITest, masked_fill_scalar_nonmult4ch) {
3385   test_masked_fill_scalar({3, 5, 2, 3}, {3, 5, 2, 3});
3386 }
3387 
test_masked_fill_tensor(const at::IntArrayRef input_shape,const at::IntArrayRef mask_shape)3388 void test_masked_fill_tensor(
3389     const at::IntArrayRef input_shape,
3390     const at::IntArrayRef mask_shape) {
3391   c10::InferenceMode mode;
3392 
3393   at::Tensor in_cpu =
3394       at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
3395   at::Tensor mask_cpu =
3396       at::randint(0, 2, mask_shape, at::device(at::kCPU).dtype(at::kBool));
3397   at::Tensor out_cpu = in_cpu.masked_fill(mask_cpu, at::scalar_tensor(-7.0f));
3398   at::Tensor in_vulkan = in_cpu.vulkan();
3399   at::Tensor mask_vulkan = mask_cpu.vulkan();
3400   at::Tensor out_vulkan =
3401       in_vulkan.masked_fill(mask_vulkan, at::scalar_tensor(-7.0f));
3402   const bool check = almostEqual(out_cpu, out_vulkan.cpu());
3403   if (!check) {
3404     showRtol(out_cpu, out_vulkan.cpu());
3405   }
3406 
3407   ASSERT_TRUE(check);
3408 }
3409 
TEST_F(VulkanAPITest,masked_fill_tensor_mult4ch)3410 TEST_F(VulkanAPITest, masked_fill_tensor_mult4ch) {
3411   test_masked_fill_tensor({3, 4, 2, 3}, {1, 4, 1, 1});
3412 }
3413 
TEST_F(VulkanAPITest,masked_fill_tensor_nonmult4ch)3414 TEST_F(VulkanAPITest, masked_fill_tensor_nonmult4ch) {
3415   test_masked_fill_tensor({3, 5, 2, 3}, {1, 5, 1, 1});
3416 }
3417 
TEST_F(VulkanAPITest,max_pool2d)3418 TEST_F(VulkanAPITest, max_pool2d) {
3419   c10::InferenceMode mode;
3420 
3421   const auto in_cpu = at::rand({5, 13, 55, 68}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
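  // Arguments below: kernel_size = {3, 4}, stride = {2, 1}, padding = {1, 1},
  // dilation = {1, 1}, ceil_mode = false.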
3422   const auto out_cpu = at::max_pool2d(in_cpu, {3, 4}, {2, 1}, {1, 1}, {1, 1}, false);
3423   const auto out_vulkan = at::max_pool2d(in_cpu.vulkan(), {3, 4}, {2, 1}, {1, 1}, {1,1}, false);
3424 
3425   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3426   if (!check) {
3427     showRtol(out_cpu, out_vulkan.cpu());
3428   }
3429 
3430   ASSERT_TRUE(check);
3431 }
3432 
3433 
TEST_F(VulkanAPITest,mean_invalid_inputs)3434 TEST_F(VulkanAPITest, mean_invalid_inputs) {
3435   c10::InferenceMode mode;
3436 
3437   // Act: input dimension too large
3438   EXPECT_THROW({
3439     at::mean(at::rand({3, 5, 7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
3440       .vulkan(), {3});
3441   }, ::std::exception);
3442 
3443   // Act: dimension out of range
3444   EXPECT_THROW({
3445     at::mean(at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
3446       .vulkan(), {3});
3447   }, ::std::exception);
3448 
3449   // Act: dimension out of range
3450   EXPECT_THROW({
3451     at::mean(at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
3452       .vulkan(), {-4});
3453   }, ::std::exception);
3454 
3455   // Act: repeated dimensions
3456   EXPECT_THROW({
3457     at::mean(at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
3458       .vulkan(), {1, 1});
3459   }, ::std::exception);
3460 
3461   // Act: repeated dimensions
3462   EXPECT_THROW({
3463     at::mean(at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
3464       .vulkan(), {1, -2});
3465   }, ::std::exception);
3466 }
3467 
test_mean_dim(const at::IntArrayRef input_shape,const at::IntArrayRef dim_list,bool keepdim=false)3468 void test_mean_dim(const at::IntArrayRef input_shape, const at::IntArrayRef dim_list, bool keepdim=false) {
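  // keepdim=true retains the reduced dimensions with size 1 instead of
  // removing them from the output shape.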
3469   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
3470   const auto in_vulkan = in_cpu.vulkan();
3471 
3472   const auto out_cpu = at::mean(in_cpu, dim_list, keepdim);
3473   const auto out_vulkan = at::mean(in_vulkan, dim_list, keepdim);
3474 
3475   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3476   if (!check) {
3477     std::cout << "mean_dim test failed with input shape: "
3478               << input_shape << " and dim_list: " << dim_list << std::endl;
3479     showRtol(out_cpu, out_vulkan.cpu());
3480   }
3481 
3482   ASSERT_TRUE(check);
3483 }
3484 
TEST_F(VulkanAPITest,mean_dim_2d)3485 TEST_F(VulkanAPITest, mean_dim_2d) {
3486   test_mean_dim({2, 3}, {-1});
3487   test_mean_dim({2, 7}, {-2});
3488 }
3489 
TEST_F(VulkanAPITest,mean_dim_3d)3490 TEST_F(VulkanAPITest, mean_dim_3d) {
3491   test_mean_dim({9, 7, 5}, {-1});
3492   test_mean_dim({5, 7, 9}, {-2});
3493   test_mean_dim({5, 7, 9}, {-3});
3494 
3495   test_mean_dim({10, 7, 5}, {0, 1});
3496   test_mean_dim({10, 7, 5}, {0, 2});
3497   test_mean_dim({10, 7, 5}, {1, 2});
3498   test_mean_dim({10, 7, 5}, {-1, -2});
3499   test_mean_dim({10, 7, 5}, {0, -2});
3500 }
3501 
TEST_F(VulkanAPITest,mean_dim_4d)3502 TEST_F(VulkanAPITest, mean_dim_4d) {
3503   test_mean_dim({7, 9, 6, 5}, {-1});
3504   test_mean_dim({6, 5, 7, 9}, {-2});
3505   test_mean_dim({6, 5, 7, 9}, {-3});
3506   test_mean_dim({6, 5, 7, 9}, {-4});
3507 
3508   test_mean_dim({10, 7, 5, 6}, {0, 1});
3509   test_mean_dim({10, 7, 5, 6}, {0, 2});
3510   test_mean_dim({10, 7, 5, 6}, {0, 3});
3511   test_mean_dim({10, 7, 5, 6}, {1, 2});
3512   test_mean_dim({10, 7, 5, 6}, {1, 3});
3513   test_mean_dim({10, 7, 5, 6}, {2, 3});
3514   test_mean_dim({10, 7, 5, 6}, {-2, -4});
3515 
3516   test_mean_dim({10, 7, 5, 6}, {0, 1, 2});
3517   test_mean_dim({10, 7, 5, 6}, {0, 1, 3});
3518   test_mean_dim({10, 7, 5, 6}, {0, 2, 3});
3519   test_mean_dim({10, 7, 5, 6}, {3, 2, 1});
3520   test_mean_dim({10, 7, 5, 6}, {3, -2, 1});
3521   test_mean_dim({10, 7, 5, 6}, {-3, -2, -1});
3522 }
3523 
TEST_F(VulkanAPITest,mean_dim_keepdim_2d)3524 TEST_F(VulkanAPITest, mean_dim_keepdim_2d) {
3525   test_mean_dim({5, 7}, {-1}, true);
3526   test_mean_dim({5, 7}, {-2}, true);
3527 }
3528 
TEST_F(VulkanAPITest,mean_dim_keepdim_3d)3529 TEST_F(VulkanAPITest, mean_dim_keepdim_3d) {
3530   test_mean_dim({9, 5, 7}, {-1}, true);
3531   test_mean_dim({5, 9, 7}, {-2}, true);
3532   test_mean_dim({7, 9, 5}, {-3}, true);
3533 
3534   test_mean_dim({9, 5, 7}, {0, 1}, true);
3535   test_mean_dim({5, 9, 7}, {0, 2}, true);
3536   test_mean_dim({7, 9, 5}, {1, 2}, true);
3537 }
3538 
TEST_F(VulkanAPITest,mean_dim_keepdim_4d)3539 TEST_F(VulkanAPITest, mean_dim_keepdim_4d) {
3540   test_mean_dim({9, 5, 7, 11}, {-1}, true);
3541   test_mean_dim({5, 9, 11, 7}, {-2}, true);
3542   test_mean_dim({7, 11, 9, 5}, {-3}, true);
3543   test_mean_dim({11, 7, 9, 5}, {-4}, true);
3544 
3545   test_mean_dim({9, 5, 7, 11}, {0, 1}, true);
3546   test_mean_dim({5, 9, 11, 7}, {0, 2}, true);
3547   test_mean_dim({7, 11, 9, 5}, {0, 3}, true);
3548   test_mean_dim({11, 7, 9, 5}, {1, 2}, true);
3549   test_mean_dim({9, 5, 7, 11}, {1, 3}, true);
3550   test_mean_dim({5, 9, 11, 7}, {2, 3}, true);
3551 
3552   test_mean_dim({7, 11, 9, 5}, {-1, -2, -3}, true);
3553   test_mean_dim({11, 7, 9, 5}, {-1, -2, -4}, true);
3554   test_mean_dim({9, 5, 7, 11}, {-2, -3, -4}, true);
3555 }
3556 
TEST_F(VulkanAPITest,mm)3557 TEST_F(VulkanAPITest, mm) {
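  // Matrix product of a [179, 67] and a [67, 163] matrix; the inner
  // dimensions must match and the result has shape [179, 163].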
3558   const auto m1_cpu = at::rand({179, 67}, at::device(at::kCPU).dtype(at::kFloat));
3559   const auto m2_cpu = at::rand({67, 163}, at::device(at::kCPU).dtype(at::kFloat));
3560   const auto out_cpu = m1_cpu.mm(m2_cpu);
3561 
3562   const auto m1_vulkan = m1_cpu.vulkan();
3563   const auto out_vulkan = m1_vulkan.mm(m2_cpu);
3564 
3565   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3566   if (!check) {
3567     showRtol(out_cpu, out_vulkan.cpu());
3568   }
3569 
3570   ASSERT_TRUE(check);
3571 }
3572 
TEST_F(VulkanAPITest,mm_m2_is_variable)3573 TEST_F(VulkanAPITest, mm_m2_is_variable) {
3574   int n = 19;
3575   int p = 25;
3576   int m = 21;
3577   const auto m1_cpu = at::rand({n, p}, at::device(at::kCPU).dtype(at::kFloat));
3578   const auto m2_cpu = at::rand({p, m}, at::device(at::kCPU).dtype(at::kFloat));
3579 
3580   const auto out_cpu = m1_cpu.mm(m2_cpu);
3581 
3582   const auto m1_vulkan = m1_cpu.vulkan();
3583   const auto m2_vulkan = m2_cpu.vulkan();
3584 
3585   const auto out_vulkan = m1_vulkan.mm(m2_vulkan);
3586   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3587   if (!check) {
3588     showRtol(out_cpu, out_vulkan.cpu());
3589   }
3590 
3591   ASSERT_TRUE(check);
3592 }
3593 
TEST_F(VulkanAPITest,mm_m1_m2_variable)3594 TEST_F(VulkanAPITest, mm_m1_m2_variable) {
3595   int n = 19;
3596   int p = 25;
3597   int m = 21;
3598   const auto m1_cpu = at::rand({n, p}, at::device(at::kCPU).dtype(at::kFloat));
3599   const auto m2_cpu = at::rand({p, m}, at::device(at::kCPU).dtype(at::kFloat));
3600 
3601   const auto out_cpu = at::mm(m1_cpu, m2_cpu);
3602 
3603   const auto m1_vulkan = m1_cpu.vulkan();
3604   const auto m2_vulkan = m2_cpu.vulkan();
3605 
3606   const auto out_vulkan = at::mm(m1_vulkan, m2_vulkan);
3607   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3608   if (!check) {
3609     showRtol(out_cpu, out_vulkan.cpu());
3610   }
3611 
3612   ASSERT_TRUE(check);
3613 }
3614 
TEST_F(VulkanAPITest,mm_error)3615 TEST_F(VulkanAPITest, mm_error) {
3616   // mismatched dimensions of m1 and m2.
3617   const auto m1_cpu = at::rand({179, 99}, at::device(at::kCPU).dtype(at::kFloat));
3618   const auto m2_cpu = at::rand({67, 163}, at::device(at::kCPU).dtype(at::kFloat));
3619   const auto m1_vulkan = m1_cpu.vulkan();
3620 
3621   EXPECT_THROW(m1_vulkan.mm(m2_cpu), ::std::exception);
3622 }
3623 
test_mul(const at::IntArrayRef input_shape,const at::IntArrayRef other_shape)3624 void test_mul(const at::IntArrayRef input_shape, const at::IntArrayRef other_shape) {
3625   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
3626   const auto other_cpu = at::rand(other_shape, at::device(at::kCPU).dtype(at::kFloat));
3627 
3628   const auto in_vulkan = in_cpu.vulkan();
3629   const auto other_vulkan = other_cpu.vulkan();
3630 
3631   const auto out_cpu = at::mul(in_cpu, other_cpu);
3632   const auto out_vulkan = at::mul(in_vulkan, other_vulkan);
3633 
3634   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3635   if (!check) {
3636     showRtol(out_cpu, out_vulkan.cpu());
3637   }
3638 
3639   ASSERT_TRUE(check);
3640 }
3641 
TEST_F(VulkanAPITest,mul)3642 TEST_F(VulkanAPITest, mul) {
3643   test_mul({11, 7, 139, 109}, {11, 7, 139, 109});
3644 }
3645 
TEST_F(VulkanAPITest,mul_broadcast0)3646 TEST_F(VulkanAPITest, mul_broadcast0) {
3647   test_mul({3, 5, 1, 1}, {3, 5, 179, 221});
3648 }
3649 
TEST_F(VulkanAPITest,mul_broadcast1)3650 TEST_F(VulkanAPITest, mul_broadcast1) {
3651   test_mul({3, 5, 179, 221}, {3, 5, 1, 221});
3652 }
3653 
TEST_F(VulkanAPITest,mul_broadcast2)3654 TEST_F(VulkanAPITest, mul_broadcast2) {
3655   test_mul({3, 4, 179, 221}, {4, 1, 1});
3656 }
3657 
TEST_F(VulkanAPITest,mul_broadcast3)3658 TEST_F(VulkanAPITest, mul_broadcast3) {
3659   test_mul({3, 4, 179, 221}, {1, 1, 179, 221});
3660 }
3661 
TEST_F(VulkanAPITest,mul_broadcast4)3662 TEST_F(VulkanAPITest, mul_broadcast4) {
3663   test_mul({3, 4, 179, 1}, {1, 179, 221});
3664 }
3665 
TEST_F(VulkanAPITest,mul_broadcast5)3666 TEST_F(VulkanAPITest, mul_broadcast5) {
3667   test_mul({2, 1, 7, 1}, {1, 5, 1, 4});
3668 }
3669 
TEST_F(VulkanAPITest,mul_broadcast6)3670 TEST_F(VulkanAPITest, mul_broadcast6) {
3671   test_mul({1, 15, 5, 4}, {21, 1, 5, 4});
3672 }
3673 
TEST_F(VulkanAPITest,mul_zero_dim)3674 TEST_F(VulkanAPITest, mul_zero_dim) {
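  // An empty shape ({}) produces a zero-dimensional tensor, which broadcasts
  // against the other operand like a scalar.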
3675   test_mul({1, 15, 5, 4}, {});
3676 }
3677 
TEST_F(VulkanAPITest,mul_)3678 TEST_F(VulkanAPITest, mul_) {
3679   auto a_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
3680   auto a_vulkan = a_cpu.vulkan();
3681 
3682   const auto b_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
3683   const auto b_vulkan = b_cpu.vulkan();
3684 
3685   a_cpu.mul_(b_cpu);
3686   a_vulkan.mul_(b_vulkan);
3687 
3688   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
3689   if (!check) {
3690     showRtol(b_cpu, b_vulkan.cpu());
3691   }
3692 
3693   ASSERT_TRUE(check);
3694 }
3695 
TEST_F(VulkanAPITest,mul_broadcast0_)3696 TEST_F(VulkanAPITest, mul_broadcast0_) {
3697   auto a_cpu = at::rand({12, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
3698   auto a_vulkan = a_cpu.vulkan();
3699 
3700   const auto b_cpu = at::rand({12, 17, 29, 1}, at::device(at::kCPU).dtype(at::kFloat));
3701   const auto b_vulkan = b_cpu.vulkan();
3702 
3703   a_cpu.mul_(b_cpu);
3704   a_vulkan.mul_(b_vulkan);
3705 
3706   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
3707   if (!check) {
3708     showRtol(b_cpu, b_vulkan.cpu());
3709   }
3710 
3711   ASSERT_TRUE(check);
3712 }
3713 
TEST_F(VulkanAPITest,mul_broadcast1_)3714 TEST_F(VulkanAPITest, mul_broadcast1_) {
3715   auto a_cpu = at::rand({3, 8, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
3716   auto a_vulkan = a_cpu.vulkan();
3717 
3718   const auto b_cpu = at::rand({8, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
3719   const auto b_vulkan = b_cpu.vulkan();
3720 
3721   a_cpu.mul_(b_cpu);
3722   a_vulkan.mul_(b_vulkan);
3723 
3724   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
3725   if (!check) {
3726     showRtol(b_cpu, b_vulkan.cpu());
3727   }
3728 
3729   ASSERT_TRUE(check);
3730 }
3731 
TEST_F(VulkanAPITest,mul_scalar)3732 TEST_F(VulkanAPITest, mul_scalar) {
3733   const auto a_cpu = at::rand({17, 213, 213, 7}, at::device(at::kCPU).dtype(at::kFloat));
3734   const auto a_vulkan = a_cpu.vulkan();
3735 
3736   const float b_scalar = 3.1415f;
3737 
3738   const auto c_cpu = at::mul(a_cpu, b_scalar);
3739   const auto c_vulkan = at::mul(a_vulkan, b_scalar);
3740 
3741   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
3742   if (!check) {
3743     showRtol(c_cpu, c_vulkan.cpu());
3744   }
3745 
3746   ASSERT_TRUE(check);
3747 }
3748 
TEST_F(VulkanAPITest,mul_scalar_)3749 TEST_F(VulkanAPITest, mul_scalar_) {
3750   auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
3751   auto a_vulkan = a_cpu.vulkan();
3752 
3753   const float b_scalar = 3.1415f;
3754 
3755   a_cpu.mul_(b_scalar);
3756   a_vulkan.mul_(b_scalar);
3757 
3758   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
3759   if (!check) {
3760     showRtol(a_cpu, a_vulkan.cpu());
3761   }
3762 
3763   ASSERT_TRUE(check);
3764 }
3765 
TEST_F(VulkanAPITest,mul_scalar_wrapped)3766 TEST_F(VulkanAPITest, mul_scalar_wrapped) {
3767   if (!at::is_vulkan_available()) {
3768     return;
3769   }
3770 
3771   const auto a_cpu = at::rand({17, 213, 213, 7}, at::device(at::kCPU).dtype(at::kFloat));
3772   const auto a_vulkan = a_cpu.vulkan();
3773 
3774   const auto b_scalar = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));
3775 
3776   const auto c_cpu = at::mul(a_cpu, b_scalar);
3777   const auto c_vulkan = at::mul(a_vulkan, b_scalar);
3778 
3779   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
3780   if (!check) {
3781     showRtol(c_cpu, c_vulkan.cpu());
3782   }
3783 
3784   ASSERT_TRUE(check);
3785 }
3786 
TEST_F(VulkanAPITest,mul_scalar_wrapped_)3787 TEST_F(VulkanAPITest, mul_scalar_wrapped_) {
3788   if (!at::is_vulkan_available()) {
3789     return;
3790   }
3791 
3792   auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
3793   auto a_vulkan = a_cpu.vulkan();
3794 
3795   const auto b_scalar = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));
3796 
3797   a_cpu.mul_(b_scalar);
3798   a_vulkan.mul_(b_scalar);
3799 
3800   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
3801   if (!check) {
3802     showRtol(a_cpu, a_vulkan.cpu());
3803   }
3804 
3805   ASSERT_TRUE(check);
3806 }
3807 
TEST_F(VulkanAPITest,mul_to_scalar_wrapped)3808 TEST_F(VulkanAPITest, mul_to_scalar_wrapped) {
3809   if (!at::is_vulkan_available()) {
3810     return;
3811   }
3812 
3813   const auto a = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));
3814 
3815   const auto b_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
3816   const auto b_vulkan = b_cpu.vulkan();
3817 
3818   const auto c_cpu = at::mul(a, b_cpu);
3819   const auto c_vulkan = at::mul(a, b_vulkan);
3820 
3821   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
3822   if (!check) {
3823     showRtol(c_cpu, c_vulkan.cpu());
3824   }
3825 
3826   ASSERT_TRUE(check);
3827 }
3828 
test_pow(const at::IntArrayRef input_shape,const at::IntArrayRef other_shape)3829 void test_pow(const at::IntArrayRef input_shape, const at::IntArrayRef other_shape) {
3830   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
3831   const auto other_cpu = at::rand(other_shape, at::device(at::kCPU).dtype(at::kFloat));
3832 
3833   const auto in_vulkan = in_cpu.vulkan();
3834   const auto other_vulkan = other_cpu.vulkan();
3835 
3836   const auto out_cpu = at::pow(in_cpu, other_cpu);
3837   const auto out_vulkan = at::pow(in_vulkan, other_vulkan);
3838 
3839   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3840   if (!check) {
3841     showRtol(out_cpu, out_vulkan.cpu());
3842     std::cout << "pow test failed with input shape: "
3843               << input_shape << " and other shape: " << other_shape << std::endl;
3844   }
3845 
3846   ASSERT_TRUE(check);
3847 }
3848 
TEST_F(VulkanAPITest,pow)3849 TEST_F(VulkanAPITest, pow) {
3850   test_pow({4}, {4});
3851   test_pow({4, 2}, {4, 2});
3852   test_pow({11, 7, 9}, {11, 7, 9});
3853   test_pow({3, 11, 9, 7}, {3, 11, 9, 7});
3854 }
3855 
TEST_F(VulkanAPITest,pow_broadcast)3856 TEST_F(VulkanAPITest, pow_broadcast) {
3857   // broadcast input
3858   test_pow({1}, {3});
3859   test_pow({1, 1}, {3, 2});
3860   test_pow({2, 1, 3}, {2, 2, 5, 3});
3861   test_pow({1, 1, 4}, {4, 8, 5, 4}); // mul4ch
3862   test_pow({3, 7, 1, 4}, {3, 7, 9, 4});
3863 
3864   // broadcast other
3865   test_pow({3}, {1});
3866   test_pow({3, 2}, {1, 2});
3867   test_pow({2, 2, 5, 3}, {2, 1, 3});
3868   test_pow({3, 7, 9, 4}, {3, 7, 1, 4});
3869   test_pow({3, 8, 2, 5}, {1, 1, 2, 5}); // mul4ch
3870 
3871   // broadcast both
3872   test_pow({2, 1, 2}, {1, 5, 1});
3873   test_pow({5, 1, 4}, {7, 1, 2, 1});
3874   test_pow({2, 1, 7, 1}, {1, 5, 1, 4});
3875   test_pow({1, 15, 5, 4}, {21, 1, 5, 4});
3876   test_pow({1, 1, 5, 5}, {8, 8, 1, 1}); // mul4ch
3877 }
3878 
TEST_F(VulkanAPITest,pow_zero_dim)3879 TEST_F(VulkanAPITest, pow_zero_dim) {
3880   test_pow({1, 15, 5, 4}, {});
3881 }
3882 
test_pow_(const at::IntArrayRef input_shape,const at::IntArrayRef other_shape)3883 void test_pow_(const at::IntArrayRef input_shape, const at::IntArrayRef other_shape) {
3884   const auto cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
3885   const auto other_cpu = at::rand(other_shape, at::device(at::kCPU).dtype(at::kFloat));
3886 
3887   const auto vulkan = cpu.vulkan();
3888   const auto other_vulkan = other_cpu.vulkan();
3889 
3890   cpu.pow_(other_cpu);
3891   vulkan.pow_(other_vulkan);
3892 
3893   const auto check = almostEqual(cpu, vulkan.cpu());
3894   if (!check) {
3895     showRtol(cpu, vulkan.cpu());
3896     std::cout << "pow_ test failed with input shape: "
3897               << input_shape << " and other shape: " << other_shape << std::endl;
3898   }
3899 
3900   ASSERT_TRUE(check);
3901 }
3902 
TEST_F(VulkanAPITest,pow_)3903 TEST_F(VulkanAPITest, pow_) {
3904   test_pow_({4}, {4});
3905   test_pow_({4, 2}, {4, 2});
3906   test_pow_({11, 7, 9}, {11, 7, 9});
3907   test_pow_({3, 11, 9, 7}, {3, 11, 9, 7});
3908 }
3909 
TEST_F(VulkanAPITest,pow_broadcast_other_)3910 TEST_F(VulkanAPITest, pow_broadcast_other_) {
3911   test_pow_({3}, {1});
3912   test_pow_({3, 2}, {1, 2});
3913   test_pow_({2, 2, 5, 3}, {2, 1, 3});
3914   test_pow_({3, 7, 9, 4}, {3, 7, 1, 4});
3915 }
3916 
test_pow_tensor_scalar(const at::IntArrayRef input_shape,const float exp)3917 void test_pow_tensor_scalar(const at::IntArrayRef input_shape, const float exp) {
3918   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
3919   const auto in_vulkan = in_cpu.vulkan();
3920 
3921   const auto out_cpu = at::pow(in_cpu, exp);
3922   const auto out_vulkan = at::pow(in_vulkan, exp);
3923 
3924   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3925   if (!check) {
3926     showRtol(out_cpu, out_vulkan.cpu());
3927     std::cout << "pow_tensor_scalar test failed with input shape: "
3928               << input_shape << std::endl;
3929   }
3930 
3931   ASSERT_TRUE(check);
3932 }
3933 
TEST_F(VulkanAPITest,pow_tensor_scalar)3934 TEST_F(VulkanAPITest, pow_tensor_scalar) {
3935   test_pow_tensor_scalar({4}, 2.5);             // 1d
3936   test_pow_tensor_scalar({4, 2}, -1);           // 2d
3937   test_pow_tensor_scalar({11, 7, 9}, 7.7);      // 3d
3938   test_pow_tensor_scalar({3, 11, 9, 7}, -0.03); // 4d
3939 }
3940 
test_pow_tensor_scalar_(const at::IntArrayRef input_shape,const float exp)3941 void test_pow_tensor_scalar_(const at::IntArrayRef input_shape, const float exp) {
3942   // Inputs must be non-zero: a zero base with a negative exponent yields inf,
3943   // which cannot be compared.
3943   const auto cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
3944   const auto vulkan = cpu.vulkan();
3945 
3946   cpu.pow_(exp);
3947   vulkan.pow_(exp);
3948 
3949   const auto check = almostEqual(cpu, vulkan.cpu());
3950   if (!check) {
3951     showRtol(cpu, vulkan.cpu());
3952     std::cout << "pow_tensor_scalar_ test failed with input shape: "
3953               << input_shape << std::endl;
3954   }
3955 
3956   ASSERT_TRUE(check);
3957 }
3958 
TEST_F(VulkanAPITest,pow_tensor_scalar_)3959 TEST_F(VulkanAPITest, pow_tensor_scalar_) {
3960   test_pow_tensor_scalar_({4}, 2.5);             // 1d
3961   test_pow_tensor_scalar_({4, 2}, -1);           // 2d
3962   test_pow_tensor_scalar_({11, 7, 9}, 7.7);      // 3d
3963   test_pow_tensor_scalar_({3, 11, 9, 7}, -0.03); // 4d
3964 }
3965 
test_pow_scalar_tensor(const float base,const at::IntArrayRef other)3966 void test_pow_scalar_tensor(const float base, const at::IntArrayRef other) {
3967   const auto other_cpu = at::rand(other, at::device(at::kCPU).dtype(at::kFloat));
3968   const auto other_vulkan = other_cpu.vulkan();
3969 
3970   const auto out_cpu = at::pow(base, other_cpu);
3971   const auto out_vulkan = at::pow(base, other_vulkan);
3972 
3973   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
3974   if (!check) {
3975     showRtol(out_cpu, out_vulkan.cpu());
3976     std::cout << "pow_scalar_tensor test failed with other shape: "
3977               << other << std::endl;
3978   }
3979 
3980   ASSERT_TRUE(check);
3981 }
3982 
TEST_F(VulkanAPITest,pow_scalar_tensor)3983 TEST_F(VulkanAPITest, pow_scalar_tensor) {
3984   test_pow_scalar_tensor(2.5, {4});             // 1d
3985   test_pow_scalar_tensor(2, {4, 2});            // 2d
3986   test_pow_scalar_tensor(7.7, {11, 7, 9});      // 3d
3987   test_pow_scalar_tensor(3, {3, 11, 9, 7});     // 4d
3988 }
3989 
test_floor_divide_scalar(const at::IntArrayRef input_shape,float input_scale,float other)3990 void test_floor_divide_scalar(const at::IntArrayRef input_shape, float input_scale, float other) {
3991   c10::InferenceMode mode;
3992 
3993   auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
3994   in_cpu = at::mul(in_cpu, input_scale);
3995 
3996   auto in_vulkan = in_cpu.vulkan();
3997   auto out_vk = at::floor_divide(in_vulkan, other);
3998   auto out_cpu = at::floor_divide(in_cpu, other);
3999 
4000   // max tolerance is 1.0 due to floor.
4001   // We may consider adding an extra check on the number of violations; they should be rare.
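  // For example, if the exact quotient is 2.9999 and the fp16 Vulkan result
  // rounds up to 3.0, the floored outputs differ by exactly 1.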
4002   const auto check = checkRtol(out_cpu - out_vk.cpu(), 1.0f);
4003   if (!check) {
4004     std::cout << "floor_divide test failed with "
4005               << "scale: " << input_scale
4006               << " other: " << other
4007               << std::endl;
4008   }
4009 
4010   ASSERT_TRUE(check);
4011 }
4012 
TEST_F(VulkanAPITest,floor_divide_scalar)4013 TEST_F(VulkanAPITest, floor_divide_scalar) {
4014   test_floor_divide_scalar({3, 3, 12, 12}, 100.0, 10.0);
4015   test_floor_divide_scalar({12, 12}, 10.0, 3.4);
4016   test_floor_divide_scalar({4, 5, 12, 12}, 100.0, 10.0);
4017   test_floor_divide_scalar({3, 3, 12, 12}, 0.3, 0.08);
4018 }
4019 
TEST_F(VulkanAPITest,floor_divide_scalar_error)4020 TEST_F(VulkanAPITest, floor_divide_scalar_error) {
4021   c10::InferenceMode mode;
4022 
4023   auto in_cpu = at::rand({2, 3, 4}, at::device(at::kCPU).dtype(at::kFloat));
4024   auto in_vulkan = in_cpu.vulkan();
4025   EXPECT_THROW(at::floor_divide(in_vulkan, 0.0f), ::std::exception);
4026 }
4027 
test_floor_divide_scalar_inplace(const at::IntArrayRef input_shape,float input_scale,float other)4028 void test_floor_divide_scalar_inplace(const at::IntArrayRef input_shape, float input_scale, float other) {
4029   c10::InferenceMode mode;
4030 
4031   auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
4032   in_cpu = at::mul(in_cpu, input_scale);
4033   auto in_vk = in_cpu.vulkan();
4034 
4035   in_cpu.floor_divide_(other);
4036   in_vk.floor_divide_(other);
4037 
4038   // max tolerance is 1.0 due to floor.
4039   // We may consider adding an extra check on the number of violations; they should be rare.
4040   const auto check = checkRtol(in_cpu - in_vk.cpu(), 1.0f);
4041   if (!check) {
4042     std::cout << "floor_divide test failed with "
4043               << "scale: " << input_scale
4044               << " other: " << other
4045               << std::endl;
4046   }
4047 
4048   ASSERT_TRUE(check);
4049 }
4050 
TEST_F(VulkanAPITest,floor_divide_scalar_inplace_error)4051 TEST_F(VulkanAPITest, floor_divide_scalar_inplace_error) {
4052   c10::InferenceMode mode;
4053 
4054   auto in_cpu = at::rand({2, 3, 4}, at::device(at::kCPU).dtype(at::kFloat));
4055   auto in_vulkan = in_cpu.vulkan();
4056   EXPECT_THROW(in_vulkan.floor_divide_(0.0f), ::std::exception);
4057 }
4058 
TEST_F(VulkanAPITest,floor_divide_scalar_inplace)4059 TEST_F(VulkanAPITest, floor_divide_scalar_inplace) {
4060   test_floor_divide_scalar_inplace({3, 3, 12, 12}, 100.0, 10.0);
4061   test_floor_divide_scalar_inplace({12, 12}, 10.0, 3.4);
4062   test_floor_divide_scalar_inplace({4, 5, 12, 12}, 100.0, 10.0);
4063   test_floor_divide_scalar_inplace({3, 3, 12, 12}, 0.3, 0.08);
4064 }
4065 
TEST_F(VulkanAPITest,floor_divide_zero_dim_tensor)4066 TEST_F(VulkanAPITest, floor_divide_zero_dim_tensor) {
4067   c10::InferenceMode mode;
4068 
4069   std::vector<int64_t> input_shape{5, 3, 4, 5};
4070   float input_scale = 100.0;
4071 
4072   auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
4073   in_cpu = at::mul(in_cpu, input_scale);
4074   auto in_vk = in_cpu.vulkan();
4075 
4076   auto other_cpu = at::zeros({}, at::device(at::kCPU).dtype(at::kFloat)) + 10.0f;
4077   auto other_vk = other_cpu.vulkan();
4078 
4079   auto out_cpu = at::floor_divide(in_cpu, other_cpu);
4080   auto out_vk = at::floor_divide(in_vk, other_vk);
4081 
4082   // max tolerance is 1.0 due to floor.
4083   // We may consider adding an extra check on the number of violations; they should be rare.
4084   const auto check = checkRtol(out_cpu - out_vk.cpu(), 1.0f);
4085   if (!check) {
4086     std::cout << "floor_divide test failed with "
4087               << "scale: " << input_scale
4088               << std::endl;
4089   }
4090 
4091   ASSERT_TRUE(check);
4092 }
4093 
TEST_F(VulkanAPITest,floor_divide_tensor)4094 TEST_F(VulkanAPITest, floor_divide_tensor) {
4095   c10::InferenceMode mode;
4096 
4097   std::vector<int64_t> input_shape{6, 3, 5, 5};
4098   float input_scale = 10.0;
4099 
4100   auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
4101   in_cpu = at::mul(in_cpu, input_scale);
4102   // "other" is at least 0.5 to avoid rounding errors caused by very small
4103   // values.
4104   auto other_cpu =
4105       at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat)) + 0.5;
4106 
4107   auto in_vk = in_cpu.vulkan();
4108   auto other_vk = other_cpu.vulkan();
4109 
4110   auto out_cpu = at::floor_divide(in_cpu, other_cpu);
4111   auto out_vk = at::floor_divide(in_vk, other_vk);
4112 
4113   // max tolerance is 1.0 due to floor.
4114   // We may consider adding an extra check on the number of violations; they should be rare.
4115   const auto check = checkRtol(out_cpu - out_vk.cpu(), 1.0f);
4116   if (!check) {
4117     std::cout << "floor_divide test failed with "
4118               << "scale: " << input_scale << std::endl;
4119   }
4120 
4121   ASSERT_TRUE(check);
4122 }
4123 
TEST_F(VulkanAPITest,floor_divide_tensor_inplace)4124 TEST_F(VulkanAPITest, floor_divide_tensor_inplace) {
4125   c10::InferenceMode mode;
4126 
4127   std::vector<int64_t> input_shape{5, 3, 5, 5};
4128   float input_scale = 10.0;
4129 
4130   auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
4131   in_cpu = at::mul(in_cpu, input_scale);
4132   // "other" is at least 0.5 to avoid rounding errors caused by very small
4133   // values.
4134   auto other_cpu =
4135       at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat)) + 0.5;
4136 
4137   auto in_vk = in_cpu.vulkan();
4138   auto other_vk = other_cpu.vulkan();
4139 
4140   in_cpu.floor_divide_(other_cpu);
4141   in_vk.floor_divide_(other_vk);
4142 
4143   // max tolerance is 1.0 due to floor.
4144   // We may consider adding an extra check on the number of violations; they should be rare.
4145   const auto check = checkRtol(in_cpu - in_vk.cpu(), 1.0f);
4146   if (!check) {
4147     std::cout << "floor_divide test failed with "
4148               << "scale: " << input_scale << std::endl;
4149   }
4150 
4151   ASSERT_TRUE(check);
4152 }
4153 
TEST_F(VulkanAPITest,relu)4154 TEST_F(VulkanAPITest, relu) {
4155   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
4156   const auto in_vulkan = in_cpu.vulkan();
4157 
4158   const auto out_cpu = at::relu(in_cpu);
4159   const auto out_vulkan = at::relu(in_vulkan);
4160 
4161   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
4162 
4163   if (!check) {
4164     showRtol(out_cpu, out_vulkan.cpu());
4165   }
4166 
4167   ASSERT_TRUE(check);
4168 }
4169 
TEST_F(VulkanAPITest,relu_)4170 TEST_F(VulkanAPITest, relu_) {
4171   auto a_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
4172   auto a_vulkan = a_cpu.vulkan();
4173 
4174   at::relu_(a_cpu);
4175   at::relu_(a_vulkan);
4176 
4177   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
4178 
4179   if (!check) {
4180     showRtol(a_cpu, a_vulkan.cpu());
4181   }
4182 
4183   ASSERT_TRUE(check);
4184 }
4185 
TEST_F(VulkanAPITest,reflection_pad2d)4186 TEST_F(VulkanAPITest, reflection_pad2d) {
4187   const auto a_cpu = at::rand({2, 3, 47, 63}, at::device(at::kCPU).dtype(at::kFloat));
4188   const auto a_vulkan = a_cpu.vulkan();
4189 
4190   const auto out_cpu = at::reflection_pad2d(a_cpu, {9,8,5,12});
4191   const auto out_vulkan = at::reflection_pad2d(a_vulkan, {9,8,5,12}).cpu();
4192 
4193   const auto check = almostEqual(out_cpu, out_vulkan);
4194   if (!check) {
4195     showRtol(out_cpu, out_vulkan);
4196   }
4197 
4198   ASSERT_TRUE(check);
4199 }
4200 
TEST_F(VulkanAPITest,repeat_invalid_inputs_outputs_exceptions)4201 TEST_F(VulkanAPITest, repeat_invalid_inputs_outputs_exceptions) {
4202   // Arrange: Vulkan repeat only supports input of dims <= 4
4203   {
4204     const auto in_cpu =
4205         at::rand({3, 9, 11, 7, 3}, at::device(at::kCPU).dtype(at::kFloat));
4206     const at::IntArrayRef repeats = {5, 7, 3, 9, 2};
4207 
4208     // Act
4209     EXPECT_THROW(
4210         { const auto out_vulkan = in_cpu.vulkan().repeat(repeats); },
4211         ::std::exception);
4212   }
4213 
4214   // Arrange: Number of dimensions of repeat dims can not be smaller than
4215   // number of dimensions of tensor
4216   {
4217     const auto in_cpu =
4218         at::rand({3, 5, 11, 13}, at::device(at::kCPU).dtype(at::kFloat));
4219     const at::IntArrayRef repeats = {5, 7};
4220 
4221     // Act
4222     EXPECT_THROW(
4223         { const auto out_vulkan = in_cpu.vulkan().repeat(repeats); },
4224         ::std::exception);
4225   }
4226 
4227   // Arrange: Vulkan repeat only supports output of dims <= 4
4228   {
4229     const auto in_cpu =
4230         at::rand({3, 9, 11, 7}, at::device(at::kCPU).dtype(at::kFloat));
4231     const at::IntArrayRef repeats = {5, 7, 3, 9, 2};
4232 
4233     // Act
4234     EXPECT_THROW(
4235         { const auto out_vulkan = in_cpu.vulkan().repeat(repeats); },
4236         ::std::exception);
4237   }
4238 }
4239 
test_repeat(const at::IntArrayRef input_shape,const at::IntArrayRef repeats)4240 void test_repeat(
4241     const at::IntArrayRef input_shape,
4242     const at::IntArrayRef repeats) {
4243   c10::InferenceMode mode;
4244 
4245   at::Tensor in_cpu;
4246   at::Tensor out_cpu;
4247   at::Tensor in_vulkan;
4248   at::Tensor out_vulkan;
4249   at::IntArrayRef repeat;
4250   bool check = true;
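  // Exercise every leading slice of input_shape against every repeat vector
  // that has at least as many entries, so the repeat dims always cover the
  // tensor dims.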
4251   for (int idx_input = 1; (unsigned)idx_input < input_shape.size() + 1; ++idx_input) {
4252     for (int idx_repeat = idx_input; (unsigned)idx_repeat < repeats.size() + 1;
4253           ++idx_repeat) {
4254       in_cpu = at::rand(
4255           input_shape.slice(0, idx_input),
4256           at::device(at::kCPU).dtype(at::kFloat));
4257       repeat = repeats.slice(0, idx_repeat);
4258       out_cpu = in_cpu.repeat(repeat);
4259       in_vulkan = in_cpu.vulkan();
4260       out_vulkan = in_vulkan.repeat(repeat);
4261       bool local_check = almostEqual(out_cpu, out_vulkan.cpu());
4262       if (!local_check) {
4263         check = false;
4264         std::cout << "Repeat test failed when input is of shape "
4265                   << input_shape.slice(0, idx_input) << " and repeat of "
4266                   << repeat << std::endl;
4267         showRtol(out_cpu, out_vulkan.cpu());
4268       }
4269     }
4270   }
4271 
4272   ASSERT_TRUE(check);
4273 }
4274 
TEST_F(VulkanAPITest,repeat)4275 TEST_F(VulkanAPITest, repeat) {
4276   test_repeat({13, 5, 13, 7}, {7, 2, 3, 5});
4277 }
4278 
TEST_F(VulkanAPITest,replication_pad2d)4279 TEST_F(VulkanAPITest, replication_pad2d) {
4280   const auto a_cpu = at::rand({2, 3, 47, 63}, at::device(at::kCPU).dtype(at::kFloat));
4281   const auto a_vulkan = a_cpu.vulkan();
4282 
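  // Padding order for pad2d: {left, right, top, bottom}.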
4283   constexpr std::array<int64_t, 4u> padding_params{9, 8, 5, 12};
4284 
4285   const auto out_cpu = at::replication_pad2d(a_cpu, padding_params);
4286   const auto out_vulkan = at::replication_pad2d(a_vulkan, padding_params).cpu();
4287 
4288   const auto check = almostEqual(out_cpu, out_vulkan);
4289   if (!check) {
4290     showRtol(out_cpu, out_vulkan);
4291   }
4292 
4293   ASSERT_TRUE(check);
4294 }
4295 
TEST_F(VulkanAPITest,reshape)4296 TEST_F(VulkanAPITest, reshape) {
4297   c10::InferenceMode mode;
4298 
4299   const auto in_cpu = at::rand({7, 11, 8, 9}, at::device(at::kCPU).dtype(at::kFloat));
4300   const auto in_vulkan = in_cpu.vulkan();
4301 
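  // 7 * 11 * 8 * 9 = 5544 elements are rearranged into a 56 x 99 matrix; the
  // element count must be preserved by reshape.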
4302   const std::array<int64_t, 2> shape{7 * 8, 11 * 9};
4303 
4304   const auto out_cpu = at::reshape(in_cpu, shape);
4305   const auto out_vulkan = at::reshape(in_vulkan, shape);
4306 
4307   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
4308   if (!check) {
4309     showRtol(out_cpu, out_vulkan.cpu());
4310   }
4311 
4312   ASSERT_TRUE(check);
4313 }
4314 
TEST_F(VulkanAPITest,reshape_)4315 TEST_F(VulkanAPITest, reshape_) {
4316   c10::InferenceMode mode;
4317 
4318   const auto cpu = at::rand({9, 4, 12, 6}, at::device(at::kCPU).dtype(at::kFloat));
4319   const auto vulkan = cpu.vulkan();
4320 
4321   const std::array<int64_t, 3> shape{9, 4 * 6, 12};
4322 
4323   const auto out_cpu = cpu.reshape(shape);
4324   const auto out_vulkan = vulkan.reshape(shape);
4325 
4326   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
4327   if (!check) {
4328     showRtol(out_cpu, out_vulkan.cpu());
4329   }
4330 
4331   ASSERT_TRUE(check);
4332 }
4333 
test_select(const at::IntArrayRef input_shape,int64_t dim,int64_t index)4334 void test_select(const at::IntArrayRef input_shape, int64_t dim, int64_t index) {
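  // at::select(input, dim, index) slices along `dim` at `index`, returning a
  // tensor with that dimension removed.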
4335   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
4336   const auto out_cpu = at::select(in_cpu, dim, index);
4337 
4338   const auto in_vulkan = in_cpu.vulkan();
4339   const auto out_vulkan = at::select(in_vulkan, dim, index);
4340 
4341   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
4342   if (!check) {
4343     showRtol(out_cpu, out_vulkan.cpu());
4344   }
4345 
4346   ASSERT_TRUE(check);
4347 }
4348 
TEST_F(VulkanAPITest,select_3d_depth_small)4349 TEST_F(VulkanAPITest, select_3d_depth_small) {
4350   test_select({1, 1, 1}, 0, 0);
4351 }
4352 
TEST_F(VulkanAPITest,select_3d_depth_medium)4353 TEST_F(VulkanAPITest, select_3d_depth_medium) {
4354   test_select({3, 2, 5}, 0, 2);
4355 }
4356 
TEST_F(VulkanAPITest,select_3d_depth_large)4357 TEST_F(VulkanAPITest, select_3d_depth_large) {
4358   test_select({100, 1, 144}, 0, 50);
4359 }
4360 
TEST_F(VulkanAPITest,select_3d_height_small)4361 TEST_F(VulkanAPITest, select_3d_height_small) {
4362   test_select({1, 1, 1}, 1, 0);
4363 }
4364 
TEST_F(VulkanAPITest,select_3d_height_medium)4365 TEST_F(VulkanAPITest, select_3d_height_medium) {
4366   test_select({3, 5, 2}, 1, 2);
4367 }
4368 
TEST_F(VulkanAPITest,select_3d_height_medium1)4369 TEST_F(VulkanAPITest, select_3d_height_medium1) {
4370   test_select({16, 16, 5}, 1, 6);
4371 }
4372 
TEST_F(VulkanAPITest,select_3d_height_medium2)4373 TEST_F(VulkanAPITest, select_3d_height_medium2) {
4374   test_select({17, 17, 5}, 1, 6);
4375 }
4376 
TEST_F(VulkanAPITest,select_3d_height_large)4377 TEST_F(VulkanAPITest, select_3d_height_large) {
4378   test_select({100, 144, 5}, 1, 50);
4379 }
4380 
TEST_F(VulkanAPITest,select_3d_width_small)4381 TEST_F(VulkanAPITest, select_3d_width_small) {
4382   test_select({1, 1, 1}, 2, 0);
4383 }
4384 
TEST_F(VulkanAPITest,select_3d_width_medium)4385 TEST_F(VulkanAPITest, select_3d_width_medium) {
4386   test_select({3, 5, 3}, 2, 2);
4387 }
4388 
TEST_F(VulkanAPITest,select_3d_width_medium2)4389 TEST_F(VulkanAPITest, select_3d_width_medium2) {
4390   test_select({17, 17, 8}, 2, 6);
4391 }
4392 
TEST_F(VulkanAPITest,select_3d_width_large)4393 TEST_F(VulkanAPITest, select_3d_width_large) {
4394   test_select({100, 3, 144}, 2, 50);
4395 }
4396 
TEST_F(VulkanAPITest,select_4d_batch_small)4397 TEST_F(VulkanAPITest, select_4d_batch_small) {
4398   test_select({1, 1, 1, 1}, 0, 0);
4399 }
4400 
TEST_F(VulkanAPITest,select_4d_batch_medium)4401 TEST_F(VulkanAPITest, select_4d_batch_medium) {
4402   test_select({3, 2, 5, 4}, 0, 1);
4403 }
4404 
TEST_F(VulkanAPITest,select_4d_batch_large)4405 TEST_F(VulkanAPITest, select_4d_batch_large) {
4406   test_select({30, 8, 12, 17}, 0, 27);
4407 }
4408 
TEST_F(VulkanAPITest,select_4d_depth_small)4409 TEST_F(VulkanAPITest, select_4d_depth_small) {
4410   test_select({1, 1, 1, 1}, 1, 0);
4411 }
4412 
TEST_F(VulkanAPITest,select_4d_depth_medium)4413 TEST_F(VulkanAPITest, select_4d_depth_medium) {
4414   test_select({7, 5, 2, 4}, 1, 4);
4415 }
4416 
TEST_F(VulkanAPITest,select_4d_depth_large)4417 TEST_F(VulkanAPITest, select_4d_depth_large) {
4418   test_select({5, 30, 12, 30}, 1, 23);
4419 }
4420 
TEST_F(VulkanAPITest,select_4d_height_small)4421 TEST_F(VulkanAPITest, select_4d_height_small) {
4422   test_select({1, 1, 1, 1}, 2, 0);
4423 }
4424 
TEST_F(VulkanAPITest,select_4d_height_medium)4425 TEST_F(VulkanAPITest, select_4d_height_medium) {
4426   test_select({3, 5, 4, 2}, 2, 3);
4427 }
4428 
TEST_F(VulkanAPITest,select_4d_height_large)4429 TEST_F(VulkanAPITest, select_4d_height_large) {
4430   test_select({5, 8, 50, 50}, 2, 41);
4431 }
4432 
TEST_F(VulkanAPITest,select_4d_width_small)4433 TEST_F(VulkanAPITest, select_4d_width_small) {
4434   test_select({1, 1, 1, 1}, 3, 0);
4435 }
4436 
TEST_F(VulkanAPITest,select_4d_width_medium)4437 TEST_F(VulkanAPITest, select_4d_width_medium) {
4438   test_select({3, 5, 4, 2}, 3, 1);
4439 }
4440 
TEST_F(VulkanAPITest,select_4d_width_large)4441 TEST_F(VulkanAPITest, select_4d_width_large) {
4442   test_select({5, 8, 50, 50}, 3, 33);
4443 }
4444 
TEST_F(VulkanAPITest,sigmoid)4445 TEST_F(VulkanAPITest, sigmoid) {
4446   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
4447   const auto in_vulkan = in_cpu.vulkan();
4448 
4449   const auto out_cpu = at::sigmoid(in_cpu);
4450   const auto out_vulkan = at::sigmoid(in_vulkan);
4451 
4452   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
4453   if (!check) {
4454     showRtol(out_cpu, out_vulkan.cpu());
4455   }
4456 
4457   ASSERT_TRUE(check);
4458 }
4459 
TEST_F(VulkanAPITest,sigmoid_)4460 TEST_F(VulkanAPITest, sigmoid_) {
4461   auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
4462   auto vulkan = cpu.vulkan();
4463 
4464   at::sigmoid_(cpu);
4465   at::sigmoid_(vulkan);
4466 
4467   const auto check = almostEqual(cpu, vulkan.cpu());
4468   if (!check) {
4469     showRtol(cpu, vulkan.cpu());
4470   }
4471 
4472   ASSERT_TRUE(check);
4473 }
4474 
TEST_F(VulkanAPITest,DISABLED_log_softmax_underflow_exception)4475 TEST_F(VulkanAPITest, DISABLED_log_softmax_underflow_exception) {
4476   // We apply softmax and log in sequence to the tensor [20, 0].
4477   // The output of softmax on CPU is [1.0000e+00, 2.0612e-09], while
4478   // the output on Vulkan is [1, 0], since 2.0612e-09 is smaller than the
4479   // smallest representable positive float16 value, 5.96e-8. We expect to
4480   // see nan or -inf when applying log.
4481   float data[] = {20, 0};
4482   const auto in_cpu = at::from_blob(data, {2}, at::kFloat);
4483   const auto in_vulkan = in_cpu.vulkan();
4484 
4485   const auto softmax_out_cpu = at::softmax(in_cpu, 0);
4486   const auto softmax_out_vulkan = at::softmax(in_vulkan, 0);
4487 
4488   const auto log_out_cpu = at::log(softmax_out_cpu);
4489   const auto log_out_vulkan = at::log(softmax_out_vulkan);
4490 
4491   auto has_nan = log_out_vulkan.cpu().isnan().any().item().to<bool>();
4492   auto has_inf = log_out_vulkan.cpu().isinf().any().item().to<bool>();
4493 
4494   // We expect the output of log containing nan or inf.
4495   const auto check = has_nan || has_inf;
4496   if (!check) {
4497     std::cout << "expected log_out_vulkan to contain nan or inf, but got:" << std::endl;
4498     std::cout << log_out_vulkan.cpu() << std::endl;
4499   }
4500   ASSERT_TRUE(check);
4501 }
4502 
TEST_F(VulkanAPITest,log_softmax_underflow)4503 TEST_F(VulkanAPITest, log_softmax_underflow) {
4504   // The minimum strictly positive (subnormal) value of float16 on Vulkan is 2^-24 ≈ 5.96 × 10^-8.
4505   // https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Exponent_encoding
4506   // Hence smallest_representable_log = log(5.96 × 10^-8) ≈ -16.64.
4507   // The implementation of `log_softmax` adds 6e-8 to the output of softmax before applying `log`
4508   // to deal with underflow, so there won't be nan or -inf as in the
4509   // `log_softmax_underflow_exception` test above.
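  // Rough numbers for the input [20, 0] below (a sketch of the expected magnitudes):
  //   CPU:    log_softmax([20, 0]) ≈ [-2.06e-9, -20.0]
  //   Vulkan: softmax underflows to [1, 0]; with the 6e-8 offset, log(6e-8) ≈ -16.6.
  //   The largest CPU/Vulkan gap is therefore about |-20.0 - (-16.6)| ≈ 3.4, which is
  //   why checkRtol below is given -smallest_representable_log (16.64) as the tolerance.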
4510   float smallest_representable_log = -16.64f;
4511   float data[] = {20, 0};
4512   const auto in_cpu = at::from_blob(data, {2}, at::kFloat);
4513   const auto in_vulkan = in_cpu.vulkan();
4514 
4515   const auto log_softmax_cpu = at::log_softmax(in_cpu, 0);
4516   const auto log_softmax_vulkan = at::log_softmax(in_vulkan, 0);
4517 
4518   const auto check = checkRtol(log_softmax_cpu - log_softmax_vulkan.cpu(), -smallest_representable_log);
4519   if (!check) {
4520     showRtol(log_softmax_cpu, log_softmax_vulkan.cpu());
4521   }
4522   ASSERT_TRUE(check);
4523 }
4524 
test_softmax(const at::IntArrayRef shape,bool log_softmax=false)4525 void test_softmax(const at::IntArrayRef shape, bool log_softmax = false) {
4526   at::Tensor in_cpu =
4527       at::rand(shape, at::TensorOptions(at::kCPU).dtype(at::kFloat));
4528   const at::Tensor in_vulkan = in_cpu.vulkan();
4529 
4530   // Cast to signed to test negative index for dim
4531   int64_t size = static_cast<int64_t>(shape.size());
4532 
4533   // Test on all dims
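  // e.g. for a 4-D input, dim iterates over {-4, -3, -2, -1, 0, 1, 2, 3};
  // a negative dim addresses the same axis as dim + size.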
4534   for (auto dim = -size; dim < size; dim++) {
4535     const at::Tensor out_cpu =
4536         log_softmax ? at::log_softmax(in_cpu, dim) : at::softmax(in_cpu, dim);
4537 
4538     const at::Tensor out_vulkan = log_softmax ? at::log_softmax(in_vulkan, dim)
4539                                               : at::softmax(in_vulkan, dim);
4540     const bool check = almostEqual(out_cpu, out_vulkan.cpu());
4541 
4542     if (!check) {
4543       std::cout << "Softmax test failed on axis " << dim << " for tensor dims {";
4544       for (uint32_t place = 0; place < shape.size() - 1; place++) {
4545         std::cout << shape[place] << " ";
4546       }
4547       std::cout << shape.back() << "}" << std::endl;
4548       showRtol(out_cpu, out_vulkan.cpu());
4549     }
4550     ASSERT_TRUE(check);
4551   }
4552 }
4553 
TEST_F(VulkanAPITest,softmax)4554 TEST_F(VulkanAPITest, softmax) {
4555   c10::InferenceMode mode;
4556   std::vector<std::vector<int64_t>> test_in_dims = {
4557       {1, 3, 4, 2},
4558       {4, 8, 5, 7},
4559       {9, 11, 12, 12},
4560   };
4561   bool log_softmax = false;
4562   for (const std::vector<int64_t>& dim_vec : test_in_dims) {
4563     for (uint32_t trunc = 0; trunc < dim_vec.size(); trunc++) {
4564       const std::vector<int64_t> trunc_dim_vec =
4565           std::vector<int64_t>(dim_vec.begin(), dim_vec.end() - trunc);
4566       test_softmax(trunc_dim_vec, log_softmax);
4567     }
4568   }
4569 }
4570 
TEST_F(VulkanAPITest,DISABLED_log_softmax)4571 TEST_F(VulkanAPITest, DISABLED_log_softmax) {
4572   c10::InferenceMode mode;
4573   std::vector<std::vector<int64_t>> test_in_dims = {
4574       {1, 3, 4, 2},
4575       {4, 8, 5, 7},
4576       {9, 11, 12, 12},
4577   };
4578   bool log_softmax = true;
4579   for (const std::vector<int64_t>& dim_vec : test_in_dims) {
4580     for (uint32_t trunc = 0; trunc < dim_vec.size(); trunc++) {
4581       const std::vector<int64_t> trunc_dim_vec =
4582           std::vector<int64_t>(dim_vec.begin(), dim_vec.end() - trunc);
4583       test_softmax(trunc_dim_vec, log_softmax);
4584     }
4585   }
4586 }
4587 
TEST_F(VulkanAPITest,abs)4588 TEST_F(VulkanAPITest, abs) {
4589   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
4590   const auto in_vulkan = in_cpu.vulkan();
4591 
4592   const auto out_cpu = at::abs(in_cpu);
4593   const auto out_vulkan = at::abs(in_vulkan);
4594 
4595   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
4596   if (!check) {
4597     showRtol(out_cpu, out_vulkan.cpu());
4598   }
4599 
4600   ASSERT_TRUE(check);
4601 }
4602 
TEST_F(VulkanAPITest,abs_)4603 TEST_F(VulkanAPITest, abs_) {
4604   auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
4605   auto vulkan = cpu.vulkan();
4606 
4607   at::abs_(cpu);
4608   at::abs_(vulkan);
4609 
4610   const auto check = almostEqual(cpu, vulkan.cpu());
4611   if (!check) {
4612     showRtol(cpu, vulkan.cpu());
4613   }
4614 
4615   ASSERT_TRUE(check);
4616 }
4617 
TEST_F(VulkanAPITest,tanh)4618 TEST_F(VulkanAPITest, tanh) {
4619   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
4620   const auto in_vulkan = in_cpu.vulkan();
4621 
4622   const auto out_cpu = at::tanh(in_cpu);
4623   const auto out_vulkan = at::tanh(in_vulkan);
4624 
4625   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
4626   if (!check) {
4627     showRtol(out_cpu, out_vulkan.cpu());
4628   }
4629 
4630   ASSERT_TRUE(check);
4631 }
4632 
TEST_F(VulkanAPITest,tanh_)4633 TEST_F(VulkanAPITest, tanh_) {
4634   auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
4635   auto vulkan = cpu.vulkan();
4636 
4637   at::tanh_(cpu);
4638   at::tanh_(vulkan);
4639 
4640   const auto check = almostEqual(cpu, vulkan.cpu());
4641   if (!check) {
4642     showRtol(cpu, vulkan.cpu());
4643   }
4644 
4645   ASSERT_TRUE(check);
4646 }
4647 
test_sub(const at::IntArrayRef input_shape,const at::IntArrayRef other_shape,float alpha)4648 void test_sub(const at::IntArrayRef input_shape, const at::IntArrayRef other_shape, float alpha) {
4649   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
4650   const auto other_cpu = at::rand(other_shape, at::device(at::kCPU).dtype(at::kFloat));
4651 
4652   const auto in_vulkan = in_cpu.vulkan();
4653   const auto other_vulkan = other_cpu.vulkan();
4654 
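  // at::sub(input, other, alpha) computes input - alpha * other; the CPU result
  // below is the reference against which the Vulkan result is compared.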
4655   const auto out_cpu = at::sub(in_cpu, other_cpu, alpha);
4656   const auto out_vulkan = at::sub(in_vulkan, other_vulkan, alpha);
4657 
4658   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
4659   if (!check) {
4660     showRtol(out_cpu, out_vulkan.cpu());
4661   }
4662 
4663   ASSERT_TRUE(check);
4664 }
4665 
TEST_F(VulkanAPITest,sub)4666 TEST_F(VulkanAPITest, sub) {
4667   test_sub({11, 7, 139, 109}, {11, 7, 139, 109}, 2.1f);
4668 }
4669 
TEST_F(VulkanAPITest,sub_broadcast0)4670 TEST_F(VulkanAPITest, sub_broadcast0) {
4671   test_sub({3, 5, 179, 221}, {3, 5, 1, 1}, 1.8f);
4672 }
4673 
TEST_F(VulkanAPITest,sub_broadcast1)4674 TEST_F(VulkanAPITest, sub_broadcast1) {
4675   test_sub({3, 5, 179, 221}, {3, 5, 1, 221}, 1.8f);
4676 }
4677 
TEST_F(VulkanAPITest,sub_broadcast2)4678 TEST_F(VulkanAPITest, sub_broadcast2) {
4679   test_sub({3, 4, 179, 221}, {4, 1, 1}, 2.5f);
4680 }
4681 
TEST_F(VulkanAPITest,sub_broadcast3)4682 TEST_F(VulkanAPITest, sub_broadcast3) {
4683   test_sub({3, 4, 179, 221}, {1, 1, 179, 221}, 2.5f);
4684 }
4685 
TEST_F(VulkanAPITest,sub_broadcast4)4686 TEST_F(VulkanAPITest, sub_broadcast4) {
4687   test_sub({3, 4, 179, 1}, {1, 179, 221}, 2.5f);
4688 }
4689 
TEST_F(VulkanAPITest,sub_broadcast5)4690 TEST_F(VulkanAPITest, sub_broadcast5) {
4691   test_sub({2, 1, 7, 1}, {1, 5, 1, 4}, 1.2f);
4692 }
4693 
TEST_F(VulkanAPITest,sub_broadcast6)4694 TEST_F(VulkanAPITest, sub_broadcast6) {
4695   test_sub({1, 15, 5, 4}, {21, 1, 5, 4}, 1.8f);
4696 }
4697 
TEST_F(VulkanAPITest,sub_zero_dim)4698 TEST_F(VulkanAPITest, sub_zero_dim) {
4699   test_sub({1, 15, 5, 4}, {}, 1.8f);
4700 }
4701 
TEST_F(VulkanAPITest,sub_)4702 TEST_F(VulkanAPITest, sub_) {
4703   auto a_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
4704   auto a_vulkan = a_cpu.vulkan();
4705 
4706   const auto b_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
4707   const auto b_vulkan = b_cpu.vulkan();
4708 
4709   a_cpu.sub_(b_cpu, 2.1f);
4710   a_vulkan.sub_(b_vulkan, 2.1f);
4711 
4712   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
4713   if (!check) {
4714     showRtol(a_cpu, a_vulkan.cpu());
4715   }
4716 
4717   ASSERT_TRUE(check);
4718 }
4719 
TEST_F(VulkanAPITest,sub_broadcast0_)4720 TEST_F(VulkanAPITest, sub_broadcast0_) {
4721   auto a_cpu = at::rand({16, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
4722   auto a_vulkan = a_cpu.vulkan();
4723 
4724   const auto b_cpu = at::rand({16, 17, 29, 1}, at::device(at::kCPU).dtype(at::kFloat));
4725   const auto b_vulkan = b_cpu.vulkan();
4726 
4727   a_cpu.sub_(b_cpu, 2.1f);
4728   a_vulkan.sub_(b_vulkan, 2.1f);
4729 
4730   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
4731   if (!check) {
4732     showRtol(a_cpu, a_vulkan.cpu());
4733   }
4734 
4735   ASSERT_TRUE(check);
4736 }
4737 
TEST_F(VulkanAPITest,sub_broadcast1_)4738 TEST_F(VulkanAPITest, sub_broadcast1_) {
4739   auto a_cpu = at::rand({3, 8, 29, 83}, at::device(at::kCPU).dtype(at::kFloat));
4740   auto a_vulkan = a_cpu.vulkan();
4741 
4742   const auto b_cpu = at::rand({3, 8, 1, 1}, at::device(at::kCPU).dtype(at::kFloat));
4743   const auto b_vulkan = b_cpu.vulkan();
4744 
4745   a_cpu.sub_(b_cpu, 2.1f);
4746   a_vulkan.sub_(b_vulkan, 2.1f);
4747 
4748   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
4749   if (!check) {
4750     showRtol(a_cpu, a_vulkan.cpu());
4751   }
4752 
4753   ASSERT_TRUE(check);
4754 }
4755 
TEST_F(VulkanAPITest,sub_scalar)4756 TEST_F(VulkanAPITest, sub_scalar) {
4757   if (!at::is_vulkan_available()) {
4758     return;
4759   }
4760 
4761   const auto a_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat));
4762   const auto a_vulkan = a_cpu.vulkan();
4763 
4764   const float b_scalar = 3.1415f;
4765 
4766   const auto c_cpu = at::sub(a_cpu, b_scalar, 2.1f);
4767   const auto c_vulkan = at::sub(a_vulkan, b_scalar, 2.1f);
4768 
4769   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
4770   if (!check) {
4771     showRtol(c_cpu, c_vulkan.cpu());
4772   }
4773 
4774   ASSERT_TRUE(check);
4775 }
4776 
TEST_F(VulkanAPITest,sub_scalar_)4777 TEST_F(VulkanAPITest, sub_scalar_) {
4778   if (!at::is_vulkan_available()) {
4779     return;
4780   }
4781 
4782   auto a_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat));
4783   auto a_vulkan = a_cpu.vulkan();
4784 
4785   const float b_scalar = 3.1415f;
4786 
4787   a_cpu.sub_(b_scalar, 2.1f);
4788   a_vulkan.sub_(b_scalar, 2.1f);
4789 
4790   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
4791   if (!check) {
4792     showRtol(a_cpu, a_vulkan.cpu());
4793   }
4794 
4795   ASSERT_TRUE(check);
4796 }
4797 
TEST_F(VulkanAPITest,sub_scalar_wrapped)4798 TEST_F(VulkanAPITest, sub_scalar_wrapped) {
4799   if (!at::is_vulkan_available()) {
4800     return;
4801   }
4802 
4803   const auto a_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat));
4804   const auto a_vulkan = a_cpu.vulkan();
4805 
4806   const auto b_scalar = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));
4807 
4808   const auto c_cpu = at::sub(a_cpu, b_scalar, 2.1f);
4809   const auto c_vulkan = at::sub(a_vulkan, b_scalar, 2.1f);
4810 
4811   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
4812   if (!check) {
4813     showRtol(c_cpu, c_vulkan.cpu());
4814   }
4815 
4816   ASSERT_TRUE(check);
4817 }
4818 
TEST_F(VulkanAPITest,sub_scalar_wrapped_)4819 TEST_F(VulkanAPITest, sub_scalar_wrapped_) {
4820   if (!at::is_vulkan_available()) {
4821     return;
4822   }
4823 
4824   auto a_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat));
4825   auto a_vulkan = a_cpu.vulkan();
4826 
4827   const auto b_scalar = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));
4828 
4829   a_cpu.sub_(b_scalar, 2.1f);
4830   a_vulkan.sub_(b_scalar, 2.1f);
4831 
4832   const auto check = almostEqual(a_cpu, a_vulkan.cpu());
4833   if (!check) {
4834     showRtol(a_cpu, a_vulkan.cpu());
4835   }
4836 
4837   ASSERT_TRUE(check);
4838 }
4839 
TEST_F(VulkanAPITest,sub_to_scalar_wrapped)4840 TEST_F(VulkanAPITest, sub_to_scalar_wrapped) {
4841   if (!at::is_vulkan_available()) {
4842     return;
4843   }
4844 
4845   const auto a = at::rand({1}, at::device(at::kCPU).dtype(at::kFloat));
4846 
4847   const auto b_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
4848   const auto b_vulkan = b_cpu.vulkan();
4849 
4850   const auto c_cpu = at::sub(a, b_cpu, 2.1f);
4851   const auto c_vulkan = at::sub(a, b_vulkan, 2.1f);
4852 
4853   const auto check = almostEqual(c_cpu, c_vulkan.cpu());
4854   if (!check) {
4855     showRtol(c_cpu, c_vulkan.cpu());
4856   }
4857 
4858   ASSERT_TRUE(check);
4859 }
4860 
TEST_F(VulkanAPITest,sum_invalid_inputs)4861 TEST_F(VulkanAPITest, sum_invalid_inputs) {
4862   c10::InferenceMode mode;
4863 
4864   // Act: input dimension too large
4865   EXPECT_THROW({
4866     at::sum(at::rand({3, 5, 7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
4867       .vulkan(), {3});
4868   }, ::std::exception);
4869 
4870   // Act: dimension out of range
4871   EXPECT_THROW({
4872     at::sum(at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
4873       .vulkan(), {3});
4874   }, ::std::exception);
4875 
4876   // Act: dimension out of range
4877   EXPECT_THROW({
4878     at::sum(at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
4879       .vulkan(), {-4});
4880   }, ::std::exception);
4881 
4882   // Act: repeated dimensions
4883   EXPECT_THROW({
4884     at::sum(at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
4885       .vulkan(), {1, 1});
4886   }, ::std::exception);
4887 
4888   // Act: repeated dimensions
4889   EXPECT_THROW({
4890     at::sum(at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
4891       .vulkan(), {1, -2});
4892   }, ::std::exception);
4893 }
4894 
test_sum_dim(const at::IntArrayRef input_shape,const at::IntArrayRef dim_list,bool keepdim=false)4895 void test_sum_dim(const at::IntArrayRef input_shape, const at::IntArrayRef dim_list, bool keepdim=false) {
4896   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
4897   const auto in_vulkan = in_cpu.vulkan();
4898 
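  // Reduce over every dim in dim_list (negative dims count from the end); with
  // keepdim == true the reduced dims are retained with size 1.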
4899   const auto out_cpu = at::sum(in_cpu, dim_list, keepdim);
4900   const auto out_vulkan = at::sum(in_vulkan, dim_list, keepdim);
4901 
4902   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
4903   if (!check) {
4904     std::cout << "sum_dim test failed with input shape: "
4905               << input_shape << " and dim_list: " << dim_list << std::endl;
4906     showRtol(out_cpu, out_vulkan.cpu());
4907   }
4908 
4909   ASSERT_TRUE(check);
4910 }
4911 
TEST_F(VulkanAPITest,sum_dim_1d)4912 TEST_F(VulkanAPITest, sum_dim_1d) {
4913   test_sum_dim({7}, {-1});
4914   test_sum_dim({3}, {0});
4915 }
4916 
TEST_F(VulkanAPITest,sum_dim_2d)4917 TEST_F(VulkanAPITest, sum_dim_2d) {
4918   test_sum_dim({2, 3}, {-1});
4919   test_sum_dim({2, 7}, {-2});
4920   test_sum_dim({2, 7}, {-1, -2});
4921 }
4922 
TEST_F(VulkanAPITest,sum_dim_3d)4923 TEST_F(VulkanAPITest, sum_dim_3d) {
4924   test_sum_dim({9, 7, 5}, {-1});
4925   test_sum_dim({5, 7, 9}, {-2});
4926   test_sum_dim({5, 7, 9}, {-3});
4927 
4928   test_sum_dim({10, 7, 5}, {0, 1});
4929   test_sum_dim({10, 7, 5}, {0, 2});
4930   test_sum_dim({10, 7, 5}, {1, 2});
4931 
4932   test_sum_dim({10, 7, 5}, {-1, -2});
4933   test_sum_dim({10, 7, 5}, {-1, -3});
4934   test_sum_dim({10, 7, 5}, {-2, -3});
4935 
4936   test_sum_dim({10, 7, 5}, {0, 1, 2});
4937   test_sum_dim({10, 7, 5}, {-1, -2, -3});
4938 }
4939 
TEST_F(VulkanAPITest,sum_dim_4d)4940 TEST_F(VulkanAPITest, sum_dim_4d) {
4941   test_sum_dim({7, 9, 6, 5}, {-1});
4942   test_sum_dim({6, 5, 7, 9}, {-2});
4943   test_sum_dim({6, 5, 7, 9}, {-3});
4944   test_sum_dim({6, 5, 7, 9}, {-4});
4945 
4946   test_sum_dim({10, 7, 5, 6}, {0, 1});
4947   test_sum_dim({10, 7, 5, 6}, {0, 2});
4948   test_sum_dim({10, 7, 5, 6}, {0, 3});
4949   test_sum_dim({10, 7, 5, 6}, {1, 2});
4950   test_sum_dim({10, 7, 5, 6}, {1, 3});
4951   test_sum_dim({10, 7, 5, 6}, {2, 3});
4952   test_sum_dim({10, 7, 5, 6}, {-2, -4});
4953 
4954   test_sum_dim({10, 7, 5, 6}, {0, 1, 2});
4955   test_sum_dim({10, 7, 5, 6}, {0, 1, 3});
4956   test_sum_dim({10, 7, 5, 6}, {0, 2, 3});
4957   test_sum_dim({10, 7, 5, 6}, {3, 2, 1});
4958   test_sum_dim({10, 7, 5, 6}, {3, -2, 1});
4959   test_sum_dim({10, 7, 5, 6}, {-3, -2, -1});
4960 
4961   test_sum_dim({10, 7, 5, 6}, {-1, -2, -3});
4962   test_sum_dim({10, 7, 5, 6}, {-1, -2, -4});
4963   test_sum_dim({10, 7, 5, 6}, {-1, -3, -4});
4964   test_sum_dim({10, 7, 5, 6}, {-2, -3, -4});
4965 
4966   test_sum_dim({10, 7, 5, 6}, {-1, -2, -3, -4});
4967 }
4968 
TEST_F(VulkanAPITest,sum_dim_keepdim_1d)4969 TEST_F(VulkanAPITest, sum_dim_keepdim_1d) {
4970   test_sum_dim({5}, {-1}, true);
4971   test_sum_dim({3}, {-1}, true);
4972 }
4973 
TEST_F(VulkanAPITest,sum_dim_keepdim_2d)4974 TEST_F(VulkanAPITest, sum_dim_keepdim_2d) {
4975   test_sum_dim({5, 7}, {-1}, true);
4976   test_sum_dim({5, 7}, {-2}, true);
4977 }
4978 
TEST_F(VulkanAPITest,sum_dim_keepdim_3d)4979 TEST_F(VulkanAPITest, sum_dim_keepdim_3d) {
4980   test_sum_dim({9, 5, 7}, {-1}, true);
4981   test_sum_dim({5, 9, 7}, {-2}, true);
4982   test_sum_dim({7, 9, 5}, {-3}, true);
4983 
4984   test_sum_dim({9, 5, 7}, {0, 1}, true);
4985   test_sum_dim({5, 9, 7}, {0, 2}, true);
4986   test_sum_dim({7, 9, 5}, {1, 2}, true);
4987 
4988   test_sum_dim({7, 9, 5}, {0, 1, 2}, true);
4989 }
4990 
TEST_F(VulkanAPITest,sum_dim_keepdim_4d)4991 TEST_F(VulkanAPITest, sum_dim_keepdim_4d) {
4992   test_sum_dim({9, 5, 7, 11}, {-1}, true);
4993   test_sum_dim({5, 9, 11, 7}, {-2}, true);
4994   test_sum_dim({7, 11, 9, 5}, {-3}, true);
4995   test_sum_dim({11, 7, 9, 5}, {-4}, true);
4996 
4997   test_sum_dim({9, 5, 7, 11}, {0, 1}, true);
4998   test_sum_dim({5, 9, 11, 7}, {0, 2}, true);
4999   test_sum_dim({7, 11, 9, 5}, {0, 3}, true);
5000   test_sum_dim({11, 7, 9, 5}, {1, 2}, true);
5001   test_sum_dim({9, 5, 7, 11}, {1, 3}, true);
5002   test_sum_dim({5, 9, 11, 7}, {2, 3}, true);
5003 
5004   test_sum_dim({7, 11, 9, 5}, {-1, -2, -3}, true);
5005   test_sum_dim({11, 7, 9, 5}, {-1, -2, -4}, true);
5006   test_sum_dim({9, 5, 7, 11}, {-2, -3, -4}, true);
5007 
5008   test_sum_dim({9, 5, 7, 11}, {-1, -2, -3, -4}, true);
5009 }
5010 
test_sum(const at::IntArrayRef input_shape)5011 void test_sum(const at::IntArrayRef input_shape) {
5012   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
5013   const auto in_vulkan = in_cpu.vulkan();
5014 
5015   const auto out_cpu = at::sum(in_cpu);
5016   const auto out_vulkan = at::sum(in_vulkan);
5017 
5018   ASSERT_TRUE(out_vulkan.dim() == 0);
5019   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5020   if (!check) {
5021     std::cout << "sum test failed with input shape: "
5022               << input_shape << std::endl;
5023     showRtol(out_cpu, out_vulkan.cpu());
5024   }
5025 
5026   ASSERT_TRUE(check);
5027 }
5028 
TEST_F(VulkanAPITest,sum_test)5029 TEST_F(VulkanAPITest, sum_test) {
5030   test_sum({6});
5031   test_sum({5, 6});
5032   test_sum({0, 3, 1});
5033   test_sum({5, 0, 1});
5034   test_sum({5, 3, 0});
5035   test_sum({3, 3, 1});
5036   test_sum({7, 6, 6});
5037   test_sum({7, 8, 5, 6});
5038 }
5039 
5040 
test_uniform(at::Tensor a_vulkan,const float a_min,const float a_max)5041 void test_uniform(at::Tensor a_vulkan, const float a_min, const float a_max) {
5042   auto a_cpu = a_vulkan.cpu();
5043   ASSERT_TRUE(a_cpu.max().item<float>() <= a_max);
5044   ASSERT_TRUE(a_cpu.min().item<float>() >= a_min);
5045 
5046   // Verify the range, and also perform a loose check on the histogram distribution.
5047   float b_min = 0.0f;
5048   float b_max = 10.0f;
5049 
5050   auto b_vulkan =
5051       at::rand({80, 7, 12, 10}, at::device(at::kCPU).dtype(at::kFloat))
5052           .vulkan();
5053   b_vulkan.uniform_(b_min, b_max);
5054   auto b_cpu = b_vulkan.cpu();
5055 
5056   int bins = 10;
5057   auto b_hist_tuple = at::histogram(b_cpu, bins);
5058 
5059   int64_t expected_per_bin = b_vulkan.numel() / bins;
5060   auto b_hist = std::get<0>(b_hist_tuple);
5061 
5062   // Very relaxed definition of uniform. Pass if all bins are within 5% of
5063   // expected.
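  // (For reference: 80 * 7 * 12 * 10 = 67200 elements over 10 bins gives an
  // expected_per_bin of 6720, so each bin count may deviate by at most 336.)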
5064   ASSERT_TRUE(
5065       (b_hist - expected_per_bin).abs().max().item<float>() <=
5066       (expected_per_bin * 0.05));
5067 }
5068 
TEST_F(VulkanAPITest,uniform)5069 TEST_F(VulkanAPITest, uniform) {
5070   float a_min = -8.2f;
5071   float a_max = -1.4f;
5072   auto a_vulkan =
5073       at::rand({8, 7, 12, 10}, at::device(at::kCPU).dtype(at::kFloat)).vulkan();
5074   a_vulkan.uniform_(a_min, a_max);
5075   test_uniform(a_vulkan, a_min, a_max);
5076 }
5077 
TEST_F(VulkanAPITest,rand_like)5078 TEST_F(VulkanAPITest, rand_like) {
5079   float a_min = 0.0f;
5080   float a_max = 1.0f;
5081   auto a_vulkan =
5082       at::zeros({8, 7, 12, 10}, at::device(at::kCPU).dtype(at::kFloat)).vulkan();
5083   const auto out_vulkan = at::rand_like(a_vulkan);
5084   // verify that the input is still all zeros (rand_like is not in-place)
5085   ASSERT_TRUE(at::mean(a_vulkan.cpu()).item<float>() == 0.0);
5086   test_uniform(out_vulkan, a_min, a_max);
5087 }
5088 
test_normal(at::Tensor out_vulkan,const float mean,const float std)5089 void test_normal(at::Tensor out_vulkan, const float mean, const float std) {
5090   // Verify the distribution is normal: the difference between the given mean and the generated mean should be within 5% of the standard deviation, and likewise for the standard deviation itself.
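  // e.g. normal_(-10.0, 2.0) below requires the sample mean to lie within 0.1 of -10.0
  // and the sample std to lie within 0.1 of 2.0 (5% of std == 0.1).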
5091   ASSERT_TRUE(std::abs(at::mean(out_vulkan.cpu()).item<float>() - mean) < std::abs(std) * 0.05);
5092   ASSERT_TRUE(std::abs(at::std(out_vulkan.cpu()).item<float>() - std) < std::abs(std) * 0.05);
5093 }
5094 
TEST_F(VulkanAPITest,normal_)5095 TEST_F(VulkanAPITest, normal_) {
5096   float a_mean = -10.0;
5097   float a_std = 2.0;
5098 
5099   auto a_vulkan =
5100       at::zeros({3, 4, 5, 6}, at::device(at::kCPU).dtype(at::kFloat)).vulkan();
5101   a_vulkan.normal_(a_mean, a_std);
5102 
5103   test_normal(a_vulkan, a_mean, a_std);
5104 }
5105 
TEST_F(VulkanAPITest,normal_large)5106 TEST_F(VulkanAPITest, normal_large) {
5107   float a_mean = 1.0;
5108   float a_std = 0.01;
5109 
5110   auto a_vulkan =
5111       at::zeros({30, 40, 50, 60}, at::device(at::kCPU).dtype(at::kFloat)).vulkan();
5112   a_vulkan.normal_(a_mean, a_std);
5113 
5114   test_normal(a_vulkan, a_mean, a_std);
5115 }
5116 
TEST_F(VulkanAPITest,normal_error)5117 TEST_F(VulkanAPITest, normal_error) {
5118   float a_mean = 1.0;
5119   float a_std = -1;
5120 
5121   auto a_vulkan =
5122       at::zeros({30, 40, 50, 60}, at::device(at::kCPU).dtype(at::kFloat)).vulkan();
5123   EXPECT_THROW(a_vulkan.normal_(a_mean, a_std), ::std::exception);
5124 }
5125 
TEST_F(VulkanAPITest,randn_like)5126 TEST_F(VulkanAPITest, randn_like) {
5127   float a_mean = 0.0;
5128   float a_std = 1.0;
5129 
5130   auto a_vulkan =
5131       at::zeros({8, 7, 6, 5}, at::device(at::kCPU).dtype(at::kFloat)).vulkan();
5132   const auto out_vulkan = at::randn_like(a_vulkan);
5133   // verify that the input is still all zeros (randn_like is not in-place)
5134   ASSERT_TRUE(at::mean(a_vulkan.cpu()).item<float>() == 0.0);
5135   test_normal(out_vulkan, a_mean, a_std);
5136 }
5137 
TEST_F(VulkanAPITest,randn_like_large)5138 TEST_F(VulkanAPITest, randn_like_large) {
5139   float a_mean = 0.0;
5140   float a_std = 1.0;
5141 
5142   auto a_vulkan =
5143       at::zeros({80, 70, 60, 50}, at::device(at::kCPU).dtype(at::kFloat)).vulkan();
5144   const auto out_vulkan = at::randn_like(a_vulkan);
5145 
5146   test_normal(out_vulkan, a_mean, a_std);
5147 }
5148 
test_t(const at::IntArrayRef input_shape)5149 void test_t(const at::IntArrayRef input_shape) {
5150   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
5151   const auto out_cpu = at::t(in_cpu);
5152 
5153   const auto in_vulkan = in_cpu.vulkan();
5154   const auto out_vulkan = at::t(in_vulkan);
5155 
5156   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5157   if (!check) {
5158     showRtol(out_cpu, out_vulkan.cpu());
5159   }
5160 
5161   ASSERT_TRUE(check);
5162 }
5163 
TEST_F(VulkanAPITest,transpose_t_1d)5164 TEST_F(VulkanAPITest, transpose_t_1d) {
5165   test_t({7});
5166 }
5167 
TEST_F(VulkanAPITest,transpose_t_2d_small)5168 TEST_F(VulkanAPITest, transpose_t_2d_small) {
5169   test_t({1, 1});
5170 }
5171 
TEST_F(VulkanAPITest,transpose_t_2d_medium)5172 TEST_F(VulkanAPITest, transpose_t_2d_medium) {
5173   test_t({7, 5});
5174 }
5175 
TEST_F(VulkanAPITest,transpose_t_2d_large)5176 TEST_F(VulkanAPITest, transpose_t_2d_large) {
5177   test_t({53, 117});
5178 }
5179 
test_transpose(const at::IntArrayRef input_shape,int64_t index0,int64_t index1)5180 void test_transpose(const at::IntArrayRef input_shape, int64_t index0, int64_t index1) {
5181   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
5182   const auto out_cpu = at::transpose(in_cpu, index0, index1);
5183 
5184   const auto in_vulkan = in_cpu.vulkan();
5185   const auto out_vulkan = at::transpose(in_vulkan, index0, index1);
5186 
5187   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5188   if (!check) {
5189     showRtol(out_cpu, out_vulkan.cpu());
5190   }
5191 
5192   ASSERT_TRUE(check);
5193 }
5194 
TEST_F(VulkanAPITest,transpose_2d_height_and_width_small)5195 TEST_F(VulkanAPITest, transpose_2d_height_and_width_small) {
5196   test_transpose({1, 1}, 0, 1);
5197 }
5198 
TEST_F(VulkanAPITest,transpose_2d_height_and_width_medium)5199 TEST_F(VulkanAPITest, transpose_2d_height_and_width_medium) {
5200   test_transpose({7, 5}, 0, 1);
5201 }
5202 
TEST_F(VulkanAPITest,transpose_2d_height_and_width_large)5203 TEST_F(VulkanAPITest, transpose_2d_height_and_width_large) {
5204   test_transpose({53, 117}, 0, 1);
5205 }
5206 
TEST_F(VulkanAPITest,transpose_2d_height_and_height_large)5207 TEST_F(VulkanAPITest, transpose_2d_height_and_height_large) {
5208   test_transpose({53, 117}, 0, 0);
5209 }
5210 
TEST_F(VulkanAPITest,transpose_2d_width_and_width_large)5211 TEST_F(VulkanAPITest, transpose_2d_width_and_width_large) {
5212   test_transpose({53, 117}, 1, 1);
5213 }
5214 
TEST_F(VulkanAPITest,transpose_3d_height_and_width_small)5215 TEST_F(VulkanAPITest, transpose_3d_height_and_width_small) {
5216   test_transpose({1, 1, 1}, 1, 2);
5217 }
5218 
TEST_F(VulkanAPITest,transpose_3d_height_and_width_medium)5219 TEST_F(VulkanAPITest, transpose_3d_height_and_width_medium) {
5220   test_transpose({3, 2, 5}, 1, 2);
5221 }
5222 
TEST_F(VulkanAPITest,transpose_3d_height_and_width_large)5223 TEST_F(VulkanAPITest, transpose_3d_height_and_width_large) {
5224   test_transpose({100, 1, 144}, 1, 2);
5225 }
5226 
TEST_F(VulkanAPITest,transpose_3d_width_and_width_large)5227 TEST_F(VulkanAPITest, transpose_3d_width_and_width_large) {
5228   test_transpose({100, 1, 144}, 2, 2);
5229 }
5230 
TEST_F(VulkanAPITest,transpose_3d_depth_and_width_small)5231 TEST_F(VulkanAPITest, transpose_3d_depth_and_width_small) {
5232   test_transpose({1, 1, 1}, 0, 2);
5233 }
5234 
TEST_F(VulkanAPITest,transpose_3d_depth_and_width_medium)5235 TEST_F(VulkanAPITest, transpose_3d_depth_and_width_medium) {
5236   test_transpose({3, 2, 5}, 0, 2);
5237 }
5238 
TEST_F(VulkanAPITest,transpose_3d_depth_and_width_large)5239 TEST_F(VulkanAPITest, transpose_3d_depth_and_width_large) {
5240   test_transpose({113, 1, 141}, 0, 2);
5241 }
5242 
TEST_F(VulkanAPITest,transpose_3d_depth_and_depth_large)5243 TEST_F(VulkanAPITest, transpose_3d_depth_and_depth_large) {
5244   test_transpose({113, 2, 131}, 0, 0);
5245 }
5246 
TEST_F(VulkanAPITest,transpose_3d_depth_and_height_small)5247 TEST_F(VulkanAPITest, transpose_3d_depth_and_height_small) {
5248   test_transpose({1, 1, 1}, 0, 1);
5249 }
5250 
TEST_F(VulkanAPITest,transpose_3d_depth_and_height_medium)5251 TEST_F(VulkanAPITest, transpose_3d_depth_and_height_medium) {
5252   test_transpose({3, 7, 5}, 0, 1);
5253 }
5254 
TEST_F(VulkanAPITest,transpose_3d_depth_and_height_large)5255 TEST_F(VulkanAPITest, transpose_3d_depth_and_height_large) {
5256   test_transpose({113, 141, 1}, 0, 1);
5257 }
5258 
TEST_F(VulkanAPITest,transpose_3d_height_and_height_large)5259 TEST_F(VulkanAPITest, transpose_3d_height_and_height_large) {
5260   test_transpose({101, 1, 141}, 1, 1);
5261 }
5262 
TEST_F(VulkanAPITest,transpose_4d_batch_and_batch_large)5263 TEST_F(VulkanAPITest, transpose_4d_batch_and_batch_large) {
5264   test_transpose({7, 51, 41, 3}, 0, 0);
5265 }
5266 
TEST_F(VulkanAPITest,transpose_4d_depth_and_depth_large)5267 TEST_F(VulkanAPITest, transpose_4d_depth_and_depth_large) {
5268   test_transpose({7, 51, 41, 3}, 1, 1);
5269 }
5270 
TEST_F(VulkanAPITest,transpose_4d_height_and_height_large)5271 TEST_F(VulkanAPITest, transpose_4d_height_and_height_large) {
5272   test_transpose({7, 51, 41, 3}, 2, 2);
5273 }
5274 
TEST_F(VulkanAPITest,transpose_4d_width_and_width_large)5275 TEST_F(VulkanAPITest, transpose_4d_width_and_width_large) {
5276   test_transpose({7, 51, 41, 3}, 3, 3);
5277 }
5278 
TEST_F(VulkanAPITest,transpose_4d_batch_and_depth_large)5279 TEST_F(VulkanAPITest, transpose_4d_batch_and_depth_large) {
5280   test_transpose({7, 51, 41, 3}, 0, 1);
5281 }
5282 
TEST_F(VulkanAPITest,transpose_4d_batch_and_height_large)5283 TEST_F(VulkanAPITest, transpose_4d_batch_and_height_large) {
5284   test_transpose({7, 51, 41, 3}, 0, 2);
5285 }
5286 
TEST_F(VulkanAPITest,transpose_4d_batch_and_width_large)5287 TEST_F(VulkanAPITest, transpose_4d_batch_and_width_large) {
5288   test_transpose({7, 51, 41, 3}, 0, 3);
5289 }
5290 
TEST_F(VulkanAPITest,transpose_4d_depth_and_height_large)5291 TEST_F(VulkanAPITest, transpose_4d_depth_and_height_large) {
5292   test_transpose({7, 51, 41, 3}, 1, 2);
5293 }
5294 
TEST_F(VulkanAPITest,transpose_4d_depth_and_width_large)5295 TEST_F(VulkanAPITest, transpose_4d_depth_and_width_large) {
5296   test_transpose({7, 51, 41, 3}, 1, 3);
5297 }
5298 
TEST_F(VulkanAPITest,transpose_4d_height_and_width_large)5299 TEST_F(VulkanAPITest, transpose_4d_height_and_width_large) {
5300   test_transpose({7, 51, 41, 3}, 2, 3);
5301 }
5302 
5303 // Test Unary Ops
test_exp(const at::IntArrayRef input_shape)5304 void test_exp(const at::IntArrayRef input_shape) {
5305   c10::InferenceMode mode;
5306   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
5307   const auto out_cpu = at::exp(in_cpu);
5308 
5309   const auto in_vulkan = in_cpu.vulkan();
5310   const auto out_vulkan = at::exp(in_vulkan);
5311 
5312   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5313   if (!check) {
5314     showRtol(out_cpu, out_vulkan.cpu());
5315     std::cout << "exp test failed with input shape: "
5316               << input_shape << std::endl;
5317   }
5318   ASSERT_TRUE(check);
5319 }
5320 
TEST_F(VulkanAPITest,unary_op_exp)5321 TEST_F(VulkanAPITest, unary_op_exp) {
5322   test_exp({5});
5323   test_exp({5, 6});
5324   test_exp({7, 3, 5});
5325   test_exp({11, 1, 4, 2});
5326 }
5327 
test_exp_(const at::IntArrayRef input_shape)5328 void test_exp_(const at::IntArrayRef input_shape) {
5329   c10::InferenceMode mode;
5330   const auto cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
5331   const auto vulkan = cpu.vulkan();
5332 
5333   cpu.exp_();
5334   vulkan.exp_();
5335 
5336   const auto check = almostEqual(cpu, vulkan.cpu());
5337   if (!check) {
5338     showRtol(cpu, vulkan.cpu());
5339     std::cout << "exp_ test failed with input shape: "
5340               << input_shape << std::endl;
5341   }
5342   ASSERT_TRUE(check);
5343 }
5344 
TEST_F(VulkanAPITest,unary_op_exp_)5345 TEST_F(VulkanAPITest, unary_op_exp_) {
5346   test_exp_({5});
5347   test_exp_({5, 6});
5348   test_exp_({7, 3, 5});
5349   test_exp_({11, 1, 4, 2});
5350 }
5351 
test_sqrt(const at::IntArrayRef input_shape)5352 void test_sqrt(const at::IntArrayRef input_shape) {
5353   c10::InferenceMode mode;
5354   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
5355   const auto out_cpu = at::sqrt(in_cpu);
5356 
5357   const auto in_vulkan = in_cpu.vulkan();
5358   const auto out_vulkan = at::sqrt(in_vulkan);
5359 
5360   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5361   if (!check) {
5362     showRtol(out_cpu, out_vulkan.cpu());
5363     std::cout << "sqrt test failed with input shape: "
5364               << input_shape << std::endl;
5365   }
5366   ASSERT_TRUE(check);
5367 }
5368 
TEST_F(VulkanAPITest,unary_op_sqrt)5369 TEST_F(VulkanAPITest, unary_op_sqrt) {
5370   test_sqrt({5});
5371   test_sqrt({5, 6});
5372   test_sqrt({7, 3, 5});
5373   test_sqrt({11, 1, 4, 2});
5374 }
5375 
test_sqrt_(const at::IntArrayRef input_shape)5376 void test_sqrt_(const at::IntArrayRef input_shape) {
5377   c10::InferenceMode mode;
5378   const auto cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
5379   const auto vulkan = cpu.vulkan();
5380 
5381   cpu.sqrt_();
5382   vulkan.sqrt_();
5383 
5384   const auto check = almostEqual(cpu, vulkan.cpu());
5385   if (!check) {
5386     showRtol(cpu, vulkan.cpu());
5387     std::cout << "sqrt_ test failed with input shape: "
5388               << input_shape << std::endl;
5389   }
5390   ASSERT_TRUE(check);
5391 }
5392 
TEST_F(VulkanAPITest,unary_op_sqrt_)5393 TEST_F(VulkanAPITest, unary_op_sqrt_) {
5394   test_sqrt_({5});
5395   test_sqrt_({5, 6});
5396   test_sqrt_({7, 3, 5});
5397   test_sqrt_({11, 1, 4, 2});
5398 }
5399 
test_log(const at::IntArrayRef input_shape)5400 void test_log(const at::IntArrayRef input_shape) {
5401   c10::InferenceMode mode;
5402   // Need to add a very small constant to avoid 0.
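  // (at::rand samples from [0, 1), so an exact 0 is possible and log(0) = -inf.)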
5403   const auto in_cpu =
5404       at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat)) + 0.0001;
5405   const auto out_cpu = at::log(in_cpu);
5406 
5407   const auto in_vulkan = in_cpu.vulkan();
5408   const auto out_vulkan = at::log(in_vulkan);
5409 
5410   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5411   if (!check) {
5412     showRtol(out_cpu, out_vulkan.cpu());
5413     std::cout << "log test failed with input shape: " << input_shape
5414               << std::endl;
5415   }
5416   ASSERT_TRUE(check);
5417 }
5418 
TEST_F(VulkanAPITest,unary_op_log)5419 TEST_F(VulkanAPITest, unary_op_log) {
5420   test_log({5});
5421   test_log({5, 6});
5422   test_log({7, 3, 5});
5423   test_log({11, 1, 4, 2});
5424 }
5425 
test_log_(const at::IntArrayRef input_shape)5426 void test_log_(const at::IntArrayRef input_shape) {
5427   c10::InferenceMode mode;
5428   // Need to add a very small constant to avoid 0.
5429   const auto cpu =
5430       at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat)) + 0.0001;
5431   const auto vulkan = cpu.vulkan();
5432 
5433   cpu.log_();
5434   vulkan.log_();
5435 
5436   const auto check = almostEqual(cpu, vulkan.cpu());
5437   if (!check) {
5438     showRtol(cpu, vulkan.cpu());
5439     std::cout << "log_ test failed with input shape: " << input_shape
5440               << std::endl;
5441   }
5442   ASSERT_TRUE(check);
5443 }
5444 
TEST_F(VulkanAPITest,unary_op_log_)5445 TEST_F(VulkanAPITest, unary_op_log_) {
5446   test_log_({5});
5447   test_log_({5, 6});
5448   test_log_({7, 3, 5});
5449   test_log_({11, 1, 4, 2});
5450 }
5451 
test_unsqueeze(const at::IntArrayRef input_shape,int64_t dim)5452 void test_unsqueeze(const at::IntArrayRef input_shape, int64_t dim) {
5453   c10::InferenceMode mode;
5454   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
5455   const auto out_cpu = at::unsqueeze(in_cpu, dim);
5456 
5457   const auto in_vulkan = in_cpu.vulkan();
5458   const auto out_vulkan = at::unsqueeze(in_vulkan, dim);
5459 
5460   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5461   if (!check) {
5462     showRtol(out_cpu, out_vulkan.cpu());
5463     std::cout << "unsqueeze test failed with input shape: "
5464               << input_shape << std::endl;
5465   }
5466   ASSERT_TRUE(check);
5467 }
5468 
TEST_F(VulkanAPITest,unsqueeze_0dto1d_dim0)5469 TEST_F(VulkanAPITest, unsqueeze_0dto1d_dim0) {
5470   test_unsqueeze({}, 0);
5471   test_unsqueeze({}, -1);
5472 }
5473 
TEST_F(VulkanAPITest,unsqueeze_1dto2d_dim0)5474 TEST_F(VulkanAPITest, unsqueeze_1dto2d_dim0) {
5475   test_unsqueeze({5}, 0);
5476   test_unsqueeze({6}, -2);
5477   test_unsqueeze({111}, 0);
5478   test_unsqueeze({112}, -2);
5479 }
5480 
TEST_F(VulkanAPITest,unsqueeze_1dto2d_dim1)5481 TEST_F(VulkanAPITest, unsqueeze_1dto2d_dim1) {
5482   test_unsqueeze({5}, 1);
5483   test_unsqueeze({6}, -1);
5484   test_unsqueeze({111}, 1);
5485   test_unsqueeze({112}, -1);
5486 }
5487 
TEST_F(VulkanAPITest,unsqueeze_2dto3d_dim0)5488 TEST_F(VulkanAPITest, unsqueeze_2dto3d_dim0) {
5489   test_unsqueeze({1, 5}, 2);
5490   test_unsqueeze({5, 7}, 0);
5491   test_unsqueeze({7, 5}, -3);
5492   test_unsqueeze({111, 222}, 0);
5493   test_unsqueeze({222, 111}, -3);
5494 }
5495 
TEST_F(VulkanAPITest,unsqueeze_2dto3d_dim1)5496 TEST_F(VulkanAPITest, unsqueeze_2dto3d_dim1) {
5497   test_unsqueeze({5, 7}, 1);
5498   test_unsqueeze({7, 5}, -2);
5499   test_unsqueeze({111, 222}, 1);
5500   test_unsqueeze({222, 111}, -2);
5501 }
5502 
TEST_F(VulkanAPITest,unsqueeze_2dto3d_dim2)5503 TEST_F(VulkanAPITest, unsqueeze_2dto3d_dim2) {
5504   test_unsqueeze({5, 7}, 2);
5505   test_unsqueeze({7, 5}, -1);
5506   test_unsqueeze({111, 222}, 2);
5507   test_unsqueeze({222, 111}, -1);
5508 }
5509 
TEST_F(VulkanAPITest,unsqueeze_3dto4d_dim0)5510 TEST_F(VulkanAPITest, unsqueeze_3dto4d_dim0) {
5511   test_unsqueeze({2, 3, 4}, 0);
5512   test_unsqueeze({4, 3, 2}, -4);
5513   test_unsqueeze({22, 33, 11}, 0);
5514   test_unsqueeze({33, 11, 22}, -4);
5515 }
5516 
TEST_F(VulkanAPITest,unsqueeze_3dto4d_dim1)5517 TEST_F(VulkanAPITest, unsqueeze_3dto4d_dim1) {
5518   test_unsqueeze({2, 3, 4}, 1);
5519   test_unsqueeze({4, 3, 2}, -3);
5520   test_unsqueeze({22, 33, 11}, 1);
5521   test_unsqueeze({33, 11, 22}, -3);
5522 }
5523 
TEST_F(VulkanAPITest,unsqueeze_3dto4d_dim2)5524 TEST_F(VulkanAPITest, unsqueeze_3dto4d_dim2) {
5525   test_unsqueeze({2, 3, 4}, 2);
5526   test_unsqueeze({4, 3, 2}, -2);
5527   test_unsqueeze({22, 33, 11}, 2);
5528   test_unsqueeze({33, 11, 22}, -2);
5529 }
5530 
TEST_F(VulkanAPITest,unsqueeze_3dto4d_dim3)5531 TEST_F(VulkanAPITest, unsqueeze_3dto4d_dim3) {
5532   test_unsqueeze({1, 5, 2}, 3);
5533   test_unsqueeze({2, 3, 4}, 3);
5534   test_unsqueeze({4, 3, 2}, -1);
5535   test_unsqueeze({22, 33, 11}, 3);
5536   test_unsqueeze({33, 11, 22}, -1);
5537 }
5538 
TEST_F(VulkanAPITest,upsample_nearest2d)5539 TEST_F(VulkanAPITest, upsample_nearest2d) {
5540   const auto in_cpu = at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
5541   const auto out_cpu = at::upsample_nearest2d(in_cpu, {4, 6});
5542 
5543   const auto in_vulkan = in_cpu.vulkan();
5544   const auto out_vulkan = at::upsample_nearest2d(in_vulkan, {4, 6});
5545 
5546   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5547   if (!check) {
5548     showRtol(out_cpu, out_vulkan.cpu());
5549   }
5550 
5551   ASSERT_TRUE(check);
5552 }
5553 
TEST_F(VulkanAPITest,upsample_bilinear2d_align_false_small)5554 TEST_F(VulkanAPITest, upsample_bilinear2d_align_false_small) {
5555   const auto in_cpu = at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
5556   const auto out_cpu = at::upsample_bilinear2d(in_cpu, {4, 6}, false);
5557 
5558   const auto in_vulkan = in_cpu.vulkan();
5559   const auto out_vulkan = at::upsample_bilinear2d(in_vulkan, {4, 6}, false);
5560 
5561   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5562   if (!check) {
5563     showRtol(out_cpu, out_vulkan.cpu());
5564   }
5565 
5566   ASSERT_TRUE(check);
5567 }
5568 
TEST_F(VulkanAPITest,upsample_bilinear2d_align_false_large)5569 TEST_F(VulkanAPITest, upsample_bilinear2d_align_false_large) {
5570   const auto in_cpu = at::rand({1, 7, 25, 25}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
5571   const auto out_cpu = at::upsample_bilinear2d(in_cpu, {45, 45}, false);
5572 
5573   const auto in_vulkan = in_cpu.vulkan();
5574   const auto out_vulkan = at::upsample_bilinear2d(in_vulkan, {45, 45}, false);
5575 
5576   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5577   if (!check) {
5578     showRtol(out_cpu, out_vulkan.cpu());
5579   }
5580 
5581   ASSERT_TRUE(check);
5582 }
5583 
TEST_F(VulkanAPITest,upsample_bilinear2d_align_true_small)5584 TEST_F(VulkanAPITest, upsample_bilinear2d_align_true_small) {
5585   const auto in_cpu = at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
5586   const auto out_cpu = at::upsample_bilinear2d(in_cpu, {4, 6}, true);
5587 
5588   const auto in_vulkan = in_cpu.vulkan();
5589   const auto out_vulkan = at::upsample_bilinear2d(in_vulkan, {4, 6}, true);
5590 
5591   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5592   if (!check) {
5593     showRtol(out_cpu, out_vulkan.cpu());
5594   }
5595 
5596   ASSERT_TRUE(check);
5597 }
5598 
TEST_F(VulkanAPITest,upsample_bilinear2d_align_true_large)5599 TEST_F(VulkanAPITest, upsample_bilinear2d_align_true_large) {
5600   const auto in_cpu = at::rand({1, 7, 25, 25}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
5601   const auto out_cpu = at::upsample_bilinear2d(in_cpu, {45, 45}, true);
5602 
5603   const auto in_vulkan = in_cpu.vulkan();
5604   const auto out_vulkan = at::upsample_bilinear2d(in_vulkan, {45, 45}, true);
5605 
5606   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5607   if (!check) {
5608     showRtol(out_cpu, out_vulkan.cpu());
5609   }
5610 
5611   ASSERT_TRUE(check);
5612 }
5613 
test_unbind(const at::IntArrayRef input_shape,int64_t dim)5614 void test_unbind(const at::IntArrayRef input_shape, int64_t dim) {
5615   const auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
5616   const auto out_cpu = at::unbind(in_cpu, dim);
5617 
5618   const auto in_vulkan = in_cpu.vulkan();
5619   const auto out_vulkan = at::unbind(in_vulkan, dim);
5620 
5621   int64_t size = out_vulkan.size();
5622 
5623   for (const auto i : c10::irange(size)) {
5624     const auto check = almostEqual(out_cpu[i], out_vulkan[i].cpu());
5625     if (!check) {
5626       std::cout << "The unbound tensors at index " << i << " are not equal." << std::endl;
5627       showRtol(out_cpu[i], out_vulkan[i].cpu());
5628     }
5629 
5630     ASSERT_TRUE(check);
5631   }
5632 }
5633 
TEST_F(VulkanAPITest,unbind_3d_depth_small)5634 TEST_F(VulkanAPITest, unbind_3d_depth_small) {
5635   test_unbind({1, 1, 1}, 0);
5636 }
5637 
TEST_F(VulkanAPITest,unbind_3d_depth_medium)5638 TEST_F(VulkanAPITest, unbind_3d_depth_medium) {
5639   test_unbind({3, 2, 5}, 0);
5640 }
5641 
TEST_F(VulkanAPITest,unbind_3d_depth_large)5642 TEST_F(VulkanAPITest, unbind_3d_depth_large) {
5643   test_unbind({100, 1, 144}, 0);
5644 }
5645 
test_var(const at::IntArrayRef input_shape,const at::IntArrayRef dim_list,bool unbiased=true,bool keepdim=false)5646 void test_var(const at::IntArrayRef input_shape, const at::IntArrayRef dim_list, bool unbiased=true, bool keepdim=false) {
5647   c10::InferenceMode mode;
5648 
5649   const auto in_cpu = at::rand(input_shape, at::TensorOptions(at::kCPU).dtype(at::kFloat));
5650   const auto out_cpu = at::var(in_cpu, dim_list, unbiased, keepdim);
5651 
5652   const auto in_vulkan = in_cpu.vulkan();
5653   const auto out_vulkan = at::var(in_vulkan, dim_list, unbiased, keepdim);
5654 
5655   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5656   if (!check) {
5657     showRtol(out_cpu, out_vulkan.cpu());
5658   }
5659 
5660   ASSERT_TRUE(check);
5661 }
5662 
TEST_F(VulkanAPITest,var_2d_unbiased)5663 TEST_F(VulkanAPITest, var_2d_unbiased) {
5664   test_var({3, 5}, {1}, true, true);
5665   test_var({3, 5}, {1}, true, false);
5666 
5667   // When input.dim() == dim_list.size(), only keepdim == true is supported
5668   test_var({3, 5}, {0, 1}, true, true);
5669 }
5670 
TEST_F(VulkanAPITest,var_2d_biased)5671 TEST_F(VulkanAPITest, var_2d_biased) {
5672   test_var({3, 5}, {1}, false, true);
5673   test_var({3, 5}, {1}, false, false);
5674 
5675   // When input.dim() == dim_list.size(), only keepdim == true is supported
5676   test_var({3, 5}, {0, 1}, false, true);
5677 }
5678 
TEST_F(VulkanAPITest,var_3d_unbiased)5679 TEST_F(VulkanAPITest, var_3d_unbiased) {
5680   test_var({3, 5, 7}, {1}, true, true);
5681   test_var({3, 5, 7}, {1}, true, false);
5682 
5683   test_var({3, 5, 7}, {0, 1}, true, true);
5684   test_var({3, 5, 7}, {0, 1}, true, false);
5685 
5686   test_var({3, 5, 7}, {0, 2}, true, true);
5687   test_var({3, 5, 7}, {0, 2}, true, false);
5688 
5689   test_var({3, 5, 7}, {-1, -2}, true, true);
5690   test_var({3, 5, 7}, {-1, -2}, true, false);
5691 
5692   test_var({3, 5, 7}, {0, 1, 2}, true, true);
5693 }
5694 
TEST_F(VulkanAPITest,var_3d_biased)5695 TEST_F(VulkanAPITest, var_3d_biased) {
5696   test_var({3, 5, 7}, {1}, false, true);
5697   test_var({3, 5, 7}, {1}, false, false);
5698 
5699   test_var({3, 5, 7}, {0, 1}, false, true);
5700   test_var({3, 5, 7}, {0, 1}, false, false);
5701 
5702   test_var({3, 5, 7}, {0, 2}, false, true);
5703   test_var({3, 5, 7}, {0, 2}, false, false);
5704 
5705   test_var({3, 5, 7}, {-1, -2}, false, true);
5706   test_var({3, 5, 7}, {-1, -2}, false, false);
5707 
5708   test_var({3, 5, 7}, {0, 1, 2}, false, true);
5709 }
5710 
TEST_F(VulkanAPITest,var_4d_unbiased)5711 TEST_F(VulkanAPITest, var_4d_unbiased) {
5712   test_var({3, 5, 7, 11}, {0}, true, true);
5713   test_var({3, 5, 7, 11}, {1}, true, false);
5714 
5715   test_var({3, 5, 7, 11}, {0, 1}, true, true);
5716   test_var({3, 5, 7, 11}, {0, 1}, true, false);
5717 
5718   test_var({3, 5, 7, 11}, {0, 2}, true, true);
5719   test_var({3, 5, 7, 11}, {0, 2}, true, false);
5720 
5721   test_var({3, 5, 7, 11}, {-1, -2}, true, true);
5722   test_var({3, 5, 7, 11}, {-1, -2}, true, false);
5723 
5724   test_var({3, 5, 7, 11}, {0, 1, 2}, true, true);
5725   test_var({3, 5, 7, 11}, {0, -1, 2}, true, false);
5726 
5727   test_var({3, 5, 7, 11}, {0, 1, 2, 3}, true, true);
5728 }
5729 
TEST_F(VulkanAPITest,var_4d_biased)5730 TEST_F(VulkanAPITest, var_4d_biased) {
5731   test_var({3, 5, 7, 11}, {0}, false, true);
5732   test_var({3, 5, 7, 11}, {1}, false, false);
5733 
5734   test_var({3, 5, 7, 11}, {0, 1}, false, true);
5735   test_var({3, 5, 7, 11}, {0, 1}, false, false);
5736 
5737   test_var({3, 5, 7, 11}, {0, 2}, false, true);
5738   test_var({3, 5, 7, 11}, {0, 2}, false, false);
5739 
5740   test_var({3, 5, 7, 11}, {-1, -2}, false, true);
5741   test_var({3, 5, 7, 11}, {-1, -2}, false, false);
5742 
5743   test_var({3, 5, 7, 11}, {0, 1, 2}, false, true);
5744   test_var({3, 5, 7, 11}, {0, -1, 2}, false, false);
5745 
5746   test_var({3, 5, 7, 11}, {0, 1, 2, 3}, false, true);
5747 }
5748 
TEST_F(VulkanAPITest,view_explicit)5749 TEST_F(VulkanAPITest, view_explicit) {
5750   c10::InferenceMode mode;
5751 
5752   const auto in_cpu = at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat));
5753   const auto in_vulkan = in_cpu.vulkan();
5754 
5755   const std::array<int64_t, 4> shape{7, 8, 9, 1};
5756 
5757   const auto out_cpu = in_cpu.view(shape);
5758   const auto out_vulkan = in_vulkan.view(shape);
5759 
5760   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5761   if (!check) {
5762     showRtol(out_cpu, out_vulkan.cpu());
5763   }
5764 
5765   ASSERT_TRUE(check);
5766 }
5767 
TEST_F(VulkanAPITest,view_inferred)5768 TEST_F(VulkanAPITest, view_inferred) {
5769   c10::InferenceMode mode;
5770 
5771   const auto in_cpu = at::rand({7, 11, 8, 9}, at::device(at::kCPU).dtype(at::kFloat));
5772   const auto in_vulkan = in_cpu.vulkan();
5773 
5774   const std::array<int64_t, 3> shape{7, 11, -1};
5775 
5776   const auto out_cpu = in_cpu.view(shape);
5777   const auto out_vulkan = in_vulkan.view(shape);
5778 
5779   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5780   if (!check) {
5781     showRtol(out_cpu, out_vulkan.cpu());
5782   }
5783 
5784   ASSERT_TRUE(check);
5785 }
5786 
TEST_F(VulkanAPITest,view_invalid_inputs)5787 TEST_F(VulkanAPITest, view_invalid_inputs) {
5788   c10::InferenceMode mode;
5789 
5790   // Act: only one dimension can be inferred
5791   EXPECT_THROW({
5792     at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
5793       .vulkan().view({7, -1, -1});
5794   }, ::std::runtime_error);
5795 
5796   // Act: invalid shape dimension
5797   EXPECT_THROW({
5798     at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
5799       .vulkan().view({7, 8, -2});
5800   }, ::std::exception);
5801 
5802   // Act: incompatible shape
5803   EXPECT_THROW({
5804     at::rand({7, 8, 9}, at::device(at::kCPU).dtype(at::kFloat))
5805       .vulkan().view({7, 70});
5806   }, ::std::runtime_error);
5807 }
5808 
TEST_F(VulkanAPITest,cat_4d_dim0_invalidinputs_exceptions)5809 TEST_F(VulkanAPITest, cat_4d_dim0_invalidinputs_exceptions) {
5810   // Arrange: Vulkan cat inputs must have matching sizes except in the concatenated dimension
5811   {
5812     const auto in_cpu1 = at::rand({3, 5, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
5813     const auto in_cpu2 = at::rand({3, 9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
5814     const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
5815 
5816     // Act
5817     EXPECT_THROW({
5818       const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
5819     }, ::std::exception);
5820   }
5821 
5822   // Arrange: Vulkan cat expects 4 dimensional inputs
5823   {
5824     const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
5825     const auto in_cpu2 = at::rand({9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
5826     const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
5827 
5828     // Act
5829     EXPECT_THROW({
5830       const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
5831     }, ::std::exception);
5832   }
5833 }
5834 
TEST_F(VulkanAPITest,cat_4d_dim0_samebatch_success)5835 TEST_F(VulkanAPITest, cat_4d_dim0_samebatch_success) {
5836   // Arrange
5837   const auto in_cpu1 = at::rand({221, 3, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
5838   const auto in_cpu2 = at::rand({221, 3, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
5839   const auto in_cpu3 = at::rand({221, 3, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
5840 
5841   // Act
5842   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
5843   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0); // dim=batch
5844 
5845   // Assert
5846   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5847   if (!check) {
5848     showRtol(out_cpu, out_vulkan.cpu());
5849   }
5850 
5851   ASSERT_TRUE(check);
5852 }
5853 
TEST_F(VulkanAPITest,cat_4d_dim0_diffbatch_success)5854 TEST_F(VulkanAPITest, cat_4d_dim0_diffbatch_success) {
5855   // Arrange
5856   const auto in_cpu1 = at::rand({221, 3, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
5857   const auto in_cpu2 = at::rand({117, 3, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
5858   const auto in_cpu3 = at::rand({139, 3, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
5859 
5860   // Act
5861   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
5862   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0); // dim=batch
5863 
5864   // Assert
5865   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5866   if (!check) {
5867     showRtol(out_cpu, out_vulkan.cpu());
5868   }
5869 
5870   ASSERT_TRUE(check);
5871 }
5872 
TEST_F(VulkanAPITest,cat_4d_dim0_singledepth_success)5873 TEST_F(VulkanAPITest, cat_4d_dim0_singledepth_success) {
5874   // Arrange: batch x channel (1x1) = single depth texture
5875   const auto in_cpu1 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
5876   const auto in_cpu2 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
5877   const auto in_cpu3 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
5878 
5879   // Act
5880   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
5881   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0); // dim=batch
5882 
5883   // Assert
5884   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5885   if (!check) {
5886     showRtol(out_cpu, out_vulkan.cpu());
5887   }
5888 
5889   ASSERT_TRUE(check);
5890 }
5891 
TEST_F(VulkanAPITest,cat_4d_dim0_singletensor_success)5892 TEST_F(VulkanAPITest, cat_4d_dim0_singletensor_success) {
5893   // Arrange: single input tensor
5894   const auto in_cpu1 = at::rand({3, 7, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
5895 
5896   // Act
5897   const auto out_cpu = at::cat({in_cpu1}, 0);
5898   const auto out_vulkan = at::cat({in_cpu1.vulkan()}, 0); // dim=batch
5899 
5900   // Assert
5901   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5902   if (!check) {
5903     showRtol(out_cpu, out_vulkan.cpu());
5904   }
5905 
5906   ASSERT_TRUE(check);
5907 }
5908 
TEST_F(VulkanAPITest,cat_4d_dim0_twotensors_success)5909 TEST_F(VulkanAPITest, cat_4d_dim0_twotensors_success) {
5910   // Arrange: two input tensors
5911   const auto in_cpu1 = at::rand({3, 7, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
5912   const auto in_cpu2 = at::rand({3, 7, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
5913 
5914   // Act
5915   const auto out_cpu = at::cat({in_cpu1, in_cpu2}, 0);
5916   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan()}, 0); // dim=batch
5917 
5918   // Assert
5919   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5920   if (!check) {
5921     showRtol(out_cpu, out_vulkan.cpu());
5922   }
5923 
5924   ASSERT_TRUE(check);
5925 }
5926 
TEST_F(VulkanAPITest,cat_4d_dim0_negdim_success)5927 TEST_F(VulkanAPITest, cat_4d_dim0_negdim_success) {
5928   // Arrange
5929   const auto in_cpu1 = at::rand({221, 9, 193, 3}, at::device(at::kCPU).dtype(at::kFloat));
5930   const auto in_cpu2 = at::rand({113, 9, 193, 3}, at::device(at::kCPU).dtype(at::kFloat));
5931   const auto in_cpu3 = at::rand({331, 9, 193, 3}, at::device(at::kCPU).dtype(at::kFloat));
5932 
5933   // Act
5934   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -4);
5935   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -4);
5936 
5937   // Assert
5938   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5939   if (!check) {
5940     showRtol(out_cpu, out_vulkan.cpu());
5941   }
5942 
5943   ASSERT_TRUE(check);
5944 }
5945 
TEST_F(VulkanAPITest,cat_4d_dim1_negdim_success)5946 TEST_F(VulkanAPITest, cat_4d_dim1_negdim_success) {
5947   // Arrange
5948   const auto in_cpu1 = at::rand({9, 221, 193, 3}, at::device(at::kCPU).dtype(at::kFloat));
5949   const auto in_cpu2 = at::rand({9, 113, 193, 3}, at::device(at::kCPU).dtype(at::kFloat));
5950   const auto in_cpu3 = at::rand({9, 331, 193, 3}, at::device(at::kCPU).dtype(at::kFloat));
5951 
5952   // Act
5953   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -3);
5954   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -3);
5955 
5956   // Assert
5957   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5958   if (!check) {
5959     showRtol(out_cpu, out_vulkan.cpu());
5960   }
5961 
5962   ASSERT_TRUE(check);
5963 }
5964 
TEST_F(VulkanAPITest,cat_4d_dim2_negdim_success)5965 TEST_F(VulkanAPITest, cat_4d_dim2_negdim_success) {
5966   // Arrange
5967   const auto in_cpu1 = at::rand({9, 193, 221, 3}, at::device(at::kCPU).dtype(at::kFloat));
5968   const auto in_cpu2 = at::rand({9, 193, 113, 3}, at::device(at::kCPU).dtype(at::kFloat));
5969   const auto in_cpu3 = at::rand({9, 193, 331, 3}, at::device(at::kCPU).dtype(at::kFloat));
5970 
5971   // Act
5972   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -2);
5973   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -2);
5974 
5975   // Assert
5976   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5977   if (!check) {
5978     showRtol(out_cpu, out_vulkan.cpu());
5979   }
5980 
5981   ASSERT_TRUE(check);
5982 }
5983 
TEST_F(VulkanAPITest,cat_4d_dim3_negdim_success)5984 TEST_F(VulkanAPITest, cat_4d_dim3_negdim_success) {
5985   // Arrange
5986   const auto in_cpu1 = at::rand({9, 193, 3, 221}, at::device(at::kCPU).dtype(at::kFloat));
5987   const auto in_cpu2 = at::rand({9, 193, 3, 113}, at::device(at::kCPU).dtype(at::kFloat));
5988   const auto in_cpu3 = at::rand({9, 193, 3, 331}, at::device(at::kCPU).dtype(at::kFloat));
5989 
5990   // Act
5991   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -1);
5992   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -1);
5993 
5994   // Assert
5995   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
5996   if (!check) {
5997     showRtol(out_cpu, out_vulkan.cpu());
5998   }
5999 
6000   ASSERT_TRUE(check);
6001 }
6002 
6003 #if !defined(__APPLE__)
TEST_F(VulkanAPITest,DISABLED_cat_4d_dim1_samefeature_success)6004 TEST_F(VulkanAPITest, DISABLED_cat_4d_dim1_samefeature_success) {
6005   // Arrange
6006   const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6007   const auto in_cpu2 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6008   const auto in_cpu3 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6009 
6010   // Act
6011   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6012   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1); // dim=feature(channel)
6013 
6014   // Assert
6015   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6016   if (!check) {
6017     showRtol(out_cpu, out_vulkan.cpu());
6018   }
6019 
6020   ASSERT_TRUE(check);
6021 }
6022 
TEST_F(VulkanAPITest,DISABLED_cat_4d_dim1_difffeature_success)6023 TEST_F(VulkanAPITest, DISABLED_cat_4d_dim1_difffeature_success) {
6024   // Arrange
6025   const auto in_cpu1 = at::rand({3, 3, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6026   const auto in_cpu2 = at::rand({3, 8, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6027   const auto in_cpu3 = at::rand({3, 11, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6028 
6029   // Act
6030   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6031   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1); // dim=feature(channel)
6032 
6033   // Assert
6034   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6035   if (!check) {
6036     showRtol(out_cpu, out_vulkan.cpu());
6037   }
6038 
6039   ASSERT_TRUE(check);
6040 }
6041 
TEST_F(VulkanAPITest,cat_4d_dim1_texture2d_success)6042 TEST_F(VulkanAPITest, cat_4d_dim1_texture2d_success) {
6043   // Arrange: 2D Texture (VK_IMAGE_VIEW_TYPE_2D)
6044   const auto in_cpu1 = at::rand({2, 3, 2, 2}, at::device(at::kCPU).dtype(at::kFloat));
6045   const auto in_cpu2 = at::rand({2, 3, 2, 2}, at::device(at::kCPU).dtype(at::kFloat));
6046   const auto in_cpu3 = at::rand({2, 3, 2, 2}, at::device(at::kCPU).dtype(at::kFloat));
6047 
6048   // Act
6049   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6050   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1); // dim=feature(channel)
6051 
6052   // Assert
6053   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6054   if (!check) {
6055     showRtol(out_cpu, out_vulkan.cpu());
6056   }
6057 
6058   ASSERT_TRUE(check);
6059 }
6060 #endif /* !defined(__APPLE__) */
6061 
TEST_F(VulkanAPITest,cat_4d_dim1_singledepth_success)6062 TEST_F(VulkanAPITest, cat_4d_dim1_singledepth_success) {
6063   // Arrange: batch x channel (1x1) = single depth texture
6064   const auto in_cpu1 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6065   const auto in_cpu2 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6066   const auto in_cpu3 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6067 
6068   // Act
6069   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6070   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1); // dim=feature(channel)
6071 
6072   // Assert
6073   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6074   if (!check) {
6075     showRtol(out_cpu, out_vulkan.cpu());
6076   }
6077 
6078   ASSERT_TRUE(check);
6079 }
6080 
TEST_F(VulkanAPITest,cat_4d_dim1_singletensor_success)6081 TEST_F(VulkanAPITest, cat_4d_dim1_singletensor_success) {
6082   // Arrange: single input tensor
6083   const auto in_cpu1 = at::rand({3, 7, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6084 
6085   // Act
6086   const auto out_cpu = at::cat({in_cpu1}, 1);
6087   const auto out_vulkan = at::cat({in_cpu1.vulkan()}, 1); // dim=feature(channel)
6088 
6089   // Assert
6090   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6091   if (!check) {
6092     showRtol(out_cpu, out_vulkan.cpu());
6093   }
6094 
6095   ASSERT_TRUE(check);
6096 }
6097 
TEST_F(VulkanAPITest,DISABLED_cat_4d_dim1_twotensors_success)6098 TEST_F(VulkanAPITest, DISABLED_cat_4d_dim1_twotensors_success) {
6099   // Arrange: two input tensors
6100   const auto in_cpu1 = at::rand({3, 7, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6101   const auto in_cpu2 = at::rand({3, 7, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6102 
6103   // Act
6104   const auto out_cpu = at::cat({in_cpu1, in_cpu2}, 1);
6105   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan()}, 1); // dim=feature(channel)
6106 
6107   // Assert
6108   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6109   if (!check) {
6110     showRtol(out_cpu, out_vulkan.cpu());
6111   }
6112 
6113   ASSERT_TRUE(check);
6114 }
6115 
TEST_F(VulkanAPITest,cat_4d_dim1_bat1_mult4ch_success)6116 TEST_F(VulkanAPITest, cat_4d_dim1_bat1_mult4ch_success) {
6117   // Arrange: batch=1 and a channel count that is a multiple of 4 (channel % 4 == 0)
6118   const auto in_cpu1 = at::rand({1, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6119   const auto in_cpu2 = at::rand({1, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6120   const auto in_cpu3 = at::rand({1, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6121 
6122   // Act
6123   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6124   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1); // dim=feature(channel)
6125 
6126   // Assert
6127   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6128   if (!check) {
6129     showRtol(out_cpu, out_vulkan.cpu());
6130   }
6131 
6132   ASSERT_TRUE(check);
6133 }
6134 
TEST_F(VulkanAPITest,cat_4d_dim1_bat2_mult4ch_success)6135 TEST_F(VulkanAPITest, cat_4d_dim1_bat2_mult4ch_success) {
6136   // Arrange: batch=2 and a channel count that is a multiple of 4 (channel % 4 == 0)
6137   const auto in_cpu1 = at::rand({2, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6138   const auto in_cpu2 = at::rand({2, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6139   const auto in_cpu3 = at::rand({2, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6140 
6141   // Act
6142   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6143   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1); // dim=feature(channel)
6144 
6145   // Assert
6146   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6147   if (!check) {
6148     showRtol(out_cpu, out_vulkan.cpu());
6149   }
6150 
6151   ASSERT_TRUE(check);
6152 }
6153 
TEST_F(VulkanAPITest,cat_4d_dim1_mult4ch_mixed_success)6154 TEST_F(VulkanAPITest, cat_4d_dim1_mult4ch_mixed_success) {
6155   // Arrange: batch=3 and channel counts that are different multiples of 4 (channel % 4 == 0)
6156   const auto in_cpu1 = at::rand({3, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6157   const auto in_cpu2 = at::rand({3, 8, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6158   const auto in_cpu3 = at::rand({3, 12, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6159 
6160   // Act
6161   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6162   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1); // dim=feature(channel)
6163 
6164   // Assert
6165   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6166   if (!check) {
6167     showRtol(out_cpu, out_vulkan.cpu());
6168   }
6169 
6170   ASSERT_TRUE(check);
6171 }
6172 
TEST_F(VulkanAPITest,DISABLED_cat_4d_dim1_mult4ch_nonmult4ch_success)6173 TEST_F(VulkanAPITest, DISABLED_cat_4d_dim1_mult4ch_nonmult4ch_success) {
6174   // Arrange: batch=3 and a mix of channel counts that are multiples and non-multiples of 4
6175   const auto in_cpu1 = at::rand({3, 3, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6176   const auto in_cpu2 = at::rand({3, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6177   const auto in_cpu3 = at::rand({3, 7, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6178   const auto in_cpu4 = at::rand({3, 8, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6179 
6180   // Act
6181   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3, in_cpu4}, 1);
6182   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan(), in_cpu4.vulkan()}, 1); // dim=feature(channel)
6183 
6184   // Assert
6185   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6186   if (!check) {
6187     showRtol(out_cpu, out_vulkan.cpu());
6188   }
6189 
6190   ASSERT_TRUE(check);
6191 }
6192 
TEST_F(VulkanAPITest,cat_4d_dim2_sameheight_success)6193 TEST_F(VulkanAPITest, cat_4d_dim2_sameheight_success) {
6194   // Arrange
6195   const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6196   const auto in_cpu2 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6197   const auto in_cpu3 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6198 
6199   // Act
6200   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 2);
6201   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 2);
6202 
6203   // Assert
6204   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6205   if (!check) {
6206     showRtol(out_cpu, out_vulkan.cpu());
6207   }
6208 
6209   ASSERT_TRUE(check);
6210 }
6211 
TEST_F(VulkanAPITest,cat_4d_dim2_diffheight_success)6212 TEST_F(VulkanAPITest, cat_4d_dim2_diffheight_success) {
6213   // Arrange
6214   const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6215   const auto in_cpu2 = at::rand({3, 9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
6216   const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
6217 
6218   // Act
6219   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 2);
6220   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 2);
6221 
6222   // Assert
6223   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6224   if (!check) {
6225     showRtol(out_cpu, out_vulkan.cpu());
6226   }
6227 
6228   ASSERT_TRUE(check);
6229 }
6230 
TEST_F(VulkanAPITest,cat_4d_dim2_singledepth_success)6231 TEST_F(VulkanAPITest, cat_4d_dim2_singledepth_success) {
6232   // Arrange: batch x channel (1x1) = single depth texture
6233   const auto in_cpu1 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6234   const auto in_cpu2 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6235   const auto in_cpu3 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6236 
6237   // Act
6238   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 2);
6239   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 2);
6240 
6241   // Assert
6242   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6243   if (!check) {
6244     showRtol(out_cpu, out_vulkan.cpu());
6245   }
6246 
6247   ASSERT_TRUE(check);
6248 }
6249 
TEST_F(VulkanAPITest,cat_4d_dim2_invalidinputs_exceptions)6250 TEST_F(VulkanAPITest, cat_4d_dim2_invalidinputs_exceptions) {
6251   // Arrange: Vulkan cat inputs must have matching sizes except in the concatenated dimension
6252   {
6253     const auto in_cpu1 = at::rand({3, 5, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6254     const auto in_cpu2 = at::rand({3, 9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
6255     const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
6256 
6257     // Act
6258     EXPECT_THROW({
6259       const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 2);
6260     }, ::std::exception);
6261   }
6262 
6263   // Arrange: Vulkan cat expects inputs with the same number of dimensions
6264   {
6265     const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6266     const auto in_cpu2 = at::rand({9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
6267     const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
6268 
6269     // Act
6270     EXPECT_THROW({
6271       const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 2);
6272     }, ::std::exception);
6273   }
6274 }
6275 
TEST_F(VulkanAPITest,cat_4d_dim3_invalidinputs_exceptions)6276 TEST_F(VulkanAPITest, cat_4d_dim3_invalidinputs_exceptions) {
6277   // Arrange: Vulkan cat inputs must have matching sizes except in the concatenated dimension
6278   {
6279     const auto in_cpu1 = at::rand({3, 5, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6280     const auto in_cpu2 = at::rand({3, 9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
6281     const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
6282 
6283     // Act
6284     EXPECT_THROW({
6285       const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 3);
6286     }, ::std::exception);
6287   }
6288 
6289   // Arrange: Vulkan cat expects 4-dimensional inputs
6290   {
6291     const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6292     const auto in_cpu2 = at::rand({9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
6293     const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
6294 
6295     // Act
6296     EXPECT_THROW({
6297       const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 3);
6298     }, ::std::exception);
6299   }
6300 }
6301 
TEST_F(VulkanAPITest,cat_4d_dim3_samewidth_success)6302 TEST_F(VulkanAPITest, cat_4d_dim3_samewidth_success) {
6303   // Arrange
6304   const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6305   const auto in_cpu2 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6306   const auto in_cpu3 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6307 
6308   // Act
6309   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 3);
6310   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 3);
6311 
6312   // Assert
6313   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6314   if (!check) {
6315     showRtol(out_cpu, out_vulkan.cpu());
6316   }
6317 
6318   ASSERT_TRUE(check);
6319 }
6320 
TEST_F(VulkanAPITest,cat_4d_dim3_diffwidth_success)6321 TEST_F(VulkanAPITest, cat_4d_dim3_diffwidth_success) {
6322   // Arrange
6323   const auto in_cpu1 = at::rand({3, 9, 193, 221}, at::device(at::kCPU).dtype(at::kFloat));
6324   const auto in_cpu2 = at::rand({3, 9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6325   const auto in_cpu3 = at::rand({3, 9, 193, 331}, at::device(at::kCPU).dtype(at::kFloat));
6326 
6327   // Act
6328   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 3);
6329   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 3);
6330 
6331   // Assert
6332   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6333   if (!check) {
6334     showRtol(out_cpu, out_vulkan.cpu());
6335   }
6336 
6337   ASSERT_TRUE(check);
6338 }
6339 
TEST_F(VulkanAPITest,cat_3d_dim0_mult4ch_success)6340 TEST_F(VulkanAPITest, cat_3d_dim0_mult4ch_success) {
6341   // Arrange
6342   const auto in_cpu1 = at::rand({4, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6343   const auto in_cpu2 = at::rand({4, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6344   const auto in_cpu3 = at::rand({4, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6345 
6346   // Act
6347   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
6348   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
6349 
6350   // Assert
6351   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6352   if (!check) {
6353     showRtol(out_cpu, out_vulkan.cpu());
6354   }
6355 
6356   ASSERT_TRUE(check);
6357 }
6358 
TEST_F(VulkanAPITest,cat_3d_dim0_diff_channel_success)6359 TEST_F(VulkanAPITest, cat_3d_dim0_diff_channel_success) {
6360   // Arrange
6361   const auto in_cpu1 = at::rand({221, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
6362   const auto in_cpu2 = at::rand({113, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
6363   const auto in_cpu3 = at::rand({331, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
6364 
6365   // Act
6366   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
6367   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
6368 
6369   // Assert
6370   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6371   if (!check) {
6372     showRtol(out_cpu, out_vulkan.cpu());
6373   }
6374 
6375   ASSERT_TRUE(check);
6376 }
6377 
TEST_F(VulkanAPITest,cat_3d_dim0_same_channel_success)6378 TEST_F(VulkanAPITest, cat_3d_dim0_same_channel_success) {
6379   // Arrange
6380   const auto in_cpu1 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6381   const auto in_cpu2 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6382   const auto in_cpu3 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6383 
6384   // Act
6385   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
6386   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
6387 
6388   // Assert
6389   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6390   if (!check) {
6391     showRtol(out_cpu, out_vulkan.cpu());
6392   }
6393 
6394   ASSERT_TRUE(check);
6395 }
6396 
TEST_F(VulkanAPITest,cat_3d_dim1_diffheight_success)6397 TEST_F(VulkanAPITest, cat_3d_dim1_diffheight_success) {
6398   // Arrange
6399   const auto in_cpu1 = at::rand({9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6400   const auto in_cpu2 = at::rand({9, 113, 193}, at::device(at::kCPU).dtype(at::kFloat));
6401   const auto in_cpu3 = at::rand({9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
6402 
6403   // Act
6404   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6405   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1);
6406 
6407   // Assert
6408   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6409   if (!check) {
6410     showRtol(out_cpu, out_vulkan.cpu());
6411   }
6412 
6413   ASSERT_TRUE(check);
6414 }
6415 
TEST_F(VulkanAPITest,cat_3d_dim1_same_height_success)6416 TEST_F(VulkanAPITest, cat_3d_dim1_same_height_success) {
6417   // Arrange
6418   const auto in_cpu1 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6419   const auto in_cpu2 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6420   const auto in_cpu3 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6421 
6422   // Act
6423   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6424   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1);
6425 
6426   // Assert
6427   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6428   if (!check) {
6429     showRtol(out_cpu, out_vulkan.cpu());
6430   }
6431 
6432   ASSERT_TRUE(check);
6433 }
6434 
TEST_F(VulkanAPITest,cat_3d_dim2_diffwidth_success)6435 TEST_F(VulkanAPITest, cat_3d_dim2_diffwidth_success) {
6436   // Arrange
6437   const auto in_cpu1 = at::rand({9, 193, 221}, at::device(at::kCPU).dtype(at::kFloat));
6438   const auto in_cpu2 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6439   const auto in_cpu3 = at::rand({9, 193, 331}, at::device(at::kCPU).dtype(at::kFloat));
6440 
6441   // Act
6442   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 2);
6443   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 2);
6444 
6445   // Assert
6446   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6447   if (!check) {
6448     showRtol(out_cpu, out_vulkan.cpu());
6449   }
6450 
6451   ASSERT_TRUE(check);
6452 }
6453 
TEST_F(VulkanAPITest,cat_3d_dim2_samewidth_success)6454 TEST_F(VulkanAPITest, cat_3d_dim2_samewidth_success) {
6455   // Arrange
6456   const auto in_cpu1 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6457   const auto in_cpu2 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6458   const auto in_cpu3 = at::rand({9, 193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6459 
6460   // Act
6461   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 2);
6462   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 2);
6463 
6464   // Assert
6465   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6466   if (!check) {
6467     showRtol(out_cpu, out_vulkan.cpu());
6468   }
6469 
6470   ASSERT_TRUE(check);
6471 }
6472 
TEST_F(VulkanAPITest,cat_3d_dim0_negdim_success)6473 TEST_F(VulkanAPITest, cat_3d_dim0_negdim_success) {
6474   // Arrange
6475   const auto in_cpu1 = at::rand({221, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
6476   const auto in_cpu2 = at::rand({113, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
6477   const auto in_cpu3 = at::rand({331, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
6478 
6479   // Act
6480   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -3);
6481   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -3);
6482 
6483   // Assert
6484   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6485   if (!check) {
6486     showRtol(out_cpu, out_vulkan.cpu());
6487   }
6488 
6489   ASSERT_TRUE(check);
6490 }
6491 
TEST_F(VulkanAPITest,cat_3d_dim1_negdim_success)6492 TEST_F(VulkanAPITest, cat_3d_dim1_negdim_success) {
6493   // Arrange
6494   const auto in_cpu1 = at::rand({9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
6495   const auto in_cpu2 = at::rand({9, 113, 193}, at::device(at::kCPU).dtype(at::kFloat));
6496   const auto in_cpu3 = at::rand({9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
6497 
6498   // Act
6499   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -2);
6500   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -2);
6501 
6502   // Assert
6503   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6504   if (!check) {
6505     showRtol(out_cpu, out_vulkan.cpu());
6506   }
6507 
6508   ASSERT_TRUE(check);
6509 }
6510 
TEST_F(VulkanAPITest,cat_3d_dim2_negdim_success)6511 TEST_F(VulkanAPITest, cat_3d_dim2_negdim_success) {
6512   // Arrange
6513   const auto in_cpu1 = at::rand({193, 13, 89}, at::device(at::kCPU).dtype(at::kFloat));
6514   const auto in_cpu2 = at::rand({193, 13, 59}, at::device(at::kCPU).dtype(at::kFloat));
6515   const auto in_cpu3 = at::rand({193, 13, 67}, at::device(at::kCPU).dtype(at::kFloat));
6516 
6517   // Act
6518   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -1);
6519   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -1);
6520 
6521   // Assert
6522   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6523   if (!check) {
6524     showRtol(out_cpu, out_vulkan.cpu());
6525   }
6526 
6527   ASSERT_TRUE(check);
6528 }
6529 
TEST_F(VulkanAPITest,cat_2d_dim0_same_height_success)6530 TEST_F(VulkanAPITest, cat_2d_dim0_same_height_success) {
6531   // Arrange
6532   const auto in_cpu1 = at::rand({193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6533   const auto in_cpu2 = at::rand({193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6534   const auto in_cpu3 = at::rand({193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6535 
6536   // Act
6537   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
6538   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
6539 
6540   // Assert
6541   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6542   if (!check) {
6543     showRtol(out_cpu, out_vulkan.cpu());
6544   }
6545 
6546   ASSERT_TRUE(check);
6547 }
6548 
TEST_F(VulkanAPITest,cat_2d_dim0_diff_height_success)6549 TEST_F(VulkanAPITest, cat_2d_dim0_diff_height_success) {
6550   // Arrange
6551   const auto in_cpu1 = at::rand({193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6552   const auto in_cpu2 = at::rand({191, 113}, at::device(at::kCPU).dtype(at::kFloat));
6553   const auto in_cpu3 = at::rand({137, 113}, at::device(at::kCPU).dtype(at::kFloat));
6554 
6555   // Act
6556   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
6557   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
6558 
6559   // Assert
6560   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6561   if (!check) {
6562     showRtol(out_cpu, out_vulkan.cpu());
6563   }
6564 
6565   ASSERT_TRUE(check);
6566 }
6567 
TEST_F(VulkanAPITest,cat_2d_dim1_same_width_success)6568 TEST_F(VulkanAPITest, cat_2d_dim1_same_width_success) {
6569   // Arrange
6570   const auto in_cpu1 = at::rand({193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6571   const auto in_cpu2 = at::rand({193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6572   const auto in_cpu3 = at::rand({193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6573 
6574   // Act
6575   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6576   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1);
6577 
6578   // Assert
6579   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6580   if (!check) {
6581     showRtol(out_cpu, out_vulkan.cpu());
6582   }
6583 
6584   ASSERT_TRUE(check);
6585 }
6586 
TEST_F(VulkanAPITest,cat_2d_dim1_diff_width_success)6587 TEST_F(VulkanAPITest, cat_2d_dim1_diff_width_success) {
6588   // Arrange
6589   const auto in_cpu1 = at::rand({193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6590   const auto in_cpu2 = at::rand({193, 131}, at::device(at::kCPU).dtype(at::kFloat));
6591   const auto in_cpu3 = at::rand({193, 127}, at::device(at::kCPU).dtype(at::kFloat));
6592 
6593   // Act
6594   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 1);
6595   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1);
6596 
6597   // Assert
6598   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6599   if (!check) {
6600     showRtol(out_cpu, out_vulkan.cpu());
6601   }
6602 
6603   ASSERT_TRUE(check);
6604 }
6605 
TEST_F(VulkanAPITest,cat_2d_dim0_negdim_success)6606 TEST_F(VulkanAPITest, cat_2d_dim0_negdim_success) {
6607   // Arrange
6608   const auto in_cpu1 = at::rand({113, 193}, at::device(at::kCPU).dtype(at::kFloat));
6609   const auto in_cpu2 = at::rand({131, 193}, at::device(at::kCPU).dtype(at::kFloat));
6610   const auto in_cpu3 = at::rand({127, 193}, at::device(at::kCPU).dtype(at::kFloat));
6611 
6612   // Act
6613   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -2);
6614   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -2);
6615 
6616   // Assert
6617   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6618   if (!check) {
6619     showRtol(out_cpu, out_vulkan.cpu());
6620   }
6621 
6622   ASSERT_TRUE(check);
6623 }
6624 
TEST_F(VulkanAPITest,cat_2d_dim1_negdim_success)6625 TEST_F(VulkanAPITest, cat_2d_dim1_negdim_success) {
6626   // Arrange
6627   const auto in_cpu1 = at::rand({193, 113}, at::device(at::kCPU).dtype(at::kFloat));
6628   const auto in_cpu2 = at::rand({193, 131}, at::device(at::kCPU).dtype(at::kFloat));
6629   const auto in_cpu3 = at::rand({193, 127}, at::device(at::kCPU).dtype(at::kFloat));
6630 
6631   // Act
6632   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -1);
6633   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -1);
6634 
6635   // Assert
6636   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6637   if (!check) {
6638     showRtol(out_cpu, out_vulkan.cpu());
6639   }
6640 
6641   ASSERT_TRUE(check);
6642 }
6643 
TEST_F(VulkanAPITest,cat_1d_dim0_same_width_success)6644 TEST_F(VulkanAPITest, cat_1d_dim0_same_width_success) {
6645   // Arrange
6646   const auto in_cpu1 = at::rand({193}, at::device(at::kCPU).dtype(at::kFloat));
6647   const auto in_cpu2 = at::rand({193}, at::device(at::kCPU).dtype(at::kFloat));
6648   const auto in_cpu3 = at::rand({193}, at::device(at::kCPU).dtype(at::kFloat));
6649 
6650   // Act
6651   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
6652   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
6653 
6654   // Assert
6655   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6656   if (!check) {
6657     showRtol(out_cpu, out_vulkan.cpu());
6658   }
6659 
6660   ASSERT_TRUE(check);
6661 }
6662 
TEST_F(VulkanAPITest,cat_1d_dim0_diff_width_success)6663 TEST_F(VulkanAPITest, cat_1d_dim0_diff_width_success) {
6664   // Arrange
6665   const auto in_cpu1 = at::rand({193}, at::device(at::kCPU).dtype(at::kFloat));
6666   const auto in_cpu2 = at::rand({137}, at::device(at::kCPU).dtype(at::kFloat));
6667   const auto in_cpu3 = at::rand({131}, at::device(at::kCPU).dtype(at::kFloat));
6668 
6669   // Act
6670   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, 0);
6671   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
6672 
6673   // Assert
6674   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6675   if (!check) {
6676     showRtol(out_cpu, out_vulkan.cpu());
6677   }
6678 
6679   ASSERT_TRUE(check);
6680 }
6681 
TEST_F(VulkanAPITest,cat_1d_dim0_negdim_success)6682 TEST_F(VulkanAPITest, cat_1d_dim0_negdim_success) {
6683   // Arrange
6684   const auto in_cpu1 = at::rand({193}, at::device(at::kCPU).dtype(at::kFloat));
6685   const auto in_cpu2 = at::rand({137}, at::device(at::kCPU).dtype(at::kFloat));
6686   const auto in_cpu3 = at::rand({131}, at::device(at::kCPU).dtype(at::kFloat));
6687 
6688   // Act
6689   const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -1);
6690   const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -1);
6691 
6692   // Assert
6693   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6694   if (!check) {
6695     showRtol(out_cpu, out_vulkan.cpu());
6696   }
6697 
6698   ASSERT_TRUE(check);
6699 }
6700 
TEST_F(VulkanAPITest,permute_2d_success)6701 TEST_F(VulkanAPITest, permute_2d_success) {
6702   // Arrange
6703   const auto in_cpu = at::rand({2, 3}, at::device(at::kCPU).dtype(at::kFloat));
6704 
6705   // Act
6706   const auto out_cpu = at::permute(in_cpu, {1, 0});
6707   const auto out_vulkan = at::permute(in_cpu.vulkan(), {1, 0});
6708 
6709   // Assert
6710   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6711   if (!check) {
6712     showRtol(out_cpu, out_vulkan.cpu());
6713   }
6714 
6715   ASSERT_TRUE(check);
6716 }
6717 
TEST_F(VulkanAPITest,permute_3d_success)6718 TEST_F(VulkanAPITest, permute_3d_success) {
6719   // Arrange
6720   const auto in_cpu = at::rand({2, 3, 2}, at::device(at::kCPU).dtype(at::kFloat));
6721   std::vector<std::vector<int64_t>> all_dims;
6722   std::vector<int64_t> in{0, 1, 2};
6723   gen_allpermutations(all_dims, in, 0);
6724 
6725   for (const auto i : c10::irange(1, all_dims.size())) {
6726     const auto dims = all_dims[i];
6727 
6728     // Act
6729     const auto out_cpu = at::permute(in_cpu, dims);
6730     const auto out_vulkan = at::permute(in_cpu.vulkan(), dims);
6731 
6732     // Assert
6733     const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6734     if (!check) {
6735       showRtol(out_cpu, out_vulkan.cpu());
6736     }
6737 
6738     ASSERT_TRUE(check);
6739   }
6740 }
6741 
TEST_F(VulkanAPITest,permute_4d_success)6742 TEST_F(VulkanAPITest, permute_4d_success) {
6743   // Arrange
6744   const auto in_cpu = at::rand({2, 3, 4, 5}, at::device(at::kCPU).dtype(at::kFloat));
6745   std::vector<std::vector<int64_t>> all_dims;
6746   std::vector<int64_t> in{0, 1, 2, 3};
6747   gen_allpermutations(all_dims, in, 0);
6748 
6749   for (const auto i : c10::irange(1, all_dims.size())) {
6750     const auto dims = all_dims[i];
6751 
6752     // Act
6753     const auto out_cpu = at::permute(in_cpu, dims);
6754     const auto out_vulkan = at::permute(in_cpu.vulkan(), dims);
6755 
6756     // Assert
6757     const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6758     if (!check) {
6759       showRtol(out_cpu, out_vulkan.cpu());
6760     }
6761 
6762     ASSERT_TRUE(check);
6763   }
6764 }
6765 
TEST_F(VulkanAPITest,permute_4dmclaren_success)6766 TEST_F(VulkanAPITest, permute_4dmclaren_success) {
6767   // Arrange: McLaren Model usage
6768   const auto in_cpu = at::rand({1, 2, 1, 161}, at::device(at::kCPU).dtype(at::kFloat));
6769 
6770   // Act
6771   const auto out_cpu = at::permute(in_cpu, {0, 2, 1, 3});
6772   const auto out_vulkan = at::permute(in_cpu.vulkan(), {0, 2, 1, 3});
6773 
6774   // Assert
6775   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6776   if (!check) {
6777     showRtol(out_cpu, out_vulkan.cpu());
6778   }
6779 
6780   ASSERT_TRUE(check);
6781 }
6782 
TEST_F(VulkanAPITest,permute_4dbig_success)6783 TEST_F(VulkanAPITest, permute_4dbig_success) {
6784   // Arrange
6785   const auto in_cpu = at::rand({3, 9, 51, 41}, at::device(at::kCPU).dtype(at::kFloat));
6786   std::vector<std::vector<int64_t>> all_dims;
6787   std::vector<int64_t> in{0, 1, 2, 3};
6788   gen_allpermutations(all_dims, in, 0);
6789 
6790   for (const auto i : c10::irange(1, all_dims.size())) {
6791     const auto dims = all_dims[i];
6792     // Act
6793     const auto out_cpu = at::permute(in_cpu, dims);
6794     const auto out_vulkan = at::permute(in_cpu.vulkan(), dims);
6795 
6796     // Assert
6797     const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6798     if (!check) {
6799       showRtol(out_cpu, out_vulkan.cpu());
6800     }
6801 
6802     ASSERT_TRUE(check);
6803   }
6804 }
6805 
TEST_F(VulkanAPITest,permute_negativedims_success)6806 TEST_F(VulkanAPITest, permute_negativedims_success) {
6807   // Arrange
6808   const auto in_cpu = at::rand({5, 4, 3, 2}, at::device(at::kCPU).dtype(at::kFloat));
6809 
6810   // Act: {-1,-2,-3,0} is equivalent to {3,2,1,0}
6811   const auto out_cpu = at::permute(in_cpu, {-1, -2, -3, 0});
6812   const auto out_vulkan = at::permute(in_cpu.vulkan(), {-1, -2, -3, 0});
6813 
6814   // Assert
6815   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6816   if (!check) {
6817     showRtol(out_cpu, out_vulkan.cpu());
6818   }
6819 
6820   ASSERT_TRUE(check);
6821 }
6822 
TEST_F(VulkanAPITest,permute_invalidinputs_exceptions)6823 TEST_F(VulkanAPITest, permute_invalidinputs_exceptions) {
6824   // Arrange
6825   const auto in_cpu = at::rand({1, 2, 1, 161}, at::device(at::kCPU).dtype(at::kFloat));
6826 
6827   // Act: Repeated dim
6828   EXPECT_THROW({
6829     const auto out_vulkan = at::permute(in_cpu.vulkan(), {2, 2, 1, 0});
6830   }, ::std::exception);
6831 
6832   EXPECT_THROW({
6833     const auto out_vulkan = in_cpu.vulkan();
6834     out_vulkan.permute({2, 2, 1, 0});
6835   }, ::std::exception);
6836 
6837   // Act: Number of dims don't match
6838   EXPECT_THROW({
6839     const auto out_vulkan = at::permute(in_cpu.vulkan(), {4, 3, 2, 1, 0});
6840   }, ::std::exception);
6841 
6842   EXPECT_THROW({
6843     const auto out_vulkan = at::permute(in_cpu.vulkan(), {2, 1, 0});
6844   }, ::std::exception);
6845 
6846   EXPECT_THROW({
6847     const auto out_vulkan = in_cpu.vulkan();
6848     out_vulkan.permute({4, 3, 2, 1, 0});
6849   }, ::std::exception);
6850 
6851   EXPECT_THROW({
6852     const auto out_vulkan = in_cpu.vulkan();
6853     out_vulkan.permute({2, 1, 0});
6854   }, ::std::exception);
6855 
6856   // Act: Dim out of range
6857   EXPECT_THROW({
6858     const auto out_vulkan = at::permute(in_cpu.vulkan(), {5, 2, 1, 0});
6859   }, ::std::exception);
6860 
6861   EXPECT_THROW({
6862     const auto out_vulkan = in_cpu.vulkan();
6863     out_vulkan.permute({5, 2, 1, 0});
6864   }, ::std::exception);
6865 
6866   // Act: Input tensor size > 4D
6867   const auto in_cpu_5d = at::rand({1, 2, 1, 2, 161}, at::device(at::kCPU).dtype(at::kFloat));
6868   EXPECT_THROW({
6869     const auto out_vulkan_5d = at::permute(in_cpu_5d.vulkan(), {4, 3, 2, 1, 0});
6870   }, ::std::exception);
6871 
6872   EXPECT_THROW({
6873     const auto out_vulkan_5d = in_cpu_5d.vulkan();
6874     out_vulkan_5d.permute({4, 3, 2, 1, 0});
6875   }, ::std::exception);
6876 }
6877 
TEST_F(VulkanAPITest,slice_width_success)6878 TEST_F(VulkanAPITest, slice_width_success) {
6879   // Arrange
6880   std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
6881     {3, {2, 3, 40, 50}},  // 4D tensors with dim=width
6882     {2, {3, 40, 50}},     // 3D tensors with dim=width
6883     {1, {40, 50}},        // 2D tensors with dim=width
6884     {0, {50}},            // 1D tensors with dim=width
6885   };
6886 
6887   // Act/Assert
6888   slice_tests(dim2sizes);
6889 }
6890 
TEST_F(VulkanAPITest,slice_height_success)6891 TEST_F(VulkanAPITest, slice_height_success) {
6892   // Arrange
6893   std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
6894     {2, {2, 3, 40, 50}},  // 4D tensors with dim=height
6895     {1, {3, 40, 50}},     // 3D tensors with dim=height
6896     {0, {40, 50}},        // 2D tensors with dim=height
6897                           // 1D tensors don't have a height dim for this test
6898   };
6899 
6900   // Act/Assert
6901   slice_tests(dim2sizes);
6902 }
6903 
TEST_F(VulkanAPITest,slice_feature_success)6904 TEST_F(VulkanAPITest, slice_feature_success) {
6905   // Arrange
6906   std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
6907     {1, {2, 40, 13, 14}}, // 4D tensors with dim=feature(channel)
6908     {0, {40, 13, 14}},    // 3D tensors with dim=feature(channel)
6909                           // 1D and 2D tensors don't have a feature (channel) dim for this test
6910   };
6911 
6912   // Act/Assert
6913   slice_tests(dim2sizes);
6914 }
6915 
TEST_F(VulkanAPITest,slice_batch_success)6916 TEST_F(VulkanAPITest, slice_batch_success) {
6917   // Arrange
6918   std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
6919     {0, {40, 3, 13, 14}}, // 4D tensors with dim=batch
6920                           // 1D, 2D and 3D tensors don't have a batch dim for this test
6921   };
6922 
6923   // Act/Assert
6924   slice_tests(dim2sizes);
6925 }
6926 
TEST_F(VulkanAPITest,slice_zero_sized)6927 TEST_F(VulkanAPITest, slice_zero_sized) {
6928   // When start == end
6929   slice_test({2, 3, 4, 5}, 3, 0, 0, 1);
6930   // When start > end
6931   slice_test({2, 3, 4, 5}, 3, 3, 2, 1);
6932 }
6933 
TEST_F(VulkanAPITest,slice_invalidinputs_exceptions)6934 TEST_F(VulkanAPITest, slice_invalidinputs_exceptions) {
6935   // Act: slice step must be positive
6936   EXPECT_THROW({
6937     slice_test({2, 3, 4, 5}, 3, 0, 3, 0);
6938   }, ::std::exception);
6939 }
6940 
TEST_F(VulkanAPITest,stack_invalid_inputs)6941 TEST_F(VulkanAPITest, stack_invalid_inputs) {
6942   // Act: Vulkan stack expects at least one tensor
6943   EXPECT_THROW({
6944     at::stack({}, 0);
6945   }, ::std::exception);
6946 
6947   // Act: Vulkan stack inputs must have matching sizes
6948   EXPECT_THROW({
6949     at::stack({
6950         at::rand({5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
6951         at::rand({5, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
6952         at::rand({6, 7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan()}, 0);
6953   }, ::std::exception);
6954 }
6955 
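// Helper: stacks numTensors random tensors of shape input_shape along dim on
// both the CPU and Vulkan backends and checks that the results match.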
test_stack(const at::IntArrayRef input_shape,int64_t dim,int numTensors)6956 void test_stack(const at::IntArrayRef input_shape, int64_t dim, int numTensors) {
6957   std::vector<at::Tensor> tensors_cpu = {};
6958   std::vector<at::Tensor> tensors_vulkan = {};
6959 
6960   for (int i = 0; i < numTensors; i++) {
6961     at::Tensor in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
6962     tensors_cpu.emplace_back(in_cpu);
6963     tensors_vulkan.emplace_back(in_cpu.vulkan());
6964   }
6965 
6966   at::Tensor out_cpu = at::stack(tensors_cpu, dim);
6967   at::Tensor out_vulkan = at::stack(tensors_vulkan, dim);
6968   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
6969   if (!check) {
6970     std::cout << "Error when stacking " << numTensors << " tensors" << std::endl;
6971     showRtol(out_cpu, out_vulkan.cpu());
6972   }
6973   ASSERT_TRUE(check);
6974 }
6975 
TEST_F(VulkanAPITest,stack_0d)6976 TEST_F(VulkanAPITest, stack_0d) {
6977   test_stack({}, 0, 1);
6978   test_stack({}, 0, 2);
6979   test_stack({}, 0, 3);
6980 }
6981 
TEST_F(VulkanAPITest,stack_1d)6982 TEST_F(VulkanAPITest, stack_1d) {
6983   test_stack({221}, 0, 2);
6984   test_stack({193}, 1, 3);
6985 
6986   test_stack({221}, -1, 2);
6987   test_stack({193}, -2, 3);
6988 }
6989 
TEST_F(VulkanAPITest,stack_2d)6990 TEST_F(VulkanAPITest, stack_2d) {
6991   test_stack({221, 193}, 0, 2);
6992   test_stack({221, 193}, 1, 3);
6993   test_stack({221, 193}, 2, 4);
6994 
6995   test_stack({221, 193}, -1, 2);
6996   test_stack({221, 193}, -2, 3);
6997   test_stack({221, 193}, -3, 4);
6998 }
6999 
TEST_F(VulkanAPITest,stack_3d)7000 TEST_F(VulkanAPITest, stack_3d) {
7001   test_stack({221, 193, 11}, 0, 2);
7002   test_stack({221, 193, 11}, 1, 3);
7003   test_stack({221, 193, 11}, 2, 4);
7004   test_stack({221, 193, 11}, 3, 5);
7005 
7006   test_stack({221, 193, 11}, -1, 2);
7007   test_stack({221, 193, 11}, -2, 3);
7008   test_stack({221, 193, 11}, -3, 4);
7009   test_stack({221, 193, 11}, -4, 5);
7010 }
7011 
TEST_F(VulkanAPITest,tile_invalid_inputs_exceptions)7012 TEST_F(VulkanAPITest, tile_invalid_inputs_exceptions) {
7013   // Arrange: Vulkan tile only supports input of dims <= 4
7014   {
7015     const auto in_cpu =
7016         at::rand({3, 9, 5, 7, 3}, at::device(at::kCPU).dtype(at::kFloat));
7017     const at::IntArrayRef repeats = {7, 3, 9, 2};
7018 
7019     // Act
7020     EXPECT_THROW(
7021         { const auto out_vulkan = at::tile(in_cpu.vulkan(), repeats); },
7022         ::std::exception);
7023   }
7024 }
7025 
TEST_F(VulkanAPITest,tile_invalid_outputs_exceptions)7026 TEST_F(VulkanAPITest, tile_invalid_outputs_exceptions) {
7027   // Arrange: Vulkan tile only supports output of dims <= 4
7028   {
7029     const auto in_cpu =
7030         at::rand({3, 9, 5, 13}, at::device(at::kCPU).dtype(at::kFloat));
7031     const at::IntArrayRef repeats = {5, 7, 3, 9, 2};
7032 
7033     // Act
7034     EXPECT_THROW(
7035         { const auto out_vulkan = at::tile(in_cpu.vulkan(), repeats); },
7036         ::std::exception);
7037   }
7038 }
7039 
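// Helper: exercises at::tile for every prefix of input_shape crossed with
// every prefix of repeats, comparing the CPU and Vulkan results in each case.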
test_tile(const at::IntArrayRef input_shape,const at::IntArrayRef repeats)7040 void test_tile(
7041     const at::IntArrayRef input_shape,
7042     const at::IntArrayRef repeats) {
7043   c10::InferenceMode mode;
7044 
7045   at::Tensor in_cpu;
7046   at::Tensor out_cpu;
7047   at::Tensor in_vulkan;
7048   at::Tensor out_vulkan;
7049   at::IntArrayRef repeat;
7050   bool check = true;
7051   for (int idx_input = 1; (unsigned)idx_input < input_shape.size() + 1; ++idx_input) {
7052     for (int idx_repeat = 1; (unsigned)idx_repeat < repeats.size() + 1; ++idx_repeat) {
7053       in_cpu = at::rand(
7054           input_shape.slice(0, idx_input),
7055           at::device(at::kCPU).dtype(at::kFloat));
7056       repeat = repeats.slice(0, idx_repeat);
7057       out_cpu = at::tile(in_cpu, repeat);
7058       in_vulkan = in_cpu.vulkan();
7059       out_vulkan = at::tile(in_vulkan, repeat);
7060       check = almostEqual(out_cpu, out_vulkan.cpu());
7061       if (!check) {
7062         check = false;
7063         std::cout << "Tile test failed when input is of shape "
7064                   << input_shape.slice(0, idx_input) << " and repeat of "
7065                   << repeat << std::endl;
7066         showRtol(out_cpu, out_vulkan.cpu());
7067       }
7068     }
7069   }
7070 
7071   ASSERT_TRUE(check);
7072 }
7073 
TEST_F(VulkanAPITest,tile)7074 TEST_F(VulkanAPITest, tile) {
7075   test_tile({13, 5, 13, 7}, {7, 2, 3, 5});
7076 }
7077 
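// Helper: checks the in-place zero_() op by zeroing a random CPU tensor and
// its Vulkan copy and comparing the results.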
test_zero_(const at::IntArrayRef input_shape)7078 void test_zero_(const at::IntArrayRef input_shape) {
7079   auto cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
7080   auto vulkan = cpu.vulkan();
7081 
7082   cpu.zero_();
7083   vulkan.zero_();
7084 
7085   const auto check = almostEqual(cpu, vulkan.cpu());
7086   if (!check) {
7087     showRtol(cpu, vulkan.cpu());
7088     std::cout << "zero_ test failed with input shape: "
7089               << input_shape << std::endl;
7090   }
7091   ASSERT_TRUE(check);
7092 }
7093 
TEST_F(VulkanAPITest,zero_)7094 TEST_F(VulkanAPITest, zero_) {
7095   test_zero_({5});
7096   test_zero_({5, 7});
7097   test_zero_({9, 7, 5});
7098   test_zero_({22, 11, 19, 17});
7099 }
7100 
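// Helper: checks at::zeros by allocating zero-filled tensors directly on the
// CPU and Vulkan devices and comparing them.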
test_zeros(const at::IntArrayRef input_shape)7101 void test_zeros(const at::IntArrayRef input_shape) {
7102   auto cpu = at::zeros(input_shape);
7103   auto vulkan = at::zeros(input_shape, at::device(at::kVulkan));
7104 
7105   const auto check = almostEqual(cpu, vulkan.cpu());
7106   if (!check) {
7107     showRtol(cpu, vulkan.cpu());
7108     std::cout << "zeros test failed with input shape: "
7109               << input_shape << std::endl;
7110   }
7111   ASSERT_TRUE(check);
7112 }
7113 
TEST_F(VulkanAPITest,zeros)7114 TEST_F(VulkanAPITest, zeros) {
7115   test_zeros({5});
7116   test_zeros({5, 7});
7117   test_zeros({9, 7, 5});
7118   test_zeros({22, 11, 19, 17});
7119 }
7120 
TEST_F(VulkanAPITest,clone_success)7121 TEST_F(VulkanAPITest, clone_success) {
7122   // Arrange
7123   std::multimap<std::optional<c10::MemoryFormat>, std::vector<int64_t>> mem2sizes {
7124     {c10::MemoryFormat::Preserve, {2, 3, 5, 161}},    // 4D tensors with MemoryFormat::Preserve
7125     {c10::MemoryFormat::Contiguous, {2, 3, 5, 161}},  // 4D tensors with MemoryFormat::Contiguous
7126     {{}, {2, 3, 5, 161}},                             // 4D tensors with null
7127     {c10::MemoryFormat::Preserve, {3, 5, 161}},       // 3D tensors with MemoryFormat::Preserve
7128     {c10::MemoryFormat::Contiguous, {3, 5, 161}},     // 3D tensors with MemoryFormat::Contiguous
7129     {{}, {3, 5, 161}},                                // 3D tensors with null
7130     {c10::MemoryFormat::Preserve, {5, 161}},          // 2D tensors with MemoryFormat::Preserve
7131     {c10::MemoryFormat::Contiguous, {5, 161}},        // 2D tensors with MemoryFormat::Contiguous
7132     {{}, {5, 161}},                                   // 2D tensors with null
7133     {c10::MemoryFormat::Preserve, {161}},             // 1D tensors with MemoryFormat::Preserve
7134     {c10::MemoryFormat::Contiguous, {161}},           // 1D tensors with MemoryFormat::Contiguous
7135     {{}, {161}},                                      // 1D tensors with null
7136   };
7137 
7138   // Act/Assert
7139   for (const auto& mem2size : mem2sizes) {
7140     clone_test(mem2size.second, mem2size.first);
7141   }
7142 }
7143 
TEST_F(VulkanAPITest,clone_invalidinputs_exceptions)7144 TEST_F(VulkanAPITest, clone_invalidinputs_exceptions) {
7145   // Act: Vulkan only supports the Preserve and Contiguous memory formats
7146   EXPECT_THROW({
7147     clone_test({2, 3, 5, 161}, c10::MemoryFormat::ChannelsLast);
7148   }, ::std::exception);
7149 
7150   // Act: Vulkan only supports the Preserve and Contiguous memory formats
7151   EXPECT_THROW({
7152     clone_test({2, 3, 5, 161}, c10::MemoryFormat::ChannelsLast3d);
7153   }, ::std::exception);
7154 }
7155 
7156 enum class OpType {
7157   addmm,
7158   conv2d,
7159   hardtanh_,
7160   mean,
7161  };
7162 
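// The classes below form a small op pipeline used by the mobilenetv2 test:
// each BaseOp subclass wraps a single ATen call, and OpsList runs a sequence
// of ops on a CPU tensor and its Vulkan copy in lockstep so the final outputs
// can be compared.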
7163 class BaseOp {
7164  public:
BaseOp(const OpType)7165   explicit BaseOp(const OpType) {}
7166   virtual ~BaseOp() = default;
7167 
7168   virtual at::Tensor run(at::Tensor&) const = 0;
7169   virtual std::string toString() const = 0;
7170 
7171 };
7172 
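// Wraps at::addmm with a randomly initialized second matrix (m2_) and bias
// (b_); the input tensor supplies the first matrix.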
7173 class Addmm final : public BaseOp {
7174  public:
Addmm(const int64_t m1H,const int64_t m1W,const int64_t m2W,const float beta,const float alpha)7175   Addmm(
7176       const int64_t m1H,
7177       const int64_t m1W,
7178       const int64_t m2W,
7179       const float beta,
7180       const float alpha)
7181     : BaseOp(OpType::addmm),
7182       m2_(at::rand(c10::IntArrayRef({m1W, m2W}), at::device(at::kCPU).dtype(at::kFloat))),
7183       b_(at::rand(c10::IntArrayRef({m1H, m2W}), at::device(at::kCPU).dtype(at::kFloat))),
7184       beta_(beta),
7185       alpha_(alpha) {
7186   }
7187 
run(at::Tensor & t) const7188   at::Tensor run(at::Tensor& t) const override {
7189     // The same at::addmm call serves both CPU and Vulkan inputs; the
7190     // dispatcher selects the backend from the input tensor t.
7191     return at::addmm(b_, t, m2_, beta_, alpha_);
7194   }
7195 
toString() const7196   std::string toString() const override {
7197     return "addmm";
7198   }
7199 
7200  private:
7201   at::Tensor m2_;
7202   at::Tensor b_;
7203   float beta_;
7204   float alpha_;
7205 };
7206 
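// Wraps at::conv2d with random weights. wsizes follows the ATen weight layout
// {out_channels, in_channels / groups, kernel_h, kernel_w}; the bias has
// wsizes[0] (out_channels) elements.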
7207 class Conv2d final : public BaseOp {
7208  public:
Conv2d(const c10::IntArrayRef wsizes,const int64_t groups,const int64_t stride,const int64_t padding)7209   Conv2d(
7210       const c10::IntArrayRef wsizes,
7211       const int64_t groups,
7212       const int64_t stride,
7213       const int64_t padding)
7214       : BaseOp(OpType::conv2d),
7215         groups_(groups),
7216         stride_(stride),
7217         padding_(padding),
7218         w_(at::rand(wsizes, at::device(at::kCPU).dtype(at::kFloat))),
7219         b_(at::rand(wsizes[0], at::device(at::kCPU).dtype(at::kFloat))){
7220   }
7221 
run(at::Tensor & t) const7222   at::Tensor run(at::Tensor& t) const override {
7223     return at::conv2d(t, w_, b_, {stride_}, {padding_}, {1}, groups_);
7224   }
7225 
toString() const7226   std::string toString() const override {
7227     return "conv2d";
7228   }
7229 
7230  private:
7231   int64_t groups_;
7232   int64_t stride_;
7233   int64_t padding_;
7234   at::Tensor w_;
7235   at::Tensor b_;
7236 };
7237 
7238 class Hardtanh_ final : public BaseOp {
7239  public:
Hardtanh_()7240   Hardtanh_() : BaseOp(OpType::hardtanh_) {}
7241 
run(at::Tensor & input) const7242   at::Tensor run(at::Tensor& input) const override {
7243     return at::hardtanh_(input, 0, 6);
7244   }
7245 
toString() const7246   std::string toString() const override {
7247     return "hardtanh_";
7248   }
7249 };
7250 
7251 class Mean final : public BaseOp {
7252  public:
Mean()7253   Mean() : BaseOp(OpType::mean) {}
7254 
run(at::Tensor & input) const7255   at::Tensor run(at::Tensor& input) const override {
7256     return at::mean(input, {2, 3}, false);
7257   }
7258 
toString() const7259   std::string toString() const override {
7260     return "mean";
7261   }
7262 };
7263 
7264 class OpsList {
7265  public:
OpsList()7266   OpsList() {}
OpsList(std::vector<std::unique_ptr<BaseOp>> ops)7267   explicit OpsList(std::vector<std::unique_ptr<BaseOp>> ops)
7268     : ops_(std::move(ops)) {
7269   }
7270 
run(const at::Tensor & input)7271   auto run(const at::Tensor& input) {
7272     at::Tensor output = input;
7273 
7274     for (const auto& op : ops_) {
7275       output = op->run(output);
7276     }
7277 
7278     return output;
7279   }
7280 
run(const at::Tensor & input,const at::Tensor & v_input)7281   auto run(const at::Tensor& input, const at::Tensor& v_input) {
7282     at::Tensor output = input;
7283     at::Tensor v_output = v_input;
7284 
7285     for (const auto& op : ops_) {
7286       output = op->run(output);
7287       v_output = op->run(v_output);
7288     }
7289 
7290     return std::make_pair(output, v_output);
7291   }
7292 
7293  protected:
7294   std::vector<std::unique_ptr<BaseOp>> ops_;
7295 };
7296 
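// Approximates the MobileNetV2 backbone as a flat sequence of pointwise and
// depthwise convolutions with hardtanh activations, followed by a global mean
// over H/W and a final addmm (fully connected) layer. All weights are random;
// the test only checks that the CPU and Vulkan backends agree numerically.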
7297 class MobileNetV2 final : public OpsList {
7298  public:
MobileNetV2()7299   MobileNetV2() {
7300     ops_.emplace_back(new Conv2d({32, 3, 3, 3}, 1, 2, 1));
7301     ops_.emplace_back(new Hardtanh_());
7302     ops_.emplace_back(new Conv2d({32, 1, 3, 3}, 32, 1, 1));
7303     ops_.emplace_back(new Hardtanh_());
7304     ops_.emplace_back(new Conv2d({16, 32, 1, 1}, 1, 1, 0));
7305     ops_.emplace_back(new Conv2d({96, 16, 1, 1}, 1, 1, 0));
7306     ops_.emplace_back(new Hardtanh_());
7307     ops_.emplace_back(new Conv2d({96, 1, 3, 3}, 96, 2, 1));
7308     ops_.emplace_back(new Hardtanh_());
7309     ops_.emplace_back(new Conv2d({24, 96, 1, 1}, 1, 1, 0));
7310     ops_.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0));
7311     ops_.emplace_back(new Hardtanh_());
7312     ops_.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 1, 1));
7313     ops_.emplace_back(new Hardtanh_());
7314     ops_.emplace_back(new Conv2d({24, 144, 1, 1}, 1, 1, 0));
7315     ops_.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0));
7316     ops_.emplace_back(new Hardtanh_());
7317     ops_.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 2, 1));
7318     ops_.emplace_back(new Hardtanh_());
7319     ops_.emplace_back(new Conv2d({32, 144, 1, 1}, 1, 1, 0));
7320     ops_.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0));
7321     ops_.emplace_back(new Hardtanh_());
7322     ops_.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1));
7323     ops_.emplace_back(new Hardtanh_());
7324     ops_.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0));
7325     ops_.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0));
7326     ops_.emplace_back(new Hardtanh_());
7327     ops_.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1));
7328     ops_.emplace_back(new Hardtanh_());
7329     ops_.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0));
7330     ops_.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0));
7331     ops_.emplace_back(new Hardtanh_());
7332     ops_.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 2, 1));
7333     ops_.emplace_back(new Hardtanh_());
7334     ops_.emplace_back(new Conv2d({64, 192, 1, 1}, 1, 1, 0));
7335     ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0));
7336     ops_.emplace_back(new Hardtanh_());
7337     ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1));
7338     ops_.emplace_back(new Hardtanh_());
7339     ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0));
7340     ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0));
7341     ops_.emplace_back(new Hardtanh_());
7342     ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1));
7343     ops_.emplace_back(new Hardtanh_());
7344     ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0));
7345     ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0));
7346     ops_.emplace_back(new Hardtanh_());
7347     ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1));
7348     ops_.emplace_back(new Hardtanh_());
7349     ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0));
7350     ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0));
7351     ops_.emplace_back(new Hardtanh_());
7352     ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1));
7353     ops_.emplace_back(new Hardtanh_());
7354     ops_.emplace_back(new Conv2d({96, 384, 1, 1}, 1, 1, 0));
7355     ops_.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0));
7356     ops_.emplace_back(new Hardtanh_());
7357     ops_.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1));
7358     ops_.emplace_back(new Hardtanh_());
7359     ops_.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0));
7360     ops_.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0));
7361     ops_.emplace_back(new Hardtanh_());
7362     ops_.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1));
7363     ops_.emplace_back(new Hardtanh_());
7364     ops_.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0));
7365     ops_.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0));
7366     ops_.emplace_back(new Hardtanh_());
7367     ops_.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 2, 1));
7368     ops_.emplace_back(new Hardtanh_());
7369     ops_.emplace_back(new Conv2d({160, 576, 1, 1}, 1, 1, 0));
7370     ops_.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0));
7371     ops_.emplace_back(new Hardtanh_());
7372     ops_.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1));
7373     ops_.emplace_back(new Hardtanh_());
7374     ops_.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0));
7375     ops_.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0));
7376     ops_.emplace_back(new Hardtanh_());
7377     ops_.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1));
7378     ops_.emplace_back(new Hardtanh_());
7379     ops_.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0));
7380     ops_.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0));
7381     ops_.emplace_back(new Hardtanh_());
7382     ops_.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1));
7383     ops_.emplace_back(new Hardtanh_());
7384     ops_.emplace_back(new Conv2d({320, 960, 1, 1}, 1, 1, 0));
7385     ops_.emplace_back(new Conv2d({1280, 320, 1, 1}, 1, 1, 0));
7386     ops_.emplace_back(new Hardtanh_());
7387     ops_.emplace_back(new Mean());
7388     ops_.emplace_back(new Addmm(1, 1280, 1000, 0, 1));
7389   }
7390 };
7391 
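// Runs the MobileNetV2 op sequence above on a CPU tensor and on its Vulkan copy,
// then checks that the two outputs agree within kTolerance.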
7392 TEST_F(VulkanAPITest, mobilenetv2) {
7393   c10::InferenceMode mode;
7394 
7395   MobileNetV2 mn2;
7396 
7397   const auto input = at::rand({1, 3, 224, 224}, at::device(at::kCPU).dtype(at::kFloat));
7398   const auto output = mn2.run(input, input.vulkan());
7399 
7400   const auto check = almostEqual(output.first, output.second.cpu());
7401   if (!check) {
7402     showRtol(output.first, output.second.cpu());
7403   }
7404 
7405   ASSERT_TRUE(check);
7406 }
7407 
7408 TEST_F(VulkanAPITest, gru_success) {
7409   // Arrange
7410   const int H_in = 5;  // input_size
7411   const int H_out = 7; // hidden_size
7412   const int num_layers = 3;
7413   const int L = 1;
7414   const int N = 1;
7415   const double gru_dropout = .0;
7416   const bool has_biases = true;
7417   const bool train = false;
7418   const bool bidirectional = false;
7419   const bool batch_first = true;
7420   const auto in_cpu = at::rand({N, L, H_in}, at::device(at::kCPU).dtype(at::kFloat));
7421   const auto h0_cpu = at::rand({num_layers, N, H_out}, at::device(at::kCPU).dtype(at::kFloat));
7422 
7423   c10::List<at::Tensor> weight_ih_l; // shape (3 * hidden_size, input_size)
7424   c10::List<at::Tensor> weight_hh_l; // shape (3 * hidden_size, hidden_size)
7425   c10::List<at::Tensor> bias_ih_l;   // shape (3 * hidden_size)
7426   c10::List<at::Tensor> bias_hh_l;   // shape (3 * hidden_size)
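  // Layer 0's input-to-hidden weight maps from H_in; deeper layers consume the
  // previous layer's hidden state, so their weight_ih has shape (3 * H_out, H_out).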
7427   for (int i = 0; i < num_layers; ++i) {
7428     if (i == 0) {
7429       weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat)));
7430     } else {
7431       weight_ih_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7432     }
7433     weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7434     bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7435     bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7436   }
7437 
7438   // put this guard here to run inference instead of training
7439   // to avoid the following error:
7440   //     C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
7441   //     If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
7442   c10::InferenceMode mode;
7443 
7444   // Act
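  // at::gru expects the params as one flat list, ordered per layer as
  // { weight_ih, weight_hh, bias_ih, bias_hh }.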
7445   const auto out_cpu = at::gru(in_cpu, h0_cpu,
7446       { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0],
7447         weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1],
7448         weight_ih_l[2], weight_hh_l[2], bias_ih_l[2], bias_hh_l[2] },
7449       has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7450 
7451   // weights/biases should always be on CPU.
7452   const auto out_vulkan = at::gru(in_cpu.vulkan(), h0_cpu.vulkan(),
7453       { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7454         weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1),
7455         weight_ih_l.get(2), weight_hh_l.get(2), bias_ih_l.get(2), bias_hh_l.get(2) },
7456       has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7457 
7458   auto cpu_output = std::get<0>(out_cpu);
7459   auto cpu_hidden = std::get<1>(out_cpu);
7460   auto vulkan_output = std::get<0>(out_vulkan);
7461   auto vulkan_hidden = std::get<1>(out_vulkan);
7462 
7463   // Assert
7464   const auto check_output = almostEqual(cpu_output, vulkan_output.cpu());
7465   if (!check_output) {
7466     showRtol(cpu_output, vulkan_output.cpu());
7467   }
7468   ASSERT_TRUE(check_output);
7469 
7470   const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu());
7471   if (!check_hidden) {
7472     showRtol(cpu_hidden, vulkan_hidden.cpu());
7473   }
7474   ASSERT_TRUE(check_hidden);
7475 }
7476 
7477 TEST_F(VulkanAPITest, gru_mclareninputs_success) {
7478   // Arrange
7479   const int H_in = 384;  // input_size
7480   const int H_out = 384; // hidden_size
7481   const int num_layers = 2;
7482   const int L = 1;
7483   const int N = 1;
7484   const double gru_dropout = .0;
7485   const bool has_biases = true;
7486   const bool train = false;
7487   const bool bidirectional = false;
7488   const bool batch_first = true;
7489   const auto in_cpu = at::rand({N, L, H_in}, at::device(at::kCPU).dtype(at::kFloat));
7490   const auto h0_cpu = at::rand({num_layers, N, H_out}, at::device(at::kCPU).dtype(at::kFloat));
7491 
7492   c10::List<at::Tensor> weight_ih_l; // shape (3 * hidden_size, input_size)
7493   c10::List<at::Tensor> weight_hh_l; // shape (3 * hidden_size, hidden_size)
7494   c10::List<at::Tensor> bias_ih_l;   // shape (3 * hidden_size)
7495   c10::List<at::Tensor> bias_hh_l;   // shape (3 * hidden_size)
7496   for (int i = 0; i < num_layers; ++i) {
7497     if (i == 0) {
7498       weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat)));
7499     } else {
7500       weight_ih_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7501     }
7502     weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7503     bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7504     bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7505   }
7506 
7507   // put this guard here to run inference instead of training
7508   // to avoid the following error:
7509   //     C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
7510   //     If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
7511   c10::InferenceMode mode;
7512 
7513   // Act
7514   const auto out_cpu = at::gru(in_cpu, h0_cpu,
7515       { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0], weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1] },
7516       has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7517 
7518   // weights/biases should always be on CPU.
7519   const auto out_vulkan = at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7520       weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
7521       has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7522 
7523   auto cpu_output = std::get<0>(out_cpu);
7524   auto cpu_hidden = std::get<1>(out_cpu);
7525   auto vulkan_output = std::get<0>(out_vulkan);
7526   auto vulkan_hidden = std::get<1>(out_vulkan);
7527 
7528   // Assert
7529   const auto check_output = almostEqual(cpu_output, vulkan_output.cpu());
7530   if (!check_output) {
7531     showRtol(cpu_output, vulkan_output.cpu());
7532   }
7533   ASSERT_TRUE(check_output);
7534 
7535   const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu());
7536   if (!check_hidden) {
7537     showRtol(cpu_hidden, vulkan_hidden.cpu());
7538   }
7539   ASSERT_TRUE(check_hidden);
7540 }
7541 
7542 TEST_F(VulkanAPITest, gru_invalidinputs_exceptions) {
7543   // Arrange
7544   const int H_in = 17;  // input_size
7545   const int H_out = 50; // hidden_size
7546   const int num_layers = 2;
7547   const int L = 5;
7548   const int N = 4;
7549   const double gru_dropout = .0;
7550   const bool has_biases = true;
7551   const bool train = false;
7552   const bool bidirectional = false;
7553   const bool batch_first = true;
7554   const auto in_cpu = at::rand({N, L, H_in}, at::device(at::kCPU).dtype(at::kFloat));
7555   const auto h0_cpu = at::rand({num_layers, N, H_out}, at::device(at::kCPU).dtype(at::kFloat));
7556 
7557   c10::List<at::Tensor> weight_ih_l; // shape (3 * hidden_size, input_size)
7558   c10::List<at::Tensor> weight_hh_l; // shape (3 * hidden_size, hidden_size)
7559   c10::List<at::Tensor> bias_ih_l;   // shape (3 * hidden_size)
7560   c10::List<at::Tensor> bias_hh_l;   // shape (3 * hidden_size)
7561   for (int i = 0; i < num_layers; ++i) {
7562     if (i == 0) {
7563       weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat)));
7564     } else {
7565       weight_ih_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7566     }
7567     weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7568     bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7569     bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7570   }
7571 
7572   // put this guard here to run inference instead of training
7573   // to avoid the following error:
7574   //     C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
7575   //     If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
7576   c10::InferenceMode mode;
7577 
7578   // Act: incorrect # of weights/biases
7579   EXPECT_THROW({
7580     at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7581       weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1) },
7582       has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7583   }, ::std::exception);
7584 
7585   // Act: non-3D input tensor
7586   EXPECT_THROW({
7587     const auto in_cpu_2d = at::rand({1, H_in}, at::device(at::kCPU).dtype(at::kFloat));
7588     at::gru(in_cpu_2d.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7589       weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
7590       has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7591   }, ::std::exception);
7592 
7593   // Act: non-3D hidden tensor
7594   EXPECT_THROW({
7595     const auto h0_cpu_2d = at::rand({num_layers, H_out}, at::device(at::kCPU).dtype(at::kFloat));
7596     at::gru(in_cpu.vulkan(), h0_cpu_2d.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7597       weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
7598       has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7599   }, ::std::exception);
7600 
7601   // Act: has_biases should be true
7602   EXPECT_THROW({
7603     at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7604       weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
7605       false, num_layers, gru_dropout, train, bidirectional, batch_first);
7606   }, ::std::exception);
7607 
7608   // Act: train should be false
7609   EXPECT_THROW({
7610     at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7611       weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
7612       has_biases, num_layers, gru_dropout, true, bidirectional, batch_first);
7613   }, ::std::exception);
7614 
7615   // Act: bidirectional should be false
7616   EXPECT_THROW({
7617     at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7618       weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
7619       has_biases, num_layers, gru_dropout, train, true, batch_first);
7620   }, ::std::exception);
7621 
7622   // Act: batch_first should be true
7623   EXPECT_THROW({
7624     at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7625       weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
7626       has_biases, num_layers, gru_dropout, train, bidirectional, false);
7627   }, ::std::exception);
7628 
7629   // Act: dropout should be 0.0
7630   EXPECT_THROW({
7631     at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7632       weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
7633       has_biases, num_layers, 1.0, train, bidirectional, batch_first);
7634   }, ::std::exception);
7635 }
7636 
7637 TEST_F(VulkanAPITest, gru_prepack_success) {
7638   // Arrange
7639   const int H_in = 81;  // input_size
7640   const int H_out = 10; // hidden_size
7641   const int num_layers = 2;
7642   const int L = 1;
7643   const int N = 1;
7644   const double gru_dropout = .0;
7645   const bool has_biases = true;
7646   const bool train = false;
7647   const bool bidirectional = false;
7648   const bool batch_first = true;
7649   const auto in_cpu = at::rand({N, L, H_in}, at::device(at::kCPU).dtype(at::kFloat));
7650   const auto h0_cpu = at::rand({num_layers, N, H_out}, at::device(at::kCPU).dtype(at::kFloat));
7651 
7652   c10::List<at::Tensor> weight_ih_l; // shape (3 * hidden_size, input_size)
7653   c10::List<at::Tensor> weight_hh_l; // shape (3 * hidden_size, hidden_size)
7654   c10::List<at::Tensor> bias_ih_l;   // shape (3 * hidden_size)
7655   c10::List<at::Tensor> bias_hh_l;   // shape (3 * hidden_size)
7656   for (int i = 0; i < num_layers; ++i) {
7657     if (i == 0) {
7658       weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat)));
7659     } else {
7660       weight_ih_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7661     }
7662     weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7663     bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7664     bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7665   }
7666 
7667   // put this guard here to run inference instead of training
7668   // to avoid the following error:
7669   //     C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
7670   //     If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
7671   c10::InferenceMode mode;
7672 
7673   // Act
7674   const auto out_cpu = at::gru(in_cpu, h0_cpu,
7675       { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0], weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1] },
7676       has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7677 
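  // Pack the weights/biases once with create_gru_context, then execute the GRU
  // against the packed context with run_gru_context.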
7678   auto prepack = callOpByName(
7679       "vulkan_prepack::create_gru_context",
7680       "",
7681       std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7682         weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }),
7683       has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7684   auto out_vulkan = callOpByName(
7685       "vulkan_prepack::run_gru_context",
7686       "",
7687       in_cpu.vulkan(), h0_cpu.vulkan(), prepack[0]);
7688 
7689   auto cpu_output = std::get<0>(out_cpu);
7690   auto cpu_hidden = std::get<1>(out_cpu);
7691   auto vulkan_output = out_vulkan[0].toTensor();
7692   auto vulkan_hidden = out_vulkan[1].toTensor();
7693 
7694   // Assert
7695   const auto check_output = almostEqual(cpu_output, vulkan_output.cpu());
7696   if (!check_output) {
7697     showRtol(cpu_output, vulkan_output.cpu());
7698   }
7699   ASSERT_TRUE(check_output);
7700 
7701   const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu());
7702   if (!check_hidden) {
7703     showRtol(cpu_hidden, vulkan_hidden.cpu());
7704   }
7705   ASSERT_TRUE(check_hidden);
7706 }
7707 
7708 TEST_F(VulkanAPITest, gru_prepack_invalidinputs_exceptions) {
7709   // Arrange
7710   const int H_in = 70;  // input_size
7711   const int H_out = 2; // hidden_size
7712   const int num_layers = 2;
7713   const int L = 3;
7714   const int N = 5;
7715   const double gru_dropout = .0;
7716   const bool has_biases = true;
7717   const bool train = false;
7718   const bool bidirectional = false;
7719   const bool batch_first = true;
7720   const auto in_cpu = at::rand({N, L, H_in}, at::device(at::kCPU).dtype(at::kFloat));
7721   const auto h0_cpu = at::rand({num_layers, N, H_out}, at::device(at::kCPU).dtype(at::kFloat));
7722 
7723   c10::List<at::Tensor> weight_ih_l; // shape (3 * hidden_size, input_size)
7724   c10::List<at::Tensor> weight_hh_l; // shape (3 * hidden_size, hidden_size)
7725   c10::List<at::Tensor> bias_ih_l;   // shape (3 * hidden_size)
7726   c10::List<at::Tensor> bias_hh_l;   // shape (3 * hidden_size)
7727   for (int i = 0; i < num_layers; ++i) {
7728     if (i == 0) {
7729       weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat)));
7730     } else {
7731       weight_ih_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7732     }
7733     weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7734     bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7735     bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
7736   }
7737 
7738   // put this guard here to run inference instead of training
7739   // to avoid the following error:
7740   //     C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
7741   //     If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
7742   c10::InferenceMode mode;
7743 
7744   // Act: incorrect # of weights/biases
7745   EXPECT_THROW({
7746     auto prepack = callOpByName(
7747         "vulkan_prepack::create_gru_context",
7748         "",
7749         std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7750             weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1) }),
7751         has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7752   }, ::std::exception);
7753 
7754   // Act: non-3D input tensor
7755   EXPECT_THROW({
7756     const auto in_cpu_2d = at::rand({1, H_in}, at::device(at::kCPU).dtype(at::kFloat));
7757     auto prepack = callOpByName(
7758         "vulkan_prepack::create_gru_context",
7759         "",
7760         std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7761             weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }),
7762         has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7763     auto out_vulkan = callOpByName(
7764         "vulkan_prepack::run_gru_context",
7765         "",
7766         in_cpu_2d.vulkan(), h0_cpu.vulkan(), prepack[0]);
7767   }, ::std::exception);
7768 
7769   // Act: non-3D hidden tensor
7770   EXPECT_THROW({
7771     const auto h0_cpu_2d = at::rand({num_layers, H_out}, at::device(at::kCPU).dtype(at::kFloat));
7772     auto prepack = callOpByName(
7773         "vulkan_prepack::create_gru_context",
7774         "",
7775         std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7776             weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }),
7777         has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
7778     auto out_vulkan = callOpByName(
7779         "vulkan_prepack::run_gru_context",
7780         "",
7781         in_cpu.vulkan(), h0_cpu_2d.vulkan(), prepack[0]);
7782   }, ::std::exception);
7783 
7784   // Act: has_biases should be true
7785   EXPECT_THROW({
7786     auto prepack = callOpByName(
7787         "vulkan_prepack::create_gru_context",
7788         "",
7789         std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7790            weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }),
7791         false, num_layers, gru_dropout, train, bidirectional, batch_first);
7792   }, ::std::exception);
7793 
7794   // Act: train should be false
7795   EXPECT_THROW({
7796     auto prepack = callOpByName(
7797         "vulkan_prepack::create_gru_context",
7798         "",
7799         std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7800            weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }),
7801         has_biases, num_layers, gru_dropout, true, bidirectional, batch_first);
7802   }, ::std::exception);
7803 
7804   // Act: bidirectional should be false
7805   EXPECT_THROW({
7806     auto prepack = callOpByName(
7807         "vulkan_prepack::create_gru_context",
7808         "",
7809         std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7810            weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }),
7811         has_biases, num_layers, gru_dropout, train, true, batch_first);
7812   }, ::std::exception);
7813 
7814   // Act: batch_first should be true
7815   EXPECT_THROW({
7816     auto prepack = callOpByName(
7817         "vulkan_prepack::create_gru_context",
7818         "",
7819         std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7820            weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }),
7821         has_biases, num_layers, gru_dropout, train, bidirectional, false);
7822     auto out_vulkan = callOpByName(
7823         "vulkan_prepack::run_gru_context",
7824         "",
7825         in_cpu.vulkan(), h0_cpu.vulkan(), prepack[0]);
7826   }, ::std::exception);
7827 
7828   // Act: dropout should be 0.0
7829   EXPECT_THROW({
7830     auto prepack = callOpByName(
7831         "vulkan_prepack::create_gru_context",
7832         "",
7833         std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7834            weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }),
7835         has_biases, num_layers, 1.0, train, bidirectional, batch_first);
7836   }, ::std::exception);
7837 }
7838 
7839 void test_linear(
7840     const at::IntArrayRef input_shape,
7841     const at::IntArrayRef weight_shape,
7842     const at::IntArrayRef bias_shape) {
7843   c10::InferenceMode mode;
7844 
7845   const auto input_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
7846   const auto weight = at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
7847   const auto bias = at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
7848 
7849   const auto out_cpu = at::linear(input_cpu, weight, bias);
7850 
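  // The linear prepack op is fed the transposed weight, i.e. (in_features, out_features).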
7851   auto prepack = callOpByName(
7852       "vulkan_prepack::create_linear_context",
7853       "",
7854       weight.t(), bias);
7855 
7856   auto vulkan_output = callOpByName(
7857       "vulkan_prepack::run_linear_context",
7858       "",
7859       input_cpu.vulkan(), prepack[0]);
7860 
7861   auto out_vulkan = vulkan_output[0].toTensor();
7862 
7863   const auto check = almostEqual(out_cpu, out_vulkan.cpu());
7864   if (!check) {
7865     showRtol(out_cpu, out_vulkan.cpu());
7866   }
7867 
7868   ASSERT_TRUE(check);
7869 }
7870 
7871 TEST_F(VulkanAPITest, linear_1d_small) {
7872   test_linear({3}, {4, 3}, {4});
7873 }
7874 
7875 TEST_F(VulkanAPITest, linear_1d_large) {
7876   test_linear({37}, {23, 37}, {23});
7877 }
7878 
7879 TEST_F(VulkanAPITest, linear_2d_flat) {
7880   test_linear({1, 37}, {41, 37}, {41});
7881 }
7882 
7883 TEST_F(VulkanAPITest, linear_2d_small) {
7884   test_linear({2, 3}, {4, 3}, {4});
7885 }
7886 
7887 TEST_F(VulkanAPITest, linear_2d_large) {
7888   test_linear({49, 37}, {23, 37}, {23});
7889 }
7890 
7891 TEST_F(VulkanAPITest, linear_3d_flat) {
7892   test_linear({1, 1, 37}, {41, 37}, {41});
7893 }
7894 
7895 TEST_F(VulkanAPITest, linear_3d_small) {
7896   test_linear({2, 3, 4}, {5, 4}, {5});
7897 }
7898 
7899 TEST_F(VulkanAPITest, linear_3d_large) {
7900   test_linear({23, 17, 41}, {15, 41}, {15});
7901 }
7902 
7903 TEST_F(VulkanAPITest, linear_4d_flat) {
7904   test_linear({1, 1, 1, 37}, {41, 37}, {41});
7905 }
7906 
7907 TEST_F(VulkanAPITest, linear_4d_small) {
7908   test_linear({2, 3, 4, 5}, {6, 5}, {6});
7909 }
7910 
7911 TEST_F(VulkanAPITest, linear_4d_large) {
7912   test_linear({9, 13, 11, 17}, {23, 17}, {23});
7913 }
7914 
7915 TEST_F(VulkanAPITest, lstm_success) {
7916   // Arrange
7917   const int input_size = 5;
7918   const int hidden_size = 7;
7919   const int num_layers = 4;
7920   const int L = 1;
7921   const int N = 1;
7922   const double lstm_dropout = .0;
7923   const bool has_biases = true;
7924   const bool train = false;
7925   const bool bidirectional = false;
7926   const bool batch_first = true;
7927   const auto in_cpu = at::rand({N, L, input_size}, at::device(at::kCPU).dtype(at::kFloat));
7928   const auto h0_cpu = at::rand({num_layers, N, hidden_size}, at::device(at::kCPU).dtype(at::kFloat));
7929   const auto c0_cpu = at::rand({num_layers, N, hidden_size}, at::device(at::kCPU).dtype(at::kFloat));
7930 
7931   c10::List<at::Tensor> weight_ih_l; // shape (4 * hidden_size, input_size)
7932   c10::List<at::Tensor> weight_hh_l; // shape (4 * hidden_size, hidden_size)
7933   c10::List<at::Tensor> bias_ih_l;   // shape (4 * hidden_size)
7934   c10::List<at::Tensor> bias_hh_l;   // shape (4 * hidden_size)
7935   for (int l = 0; l < num_layers; ++l) {
7936     if (l == 0) {
7937       weight_ih_l.emplace_back(at::rand({4 * hidden_size, input_size}, at::device(at::kCPU).dtype(at::kFloat)));
7938     } else {
7939       weight_ih_l.emplace_back(at::rand({4 * hidden_size, hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
7940     }
7941     weight_hh_l.emplace_back(at::rand({4 * hidden_size, hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
7942     bias_ih_l.emplace_back(at::rand({4 * hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
7943     bias_hh_l.emplace_back(at::rand({4 * hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
7944   }
7945 
7946   // put this guard here to run inference instead of training
7947   // to avoid the following error:
7948   //     C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
7949   //     If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
7950   c10::InferenceMode mode;
7951 
7952   // Act
7953   const auto out_cpu = at::lstm(in_cpu, {h0_cpu, c0_cpu},
7954       { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0],
7955         weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1],
7956         weight_ih_l[2], weight_hh_l[2], bias_ih_l[2], bias_hh_l[2],
7957         weight_ih_l[3], weight_hh_l[3], bias_ih_l[3], bias_hh_l[3] },
7958       has_biases, num_layers, lstm_dropout, train, bidirectional, batch_first);
7959 
7960   // weights/biases should always be on CPU.
7961   const auto out_vulkan = at::lstm(in_cpu.vulkan(), {h0_cpu.vulkan(), c0_cpu.vulkan()},
7962       { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
7963         weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1),
7964         weight_ih_l.get(2), weight_hh_l.get(2), bias_ih_l.get(2), bias_hh_l.get(2),
7965         weight_ih_l.get(3), weight_hh_l.get(3), bias_ih_l.get(3), bias_hh_l.get(3) },
7966       has_biases, num_layers, lstm_dropout, train, bidirectional, batch_first);
7967 
7968   auto cpu_output = std::get<0>(out_cpu);
7969   auto cpu_hidden = std::get<1>(out_cpu);
7970   auto cpu_cell = std::get<2>(out_cpu);
7971   auto vulkan_output = std::get<0>(out_vulkan);
7972   auto vulkan_hidden = std::get<1>(out_vulkan);
7973   auto vulkan_cell = std::get<2>(out_vulkan);
7974 
7975   // Assert
7976   const auto check_output = almostEqual(cpu_output, vulkan_output.cpu());
7977   if (!check_output) {
7978     showRtol(cpu_output, vulkan_output.cpu());
7979   }
7980   ASSERT_TRUE(check_output);
7981 
7982   const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu());
7983   if (!check_hidden) {
7984     showRtol(cpu_hidden, vulkan_hidden.cpu());
7985   }
7986   ASSERT_TRUE(check_hidden);
7987 
7988   const auto check_cell = almostEqual(cpu_cell, vulkan_cell.cpu());
7989   if (!check_cell) {
7990     showRtol(cpu_cell, vulkan_cell.cpu());
7991   }
7992   ASSERT_TRUE(check_cell);
7993 }
7994 
7995 TEST_F(VulkanAPITest, lstm_mclareninputs_success) {
7996   // Arrange
7997   const int input_size = 384;
7998   const int hidden_size = 384;
7999   const int num_layers = 2;
8000   const int L = 1;
8001   const int N = 1;
8002   const double lstm_dropout = .0;
8003   const bool has_biases = true;
8004   const bool train = false;
8005   const bool bidirectional = false;
8006   const bool batch_first = true;
8007   const auto in_cpu = at::rand({N, L, input_size}, at::device(at::kCPU).dtype(at::kFloat));
8008   const auto h0_cpu = at::rand({num_layers, N, hidden_size}, at::device(at::kCPU).dtype(at::kFloat));
8009   const auto c0_cpu = at::rand({num_layers, N, hidden_size}, at::device(at::kCPU).dtype(at::kFloat));
8010 
8011   c10::List<at::Tensor> weight_ih_l; // shape (4 * hidden_size, input_size)
8012   c10::List<at::Tensor> weight_hh_l; // shape (4 * hidden_size, hidden_size)
8013   c10::List<at::Tensor> bias_ih_l;   // shape (4 * hidden_size)
8014   c10::List<at::Tensor> bias_hh_l;   // shape (4 * hidden_size)
8015   for (int l = 0; l < num_layers; ++l) {
8016     if (l == 0) {
8017       weight_ih_l.emplace_back(at::rand({4 * hidden_size, input_size}, at::device(at::kCPU).dtype(at::kFloat)));
8018     } else {
8019       weight_ih_l.emplace_back(at::rand({4 * hidden_size, hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
8020     }
8021     weight_hh_l.emplace_back(at::rand({4 * hidden_size, hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
8022     bias_ih_l.emplace_back(at::rand({4 * hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
8023     bias_hh_l.emplace_back(at::rand({4 * hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
8024   }
8025 
8026   // put this guard here to run inference instead of training
8027   // to avoid the following error:
8028   //     C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
8029   //     If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
8030   c10::InferenceMode mode;
8031 
8032   // Act
8033   const auto out_cpu = at::lstm(in_cpu, {h0_cpu, c0_cpu},
8034       { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0],
8035         weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1] },
8036       has_biases, num_layers, lstm_dropout, train, bidirectional, batch_first);
8037 
8038   // weights/biases should always be on CPU.
8039   const auto out_vulkan = at::lstm(in_cpu.vulkan(), {h0_cpu.vulkan(), c0_cpu.vulkan()},
8040       { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
8041         weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
8042       has_biases, num_layers, lstm_dropout, train, bidirectional, batch_first);
8043 
8044   auto cpu_output = std::get<0>(out_cpu);
8045   auto cpu_hidden = std::get<1>(out_cpu);
8046   auto cpu_cell = std::get<2>(out_cpu);
8047   auto vulkan_output = std::get<0>(out_vulkan);
8048   auto vulkan_hidden = std::get<1>(out_vulkan);
8049   auto vulkan_cell = std::get<2>(out_vulkan);
8050 
8051   // Assert
8052   const auto check_output = almostEqual(cpu_output, vulkan_output.cpu());
8053   if (!check_output) {
8054     showRtol(cpu_output, vulkan_output.cpu());
8055   }
8056   ASSERT_TRUE(check_output);
8057 
8058   const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu());
8059   if (!check_hidden) {
8060     showRtol(cpu_hidden, vulkan_hidden.cpu());
8061   }
8062   ASSERT_TRUE(check_hidden);
8063 
8064   const auto check_cell = almostEqual(cpu_cell, vulkan_cell.cpu());
8065   if (!check_cell) {
8066     showRtol(cpu_cell, vulkan_cell.cpu());
8067   }
8068   ASSERT_TRUE(check_cell);
8069 }
8070 
8071 TEST_F(VulkanAPITest, lstm_prepack_success) {
8072   // Arrange
8073   const int input_size = 81;
8074   const int hidden_size = 10;
8075   const int num_layers = 2;
8076   const int L = 1;
8077   const int N = 1;
8078   const double lstm_dropout = .0;
8079   const bool has_biases = true;
8080   const bool train = false;
8081   const bool bidirectional = false;
8082   const bool batch_first = true;
8083   const auto in_cpu = at::rand({N, L, input_size}, at::device(at::kCPU).dtype(at::kFloat));
8084   const auto h0_cpu = at::rand({num_layers, N, hidden_size}, at::device(at::kCPU).dtype(at::kFloat));
8085   const auto c0_cpu = at::rand({num_layers, N, hidden_size}, at::device(at::kCPU).dtype(at::kFloat));
8086 
8087   c10::List<at::Tensor> weight_ih_l; // shape (4 * hidden_size, l == 0 ? input_size : hidden_size)
8088   c10::List<at::Tensor> weight_hh_l; // shape (4 * hidden_size, hidden_size)
8089   c10::List<at::Tensor> bias_ih_l;   // shape (4 * hidden_size)
8090   c10::List<at::Tensor> bias_hh_l;   // shape (4 * hidden_size)
8091   for (int l = 0; l < num_layers; ++l) {
8092     if (l == 0) {
8093       weight_ih_l.emplace_back(at::rand({4 * hidden_size, input_size}, at::device(at::kCPU).dtype(at::kFloat)));
8094     } else {
8095       weight_ih_l.emplace_back(at::rand({4 * hidden_size, hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
8096     }
8097     weight_hh_l.emplace_back(at::rand({4 * hidden_size, hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
8098     bias_ih_l.emplace_back(at::rand({4 * hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
8099     bias_hh_l.emplace_back(at::rand({4 * hidden_size}, at::device(at::kCPU).dtype(at::kFloat)));
8100   }
8101 
8102   // put this guard here to run inference instead of training
8103   // to avoid the following error:
8104   //     C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
8105   //     If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
8106   c10::InferenceMode mode;
8107 
8108   // Act
8109   const auto out_cpu = at::lstm(in_cpu, {h0_cpu, c0_cpu},
8110       { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0],
8111         weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1] },
8112       has_biases, num_layers, lstm_dropout, train, bidirectional, batch_first);
8113 
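  // Pack the weights/biases once with create_lstm_context, then execute the LSTM
  // against the packed context with run_lstm_context.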
8114   auto prepack = callOpByName(
8115       "vulkan_prepack::create_lstm_context",
8116       "",
8117       std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
8118                                 weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }),
8119       has_biases, num_layers, lstm_dropout, train, bidirectional, batch_first);
8120 
8121   auto out_vulkan = callOpByName(
8122       "vulkan_prepack::run_lstm_context",
8123       "",
8124       in_cpu.vulkan(), h0_cpu.vulkan(), c0_cpu.vulkan(), prepack[0]);
8125 
8126   auto cpu_output = std::get<0>(out_cpu);
8127   auto cpu_hidden = std::get<1>(out_cpu);
8128   auto cpu_cell = std::get<2>(out_cpu);
8129   auto vulkan_output = out_vulkan[0].toTensor();
8130   auto vulkan_hidden = out_vulkan[1].toTensor();
8131   auto vulkan_cell = out_vulkan[2].toTensor();
8132 
8133   // Assert
8134   const auto check_output = almostEqual(cpu_output, vulkan_output.cpu());
8135   if (!check_output) {
8136     showRtol(cpu_output, vulkan_output.cpu());
8137   }
8138   ASSERT_TRUE(check_output);
8139 
8140   const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu());
8141   if (!check_hidden) {
8142     showRtol(cpu_hidden, vulkan_hidden.cpu());
8143   }
8144   ASSERT_TRUE(check_hidden);
8145 
8146   const auto check_cell = almostEqual(cpu_cell, vulkan_cell.cpu());
8147   if (!check_cell) {
8148     showRtol(cpu_cell, vulkan_cell.cpu());
8149   }
8150   ASSERT_TRUE(check_cell);
8151 }
8152 
8153 TEST_F(VulkanAPITest, querypool_flushed_shader_log) {
8154 #if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
8155   const bool op_profiling_enabled_initially =
8156       at::native::vulkan::api::context()->op_profiling_enabled();
8157 
8158   at::native::vulkan::api::context()->enable_op_profiling();
8159 
8160   const at::Tensor a_add_cpu =
8161       at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
8162   const at::Tensor a_add_vulkan = a_add_cpu.vulkan();
8163 
8164   const at::Tensor b_add_cpu =
8165       at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
8166   const at::Tensor b_add_vulkan = b_add_cpu.vulkan();
8167 
8168   at::add(a_add_vulkan, b_add_vulkan, 2.1f).cpu();
8169 
8170   at::native::vulkan::api::context()->querypool().extract_results();
8171   at::native::vulkan::api::context()->reset_querypool();
8172 
8173   const at::Tensor a_sub_cpu =
8174       at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
8175   const at::Tensor a_sub_vulkan = a_sub_cpu.vulkan();
8176 
8177   const at::Tensor b_sub_cpu =
8178       at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
8179   const at::Tensor b_sub_vulkan = b_sub_cpu.vulkan();
8180 
8181   at::sub(a_sub_vulkan, b_sub_vulkan, 2.1f).cpu();
8182 
8183   at::native::vulkan::api::context()->querypool().extract_results();
8184   at::native::vulkan::api::context()->reset_querypool();
8185 
8186   const at::Tensor a_mul_cpu =
8187       at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
8188   const at::Tensor a_mul_vulkan = a_mul_cpu.vulkan();
8189 
8190   const at::Tensor b_mul_cpu =
8191       at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat));
8192   const at::Tensor b_mul_vulkan = b_mul_cpu.vulkan();
8193 
8194   at::mul(a_mul_vulkan, b_mul_vulkan).cpu();
8195 
8196   /*
8197     The most recent shaders should be
8198     (-12) vulkan.nchw_to_image
8199     (-11) vulkan.nchw_to_image
8200     (-10) vulkan.add
8201     (-9)  vulkan.image_to_nchw
8202 
8203     (-8)  vulkan.nchw_to_image
8204     (-7)  vulkan.nchw_to_image
8205     (-6)  vulkan.sub
8206     (-5)  vulkan.image_to_nchw
8207 
8208     (-4)  vulkan.nchw_to_image
8209     (-3)  vulkan.nchw_to_image
8210     (-2)  vulkan.mul
8211     (-1)  vulkan.image_to_nchw
8212   */
8213 
8214   const size_t entry_count =
8215       at::native::vulkan::api::context()->querypool().shader_logs_entry_count();
8216 
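  // Per the shader log layout above, the add/sub/mul dispatches sit at offsets
  // entry_count - 10, entry_count - 6, and entry_count - 2 from the end of the log.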
8217   std::tuple<std::string, uint64_t> add_shader_details =
8218       at::native::vulkan::api::context()
8219           ->querypool()
8220           .get_shader_name_and_execution_duration_ns(entry_count - 10);
8221   std::tuple<std::string, uint64_t> sub_shader_details =
8222       at::native::vulkan::api::context()
8223           ->querypool()
8224           .get_shader_name_and_execution_duration_ns(entry_count - 6);
8225   std::tuple<std::string, uint64_t> mul_shader_details =
8226       at::native::vulkan::api::context()
8227           ->querypool()
8228           .get_shader_name_and_execution_duration_ns(entry_count - 2);
8229 
8230   EXPECT_EQ(std::get<0>(add_shader_details), "vulkan.add");
8231   EXPECT_EQ(std::get<0>(sub_shader_details), "vulkan.sub");
8232   EXPECT_EQ(std::get<0>(mul_shader_details), "vulkan.mul");
8233 
8234   if (!op_profiling_enabled_initially) {
8235     at::native::vulkan::api::context()->reset_querypool();
8236     at::native::vulkan::api::context()->disable_op_profiling();
8237   }
8238 #else
8239   GTEST_SKIP() << "QueryPool is not available";
8240 #endif
8241 }
8242 
8243 } // namespace
8244 
8245 #endif /* USE_VULKAN_API */
8246