xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/gpu/metal/kernels/conv_test.mm (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7    http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
#include "tensorflow/lite/delegates/gpu/metal/compute_task.h"

#import <XCTest/XCTest.h>

#include <chrono>
#include <cmath>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/conv_constants_test_util.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/conv_generic.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/conv_generic_test_util.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/conv_metal_simd.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/winograd.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
#include "tensorflow/lite/delegates/gpu/metal/metal_spatial_tensor.h"
37
38@interface ConvTest : XCTestCase
39@end
40
41@implementation ConvTest {
42  tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
43}
44
45namespace tflite {
46namespace gpu {
47namespace metal {
48
49absl::Status Winograd4x4To6x6Test(TestExecutionEnvironment* env) {
50  const int src_channels = 7;
51  const int dst_channels = 13;
52  Convolution2DAttributes attr;
53  attr.padding.prepended = HW(0, 0);
54  attr.padding.appended = HW(10, 10);
55  attr.strides = HW(1, 1);
56  attr.dilations = HW(1, 1);
57  attr.weights.shape = OHWI(dst_channels, 3, 3, src_channels);
58  attr.weights.data.resize(attr.weights.shape.DimensionsProduct());
59  for (int i = 0; i < attr.weights.data.size(); ++i) {
60    attr.weights.data[i] = sin(i);
61  }
62  attr.bias.shape = Linear(dst_channels);
63  attr.bias.data.resize(attr.bias.shape.DimensionsProduct());
64  for (int i = 0; i < attr.bias.data.size(); ++i) {
65    attr.bias.data[i] = sin(i);
66  }
67
68  auto src_shape = BHWC(1, 17, 13, src_channels);
69  auto dst_shape = CalculateOutputShape(src_shape, attr);
70  int new_width = src_shape.w + attr.padding.prepended.w + attr.padding.appended.w - 2;
71  int new_height = src_shape.h + attr.padding.prepended.h + attr.padding.appended.h - 2;
72  BHWC conv_shape;
73  conv_shape.b = dst_shape.b;
74  conv_shape.h = 36;
75  conv_shape.w = DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
76  conv_shape.c = dst_shape.c;
77
78  TensorFloat32 src_tensor;
79  src_tensor.shape = src_shape;
80  src_tensor.data.resize(src_tensor.shape.DimensionsProduct());
81  for (int i = 0; i < src_tensor.data.size(); ++i) {
82    src_tensor.data[i] = sin(i);
83  }
84
85  for (auto precision : env->GetSupportedPrecisions()) {
86    auto data_type = DeduceDataTypeFromPrecision(precision);
87    for (auto storage : env->GetSupportedStorages(data_type)) {
88      const float eps = precision == CalculationsPrecision::F32 ? 1e-4f : 0.4f;
89      OperationDef op_def;
90      op_def.precision = precision;
91      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
92      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
93
94      TensorFloat32 output0;
95      auto gpu_op0 = CreateConvGeneric(env->GetGpuInfo(), op_def, attr, &dst_shape);
96      auto op0_ptr = std::make_unique<ConvGeneric>(std::move(gpu_op0));
97      RETURN_IF_ERROR(
98          env->ExecuteGPUOperation(src_tensor, std::move(op0_ptr), dst_shape, &output0));
99
100      auto gpu_op1 = CreateWinograd4x4To36(op_def, attr.padding, env->GetGpuInfo());
101      std::unique_ptr<GPUOperation> op1_ptr = std::make_unique<Winograd4x4To36>(std::move(gpu_op1));
102
103      auto gpu_op2 = CreateConvGenericWino4x4To6x6(env->GetGpuInfo(), op_def, attr, &conv_shape);
104      auto op2_ptr = std::make_unique<ConvGeneric>(std::move(gpu_op2));
105
106      auto gpu_op3 = CreateWinograd36To4x4(op_def, attr.bias);
107      std::unique_ptr<GPUOperation> op3_ptr = std::make_unique<Winograd36To4x4>(std::move(gpu_op3));
108
109      TensorFloat32 output1;
110      BHWC output1_shape = conv_shape;
111      output1_shape.c = src_shape.c;
112      RETURN_IF_ERROR(
113          env->ExecuteGPUOperation(src_tensor, std::move(op1_ptr), output1_shape, &output1));
114
115      TensorFloat32 output2;
116      BHWC output2_shape = conv_shape;
117      RETURN_IF_ERROR(
118          env->ExecuteGPUOperation(output1, std::move(op2_ptr), output2_shape, &output2));
119
120      TensorFloat32 output3;
121      BHWC output3_shape = dst_shape;
122      RETURN_IF_ERROR(
123          env->ExecuteGPUOperation(output2, std::move(op3_ptr), output3_shape, &output3));
124
125      RETURN_IF_ERROR(PointWiseNear(output0.data, output3.data, eps))
126          << "Failed using precision " << ToString(precision);
127    }
128  }
129  return absl::OkStatus();
130}
131
// Checks a grouped 1x1 convolution (groups = 2) against hand-computed values:
// output channels 0..3 see only input channels 0..3, and channels 4..7 see
// only input channels 4..7 (e.g. out[0] = 1*0 + 2*1 + 3*2 + 4*3 = 20).
absl::Status ConvolutionGroupedTest(TestExecutionEnvironment* env) {
  TensorFloat32 input;
  input.shape = BHWC(1, 1, 1, 8);
  input.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};

  Convolution2DAttributes attr;
  attr.groups = 2;
  attr.padding.prepended = HW(0, 0);
  attr.padding.appended = HW(0, 0);
  attr.strides = HW(1, 1);
  attr.dilations = HW(1, 1);
  // 8 output channels; each consumes the 4 input channels of its group.
  attr.weights.shape = OHWI(8, 1, 1, 4);
  attr.weights.data = {1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,
                       9.0f,  10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
                       17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f,
                       25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f};
  attr.bias.shape = Linear(8);
  attr.bias.data = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};

  for (auto precision : env->GetSupportedPrecisions()) {
    const auto data_type = DeduceDataTypeFromPrecision(precision);
    for (auto storage : env->GetSupportedStorages(data_type)) {
      const float tolerance = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
      OperationDef op_def;
      op_def.precision = precision;
      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      auto output_shape = BHWC(1, 1, 1, 8);
      ConvGeneric conv = CreateConvGeneric(env->GetGpuInfo(), op_def, attr, &output_shape);
      TensorFloat32 output;
      RETURN_IF_ERROR(env->ExecuteGPUOperation(
          input, std::make_unique<ConvGeneric>(std::move(conv)), output_shape, &output));
      RETURN_IF_ERROR(PointWiseNear({20.0f, 44.0f, 68.0f, 92.0f, 412.0f, 500.0f, 588.0f, 676.0f},
                                    output.data, tolerance))
          << "Failed using precision " << ToString(precision);
    }
  }
  return absl::OkStatus();
}
170
// Validates the SIMD matrix-multiply convolution kernel on a 1x1 convolution
// of a 1x32x32x1024 tensor (1024 output channels) against a ConvGeneric
// reference computed at F32 precision in BUFFER storage. Combinations the
// SIMD kernel does not support are skipped rather than failed.
absl::Status ConvolutionSimdMatrixMultiplyTest(TestExecutionEnvironment* env) {
  TensorFloat32 src_tensor;
  src_tensor.shape = BHWC(1, 32, 32, 1024);
  const BHWC dst_shape(1, 32, 32, 1024);
  src_tensor.data.resize(src_tensor.shape.DimensionsProduct());
  // Deterministic pseudo-random fill so failures are reproducible.
  for (int i = 0; i < src_tensor.data.size(); ++i) {
    src_tensor.data[i] = sin(0.01f * i);
  }

  // 1x1 convolution with sin-generated weights and bias.
  Convolution2DAttributes attr;
  attr.padding.prepended = HW(0, 0);
  attr.padding.appended = HW(0, 0);
  attr.strides = HW(1, 1);
  attr.dilations = HW(1, 1);
  attr.weights.shape = OHWI(dst_shape.c, 1, 1, src_tensor.shape.c);
  attr.weights.data.resize(attr.weights.shape.DimensionsProduct());
  for (int i = 0; i < attr.weights.data.size(); ++i) {
    attr.weights.data[i] = sin(0.1f * i);
  }
  attr.bias.shape = Linear(dst_shape.c);
  attr.bias.data.resize(attr.bias.shape.DimensionsProduct());
  for (int i = 0; i < attr.bias.data.size(); ++i) {
    attr.bias.data[i] = sin(0.1f * i);
  }

  // Reference result from the generic convolution at full F32 precision.
  TensorFloat32 dst_tensor_ref;
  {
    OperationDef op_def;
    op_def.precision = CalculationsPrecision::F32;
    auto data_type = DeduceDataTypeFromPrecision(op_def.precision);
    op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::HWC});
    op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::HWC});
    ConvGeneric operation = CreateConvGeneric(env->GetGpuInfo(), op_def, attr, &dst_shape);
    RETURN_IF_ERROR(env->ExecuteGPUOperation(src_tensor,
                                             std::make_unique<ConvGeneric>(std::move(operation)),
                                             dst_shape, &dst_tensor_ref));
  }
  for (auto precision : env->GetSupportedPrecisions()) {
    auto data_type = DeduceDataTypeFromPrecision(precision);
    for (auto storage : env->GetSupportedStorages(data_type)) {
      // Reduced precision accumulating over 1024 channels needs a loose bound.
      const float eps = precision == CalculationsPrecision::F32 ? 4e-5f : 0.4f;
      OperationDef op_def;
      op_def.precision = precision;
      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      // The SIMD kernel only supports certain gpu/def/attr combinations.
      if (!IsConvolutionMetalSimdSupported(env->GetGpuInfo(), op_def, attr)) {
        continue;
      }
      TensorFloat32 dst_tensor_simd;
      ConvolutionMetalSimd operation_simd =
          CreateConvolutionMetalSimd(op_def, dst_shape, attr, env->GetGpuInfo());
      RETURN_IF_ERROR(env->ExecuteGPUOperation(
          src_tensor, std::make_unique<ConvolutionMetalSimd>(std::move(operation_simd)), dst_shape,
          &dst_tensor_simd));
      RETURN_IF_ERROR(PointWiseNear(dst_tensor_ref.data, dst_tensor_simd.data, eps))
          << "Failed using precision " << ToString(precision);
    }
  }
  return absl::OkStatus();
}
231
// Batched variant of the SIMD matrix-multiply check: batch = 8 with BHWC
// tensor layouts, comparing ConvolutionMetalSimd against a ConvGeneric
// reference computed at F32 precision. Unsupported combinations are skipped.
absl::Status ConvolutionSimdMatrixMultiplyBatchTest(TestExecutionEnvironment* env) {
  // Deterministic input: batch of 8, 8x8 spatial, 128 channels.
  TensorFloat32 input;
  input.shape = BHWC(8, 8, 8, 128);
  const BHWC output_shape(8, 8, 8, 256);
  input.data.resize(input.shape.DimensionsProduct());
  for (size_t i = 0; i < input.data.size(); ++i) {
    input.data[i] = sin(0.01f * i);
  }

  // 1x1 convolution attributes with sin-generated weights and bias.
  Convolution2DAttributes attr;
  attr.padding.prepended = HW(0, 0);
  attr.padding.appended = HW(0, 0);
  attr.strides = HW(1, 1);
  attr.dilations = HW(1, 1);
  attr.weights.shape = OHWI(output_shape.c, 1, 1, input.shape.c);
  attr.weights.data.resize(attr.weights.shape.DimensionsProduct());
  for (size_t i = 0; i < attr.weights.data.size(); ++i) {
    attr.weights.data[i] = sin(0.1f * i);
  }
  attr.bias.shape = Linear(output_shape.c);
  attr.bias.data.resize(attr.bias.shape.DimensionsProduct());
  for (size_t i = 0; i < attr.bias.data.size(); ++i) {
    attr.bias.data[i] = sin(0.1f * i);
  }

  // Reference result from ConvGeneric at full F32 precision.
  TensorFloat32 reference;
  {
    OperationDef ref_def;
    ref_def.precision = CalculationsPrecision::F32;
    const auto ref_type = DeduceDataTypeFromPrecision(ref_def.precision);
    ref_def.src_tensors.push_back({ref_type, TensorStorageType::BUFFER, Layout::BHWC});
    ref_def.dst_tensors.push_back({ref_type, TensorStorageType::BUFFER, Layout::BHWC});
    ConvGeneric ref_op = CreateConvGeneric(env->GetGpuInfo(), ref_def, attr, &output_shape);
    RETURN_IF_ERROR(env->ExecuteGPUOperation(
        input, std::make_unique<ConvGeneric>(std::move(ref_op)), output_shape, &reference));
  }

  // Compare the SIMD kernel against the reference for every supported
  // precision/storage pair where the kernel is applicable.
  for (auto precision : env->GetSupportedPrecisions()) {
    const auto data_type = DeduceDataTypeFromPrecision(precision);
    for (auto storage : env->GetSupportedStorages(data_type)) {
      const float tolerance = precision == CalculationsPrecision::F32 ? 8e-6f : 0.2f;
      OperationDef op_def;
      op_def.precision = precision;
      op_def.src_tensors.push_back({data_type, storage, Layout::BHWC});
      op_def.dst_tensors.push_back({data_type, storage, Layout::BHWC});
      if (!IsConvolutionMetalSimdSupported(env->GetGpuInfo(), op_def, attr)) {
        continue;
      }
      ConvolutionMetalSimd simd_op =
          CreateConvolutionMetalSimd(op_def, output_shape, attr, env->GetGpuInfo());
      TensorFloat32 actual;
      RETURN_IF_ERROR(env->ExecuteGPUOperation(
          input, std::make_unique<ConvolutionMetalSimd>(std::move(simd_op)), output_shape,
          &actual));
      RETURN_IF_ERROR(PointWiseNear(reference.data, actual.data, tolerance))
          << "Failed using precision " << ToString(precision);
    }
  }
  return absl::OkStatus();
}
292
// Rough throughput benchmark for ConvolutionMetalSimd on a 1x1 convolution of
// a 1x32x32x1024 tensor. Encodes the kernel 100 times per command buffer,
// repeats for 10 command buffers, and prints per-dispatch time, achieved
// GFLOPS, and utilization relative to an estimated ALU peak. It asserts
// nothing about the results and returns OK unless setup fails.
absl::Status ConvolutionSimdMatrixMultiplyPerfTest() {
  const BHWC src_shape(1, 32, 32, 1024);
  const BHWC dst_shape(1, 32, 32, 1024);
  // 1x1 convolution with deterministic sin-generated weights and bias.
  Convolution2DAttributes attr;
  attr.padding.prepended = HW(0, 0);
  attr.padding.appended = HW(0, 0);
  attr.strides = HW(1, 1);
  attr.dilations = HW(1, 1);
  attr.weights.shape = OHWI(dst_shape.c, 1, 1, src_shape.c);
  attr.weights.data.resize(attr.weights.shape.DimensionsProduct());
  for (int i = 0; i < attr.weights.data.size(); ++i) {
    attr.weights.data[i] = sin(0.1f * i);
  }
  attr.bias.shape = Linear(dst_shape.c);
  attr.bias.data.resize(attr.bias.shape.DimensionsProduct());
  for (int i = 0; i < attr.bias.data.size(); ++i) {
    attr.bias.data[i] = sin(0.1f * i);
  }

  // Builds the operation directly against the default Metal device; tensors
  // are managed manually here instead of going through the test environment.
  MetalDevice device;
  OperationDef op_def;
  op_def.precision = CalculationsPrecision::F32;
  auto data_type = DeduceDataTypeFromPrecision(op_def.precision);
  op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::HWC});
  op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::HWC});
  ConvolutionMetalSimd operation_simd =
      CreateConvolutionMetalSimd(op_def, dst_shape, attr, device.GetInfo());
  auto op_ptr = std::make_unique<ConvolutionMetalSimd>(std::move(operation_simd));

  // Allocate GPU-side src/dst tensors with the benchmark shapes.
  MetalSpatialTensor src_gpu, dst_gpu;
  TensorDescriptor descriptor_with_shape = op_def.src_tensors[0];
  descriptor_with_shape.SetBHWCShape(src_shape);
  RETURN_IF_ERROR(CreateTensor(device.device(), descriptor_with_shape, &src_gpu));
  descriptor_with_shape = op_def.dst_tensors[0];
  descriptor_with_shape.SetBHWCShape(dst_shape);
  RETURN_IF_ERROR(CreateTensor(device.device(), descriptor_with_shape, &dst_gpu));

  RETURN_IF_ERROR(op_ptr->AssembleCode(device.GetInfo()));

  // Compile the compute task and bind tensors/uniforms once up front so the
  // timed loop only measures dispatch execution.
  ComputeTask gpu_task;
  gpu_task.Init(std::move(op_ptr));
  RETURN_IF_ERROR(gpu_task.Compile(&device));
  gpu_task.SetSrcTensor(&src_gpu, 0);
  gpu_task.SetDstTensor(&dst_gpu, 0);
  RETURN_IF_ERROR(gpu_task.UpdateParams());

  // 2 ops (multiply + accumulate) per output element per input channel.
  const double ops_count = 2.0 * dst_shape.w * dst_shape.h * dst_shape.c * attr.weights.shape.i;
  const double gops_count = ops_count * 1e-9;
  id<MTLCommandQueue> command_queue = [device.device() newCommandQueue];
  const int iterations = 10;
  const int iteration_size = 100;
  // Per-compute-unit ALU throughput estimates in GFLOPS (presumably empirical
  // figures for Apple GPUs); the code doubles the FP32 figure on A15.
  double alu_fp32_gflops_per_cu = 162.0;
  if (device.GetInfo().apple_info.gpu_type == AppleGpu::kA15) {
    alu_fp32_gflops_per_cu *= 2.0;
  }
  // NOTE(review): the FP16 estimate is not scaled up for A15 the way FP32 is —
  // confirm whether that is intentional (only F32 is used above, so the FP16
  // branch is currently dead).
  double alu_fp16_gflops_per_cu = 162.0 * 2.0;
  double alu_gflops_per_gpu;
  if (op_def.precision == CalculationsPrecision::F32) {
    alu_gflops_per_gpu =
        alu_fp32_gflops_per_cu * device.GetInfo().apple_info.GetComputeUnitsCount();
  } else {
    alu_gflops_per_gpu =
        alu_fp16_gflops_per_cu * device.GetInfo().apple_info.GetComputeUnitsCount();
  }
  for (int i = 0; i < iterations; ++i) {
    @autoreleasepool {
      // Batch `iteration_size` dispatches into one command buffer so per-buffer
      // commit/wait overhead is amortized across the measurement.
      id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer];
      for (int j = 0; j < iteration_size; ++j) {
        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
        gpu_task.Encode(encoder);
        [encoder endEncoding];
      }
      // Wall-clock time from commit to GPU completion, averaged per dispatch.
      const auto start = std::chrono::high_resolution_clock::now();
      [command_buffer commit];
      [command_buffer waitUntilCompleted];
      const auto end = std::chrono::high_resolution_clock::now();
      const std::chrono::duration<double> diff = end - start;
      const double execution_time_ms = diff.count() / static_cast<double>(iteration_size) * 1000.0;
      const double fps = 1000.0 / execution_time_ms;
      const double pers = gops_count * fps / alu_gflops_per_gpu * 100.0;
      std::cout << execution_time_ms << " ms, " << gops_count * fps << " GFLOPS(" << pers << "%)"
                << std::endl;
    }
  }

  return absl::OkStatus();
}
380
381}  // namespace metal
382}  // namespace gpu
383}  // namespace tflite
384
// Runs the Winograd pipeline cross-check; fails with the status message on error.
- (void)testWinograd4x4To6x6 {
  // `const auto` matches the convention of the other test methods in this file.
  const auto status = tflite::gpu::metal::Winograd4x4To6x6Test(&exec_env_);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
389
// Runs the grouped-convolution check; fails with the status message on error.
- (void)testGroupedConvolution {
  // `const auto` matches the convention of the other test methods in this file.
  const auto status = tflite::gpu::metal::ConvolutionGroupedTest(&exec_env_);
  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
394
// Delegates to the shared C++ helper; fails the test if the status is not OK.
- (void)testConvGeneric1x1SimpleWeights {
  const absl::Status result = ConvGeneric1x1SimpleWeightsTest(&exec_env_);
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
399
// Delegates to the shared C++ helper; fails the test if the status is not OK.
- (void)testConvGeneric1x1 {
  const absl::Status result = ConvGeneric1x1Test(&exec_env_);
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
404
// Delegates to the shared C++ helper; fails the test if the status is not OK.
- (void)testConvGenericSimpleWeights {
  const absl::Status result = ConvGenericSimpleWeightsTest(&exec_env_);
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
409
// Delegates to the shared C++ helper; fails the test if the status is not OK.
- (void)testConvGeneric {
  const absl::Status result = ConvGenericTest(&exec_env_);
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
414
// Delegates to the shared C++ helper; fails the test if the status is not OK.
- (void)testConvGenericGrouped {
  const absl::Status result = ConvGenericGroupedTest(&exec_env_);
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
419
// Delegates to the shared C++ helper; fails the test if the status is not OK.
- (void)testConvConstantsSimpleWeights {
  const absl::Status result = ConvConstantsSimpleWeightsTest(&exec_env_);
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
424
// Delegates to the shared C++ helper; fails the test if the status is not OK.
- (void)testConvConstants {
  const absl::Status result = ConvConstantsTest(&exec_env_);
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
429
// Delegates to the SIMD matmul correctness check; fails if the status is not OK.
- (void)testConvSimdMatrixMultiply {
  const absl::Status result = tflite::gpu::metal::ConvolutionSimdMatrixMultiplyTest(&exec_env_);
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
434
// Delegates to the batched SIMD matmul check; fails if the status is not OK.
- (void)testConvSimdMatrixMultiplyBatch {
  const absl::Status result =
      tflite::gpu::metal::ConvolutionSimdMatrixMultiplyBatchTest(&exec_env_);
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
439
// Runs the SIMD matmul throughput benchmark; only fails on setup errors.
- (void)testConvSimdMatrixMultiplyPerf {
  const absl::Status result = tflite::gpu::metal::ConvolutionSimdMatrixMultiplyPerfTest();
  XCTAssertTrue(result.ok(), @"%s", std::string(result.message()).c_str());
}
444
445@end
446