/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/metal/compute_task.h"

#import <XCTest/XCTest.h>

#include <chrono>
#include <cmath>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/conv_constants_test_util.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/conv_generic.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/conv_generic_test_util.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/conv_metal_simd.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/winograd.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
#include "tensorflow/lite/delegates/gpu/metal/metal_spatial_tensor.h"

@interface ConvTest : XCTestCase
@end

@implementation ConvTest {
  tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
}

namespace tflite {
namespace gpu {
namespace metal {

// Checks the Winograd 4x4->6x6 convolution pipeline (Winograd4x4To36 ->
// ConvGenericWino4x4To6x6 -> Winograd36To4x4) against a direct ConvGeneric
// of the same 3x3 convolution, for every supported precision/storage combo.
absl::Status Winograd4x4To6x6Test(TestExecutionEnvironment* env) {
  const int src_channels = 7;
  const int dst_channels = 13;
  Convolution2DAttributes attr;
  attr.padding.prepended = HW(0, 0);
  attr.padding.appended = HW(10, 10);
  attr.strides = HW(1, 1);
  attr.dilations = HW(1, 1);
  attr.weights.shape = OHWI(dst_channels, 3, 3, src_channels);
  attr.weights.data.resize(attr.weights.shape.DimensionsProduct());
  // Deterministic pseudo-random weights/bias so both paths see identical data.
  for (int i = 0; i < attr.weights.data.size(); ++i) {
    attr.weights.data[i] = sin(i);
  }
  attr.bias.shape = Linear(dst_channels);
  attr.bias.data.resize(attr.bias.shape.DimensionsProduct());
  for (int i = 0; i < attr.bias.data.size(); ++i) {
    attr.bias.data[i] = sin(i);
  }

  auto src_shape = BHWC(1, 17, 13, src_channels);
  auto dst_shape = CalculateOutputShape(src_shape, attr);
  // Intermediate (Winograd-domain) shape: 36 transform elements per 4x4 output
  // tile; w holds the flattened tile count.
  int new_width = src_shape.w + attr.padding.prepended.w + attr.padding.appended.w - 2;
  int new_height = src_shape.h + attr.padding.prepended.h + attr.padding.appended.h - 2;
  BHWC conv_shape;
  conv_shape.b = dst_shape.b;
  conv_shape.h = 36;
  conv_shape.w = DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
  conv_shape.c = dst_shape.c;

  TensorFloat32 src_tensor;
  src_tensor.shape = src_shape;
  src_tensor.data.resize(src_tensor.shape.DimensionsProduct());
  for (int i = 0; i < src_tensor.data.size(); ++i) {
    src_tensor.data[i] = sin(i);
  }

  for (auto precision : env->GetSupportedPrecisions()) {
    auto data_type = DeduceDataTypeFromPrecision(precision);
    for (auto storage : env->GetSupportedStorages(data_type)) {
      // Winograd accumulates more rounding error, so FP16 tolerance is loose.
      const float eps = precision == CalculationsPrecision::F32 ? 1e-4f : 0.4f;
      OperationDef op_def;
      op_def.precision = precision;
      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});

      // Reference: direct (non-Winograd) convolution.
      TensorFloat32 output0;
      auto gpu_op0 = CreateConvGeneric(env->GetGpuInfo(), op_def, attr, &dst_shape);
      auto op0_ptr = std::make_unique<ConvGeneric>(std::move(gpu_op0));
      RETURN_IF_ERROR(
          env->ExecuteGPUOperation(src_tensor, std::move(op0_ptr), dst_shape, &output0));

      auto gpu_op1 = CreateWinograd4x4To36(op_def, attr.padding, env->GetGpuInfo());
      std::unique_ptr<GPUOperation> op1_ptr = std::make_unique<Winograd4x4To36>(std::move(gpu_op1));

      auto gpu_op2 = CreateConvGenericWino4x4To6x6(env->GetGpuInfo(), op_def, attr, &conv_shape);
      auto op2_ptr = std::make_unique<ConvGeneric>(std::move(gpu_op2));

      auto gpu_op3 = CreateWinograd36To4x4(op_def, attr.bias);
      std::unique_ptr<GPUOperation> op3_ptr = std::make_unique<Winograd36To4x4>(std::move(gpu_op3));

      // Stage 1: input transform to the Winograd domain.
      TensorFloat32 output1;
      BHWC output1_shape = conv_shape;
      output1_shape.c = src_shape.c;
      RETURN_IF_ERROR(
          env->ExecuteGPUOperation(src_tensor, std::move(op1_ptr), output1_shape, &output1));

      // Stage 2: element-wise convolution in the Winograd domain.
      TensorFloat32 output2;
      BHWC output2_shape = conv_shape;
      RETURN_IF_ERROR(
          env->ExecuteGPUOperation(output1, std::move(op2_ptr), output2_shape, &output2));

      // Stage 3: output transform back to spatial domain (adds bias).
      TensorFloat32 output3;
      BHWC output3_shape = dst_shape;
      RETURN_IF_ERROR(
          env->ExecuteGPUOperation(output2, std::move(op3_ptr), output3_shape, &output3));

      RETURN_IF_ERROR(PointWiseNear(output0.data, output3.data, eps))
          << "Failed using precision " << ToString(precision);
    }
  }
  return absl::OkStatus();
}

// Checks grouped convolution (groups=2): 8 input channels split into two
// groups of 4, each producing 4 of the 8 output channels.
absl::Status ConvolutionGroupedTest(TestExecutionEnvironment* env) {
  TensorFloat32 src_tensor;
  src_tensor.shape = BHWC(1, 1, 1, 8);
  src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};

  Convolution2DAttributes attr;
138 attr.groups = 2; 139 attr.padding.prepended = HW(0, 0); 140 attr.padding.appended = HW(0, 0); 141 attr.strides = HW(1, 1); 142 attr.dilations = HW(1, 1); 143 attr.weights.shape = OHWI(8, 1, 1, 4); 144 attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 145 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 146 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f}; 147 attr.bias.shape = Linear(8); 148 attr.bias.data = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; 149 150 for (auto precision : env->GetSupportedPrecisions()) { 151 auto data_type = DeduceDataTypeFromPrecision(precision); 152 for (auto storage : env->GetSupportedStorages(data_type)) { 153 const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; 154 OperationDef op_def; 155 op_def.precision = precision; 156 op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); 157 op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); 158 TensorFloat32 dst_tensor; 159 auto dst_shape = BHWC(1, 1, 1, 8); 160 ConvGeneric operation = CreateConvGeneric(env->GetGpuInfo(), op_def, attr, &dst_shape); 161 RETURN_IF_ERROR(env->ExecuteGPUOperation( 162 src_tensor, std::make_unique<ConvGeneric>(std::move(operation)), dst_shape, &dst_tensor)); 163 RETURN_IF_ERROR(PointWiseNear({20.0f, 44.0f, 68.0f, 92.0f, 412.0f, 500.0f, 588.0f, 676.0f}, 164 dst_tensor.data, eps)) 165 << "Failed using precision " << ToString(precision); 166 } 167 } 168 return absl::OkStatus(); 169} 170 171absl::Status ConvolutionSimdMatrixMultiplyTest(TestExecutionEnvironment* env) { 172 TensorFloat32 src_tensor; 173 src_tensor.shape = BHWC(1, 32, 32, 1024); 174 const BHWC dst_shape(1, 32, 32, 1024); 175 src_tensor.data.resize(src_tensor.shape.DimensionsProduct()); 176 for (int i = 0; i < src_tensor.data.size(); ++i) { 177 src_tensor.data[i] = sin(0.01f * i); 178 } 179 180 Convolution2DAttributes attr; 181 attr.padding.prepended = HW(0, 0); 
182 attr.padding.appended = HW(0, 0); 183 attr.strides = HW(1, 1); 184 attr.dilations = HW(1, 1); 185 attr.weights.shape = OHWI(dst_shape.c, 1, 1, src_tensor.shape.c); 186 attr.weights.data.resize(attr.weights.shape.DimensionsProduct()); 187 for (int i = 0; i < attr.weights.data.size(); ++i) { 188 attr.weights.data[i] = sin(0.1f * i); 189 } 190 attr.bias.shape = Linear(dst_shape.c); 191 attr.bias.data.resize(attr.bias.shape.DimensionsProduct()); 192 for (int i = 0; i < attr.bias.data.size(); ++i) { 193 attr.bias.data[i] = sin(0.1f * i); 194 } 195 196 TensorFloat32 dst_tensor_ref; 197 { 198 OperationDef op_def; 199 op_def.precision = CalculationsPrecision::F32; 200 auto data_type = DeduceDataTypeFromPrecision(op_def.precision); 201 op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::HWC}); 202 op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::HWC}); 203 ConvGeneric operation = CreateConvGeneric(env->GetGpuInfo(), op_def, attr, &dst_shape); 204 RETURN_IF_ERROR(env->ExecuteGPUOperation(src_tensor, 205 std::make_unique<ConvGeneric>(std::move(operation)), 206 dst_shape, &dst_tensor_ref)); 207 } 208 for (auto precision : env->GetSupportedPrecisions()) { 209 auto data_type = DeduceDataTypeFromPrecision(precision); 210 for (auto storage : env->GetSupportedStorages(data_type)) { 211 const float eps = precision == CalculationsPrecision::F32 ? 
4e-5f : 0.4f; 212 OperationDef op_def; 213 op_def.precision = precision; 214 op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); 215 op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); 216 if (!IsConvolutionMetalSimdSupported(env->GetGpuInfo(), op_def, attr)) { 217 continue; 218 } 219 TensorFloat32 dst_tensor_simd; 220 ConvolutionMetalSimd operation_simd = 221 CreateConvolutionMetalSimd(op_def, dst_shape, attr, env->GetGpuInfo()); 222 RETURN_IF_ERROR(env->ExecuteGPUOperation( 223 src_tensor, std::make_unique<ConvolutionMetalSimd>(std::move(operation_simd)), dst_shape, 224 &dst_tensor_simd)); 225 RETURN_IF_ERROR(PointWiseNear(dst_tensor_ref.data, dst_tensor_simd.data, eps)) 226 << "Failed using precision " << ToString(precision); 227 } 228 } 229 return absl::OkStatus(); 230} 231 232absl::Status ConvolutionSimdMatrixMultiplyBatchTest(TestExecutionEnvironment* env) { 233 TensorFloat32 src_tensor; 234 src_tensor.shape = BHWC(8, 8, 8, 128); 235 const BHWC dst_shape(8, 8, 8, 256); 236 src_tensor.data.resize(src_tensor.shape.DimensionsProduct()); 237 for (int i = 0; i < src_tensor.data.size(); ++i) { 238 src_tensor.data[i] = sin(0.01f * i); 239 } 240 241 Convolution2DAttributes attr; 242 attr.padding.prepended = HW(0, 0); 243 attr.padding.appended = HW(0, 0); 244 attr.strides = HW(1, 1); 245 attr.dilations = HW(1, 1); 246 attr.weights.shape = OHWI(dst_shape.c, 1, 1, src_tensor.shape.c); 247 attr.weights.data.resize(attr.weights.shape.DimensionsProduct()); 248 for (int i = 0; i < attr.weights.data.size(); ++i) { 249 attr.weights.data[i] = sin(0.1f * i); 250 } 251 attr.bias.shape = Linear(dst_shape.c); 252 attr.bias.data.resize(attr.bias.shape.DimensionsProduct()); 253 for (int i = 0; i < attr.bias.data.size(); ++i) { 254 attr.bias.data[i] = sin(0.1f * i); 255 } 256 257 TensorFloat32 dst_tensor_ref; 258 { 259 OperationDef op_def; 260 op_def.precision = CalculationsPrecision::F32; 261 auto data_type = DeduceDataTypeFromPrecision(op_def.precision); 
262 op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::BHWC}); 263 op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::BHWC}); 264 ConvGeneric operation = CreateConvGeneric(env->GetGpuInfo(), op_def, attr, &dst_shape); 265 RETURN_IF_ERROR(env->ExecuteGPUOperation(src_tensor, 266 std::make_unique<ConvGeneric>(std::move(operation)), 267 dst_shape, &dst_tensor_ref)); 268 } 269 for (auto precision : env->GetSupportedPrecisions()) { 270 auto data_type = DeduceDataTypeFromPrecision(precision); 271 for (auto storage : env->GetSupportedStorages(data_type)) { 272 const float eps = precision == CalculationsPrecision::F32 ? 8e-6f : 0.2f; 273 OperationDef op_def; 274 op_def.precision = precision; 275 op_def.src_tensors.push_back({data_type, storage, Layout::BHWC}); 276 op_def.dst_tensors.push_back({data_type, storage, Layout::BHWC}); 277 if (!IsConvolutionMetalSimdSupported(env->GetGpuInfo(), op_def, attr)) { 278 continue; 279 } 280 TensorFloat32 dst_tensor_simd; 281 ConvolutionMetalSimd operation_simd = 282 CreateConvolutionMetalSimd(op_def, dst_shape, attr, env->GetGpuInfo()); 283 RETURN_IF_ERROR(env->ExecuteGPUOperation( 284 src_tensor, std::make_unique<ConvolutionMetalSimd>(std::move(operation_simd)), dst_shape, 285 &dst_tensor_simd)); 286 RETURN_IF_ERROR(PointWiseNear(dst_tensor_ref.data, dst_tensor_simd.data, eps)) 287 << "Failed using precision " << ToString(precision); 288 } 289 } 290 return absl::OkStatus(); 291} 292 293absl::Status ConvolutionSimdMatrixMultiplyPerfTest() { 294 const BHWC src_shape(1, 32, 32, 1024); 295 const BHWC dst_shape(1, 32, 32, 1024); 296 Convolution2DAttributes attr; 297 attr.padding.prepended = HW(0, 0); 298 attr.padding.appended = HW(0, 0); 299 attr.strides = HW(1, 1); 300 attr.dilations = HW(1, 1); 301 attr.weights.shape = OHWI(dst_shape.c, 1, 1, src_shape.c); 302 attr.weights.data.resize(attr.weights.shape.DimensionsProduct()); 303 for (int i = 0; i < attr.weights.data.size(); ++i) { 304 
attr.weights.data[i] = sin(0.1f * i); 305 } 306 attr.bias.shape = Linear(dst_shape.c); 307 attr.bias.data.resize(attr.bias.shape.DimensionsProduct()); 308 for (int i = 0; i < attr.bias.data.size(); ++i) { 309 attr.bias.data[i] = sin(0.1f * i); 310 } 311 312 MetalDevice device; 313 OperationDef op_def; 314 op_def.precision = CalculationsPrecision::F32; 315 auto data_type = DeduceDataTypeFromPrecision(op_def.precision); 316 op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::HWC}); 317 op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER, Layout::HWC}); 318 ConvolutionMetalSimd operation_simd = 319 CreateConvolutionMetalSimd(op_def, dst_shape, attr, device.GetInfo()); 320 auto op_ptr = std::make_unique<ConvolutionMetalSimd>(std::move(operation_simd)); 321 322 MetalSpatialTensor src_gpu, dst_gpu; 323 TensorDescriptor descriptor_with_shape = op_def.src_tensors[0]; 324 descriptor_with_shape.SetBHWCShape(src_shape); 325 RETURN_IF_ERROR(CreateTensor(device.device(), descriptor_with_shape, &src_gpu)); 326 descriptor_with_shape = op_def.dst_tensors[0]; 327 descriptor_with_shape.SetBHWCShape(dst_shape); 328 RETURN_IF_ERROR(CreateTensor(device.device(), descriptor_with_shape, &dst_gpu)); 329 330 RETURN_IF_ERROR(op_ptr->AssembleCode(device.GetInfo())); 331 332 ComputeTask gpu_task; 333 gpu_task.Init(std::move(op_ptr)); 334 RETURN_IF_ERROR(gpu_task.Compile(&device)); 335 gpu_task.SetSrcTensor(&src_gpu, 0); 336 gpu_task.SetDstTensor(&dst_gpu, 0); 337 RETURN_IF_ERROR(gpu_task.UpdateParams()); 338 339 const double ops_count = 2.0 * dst_shape.w * dst_shape.h * dst_shape.c * attr.weights.shape.i; 340 const double gops_count = ops_count * 1e-9; 341 id<MTLCommandQueue> command_queue = [device.device() newCommandQueue]; 342 const int iterations = 10; 343 const int iteration_size = 100; 344 double alu_fp32_gflops_per_cu = 162.0; 345 if (device.GetInfo().apple_info.gpu_type == AppleGpu::kA15) { 346 alu_fp32_gflops_per_cu *= 2.0; 347 } 348 double 
alu_fp16_gflops_per_cu = 162.0 * 2.0; 349 double alu_gflops_per_gpu; 350 if (op_def.precision == CalculationsPrecision::F32) { 351 alu_gflops_per_gpu = 352 alu_fp32_gflops_per_cu * device.GetInfo().apple_info.GetComputeUnitsCount(); 353 } else { 354 alu_gflops_per_gpu = 355 alu_fp16_gflops_per_cu * device.GetInfo().apple_info.GetComputeUnitsCount(); 356 } 357 for (int i = 0; i < iterations; ++i) { 358 @autoreleasepool { 359 id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer]; 360 for (int j = 0; j < iteration_size; ++j) { 361 id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder]; 362 gpu_task.Encode(encoder); 363 [encoder endEncoding]; 364 } 365 const auto start = std::chrono::high_resolution_clock::now(); 366 [command_buffer commit]; 367 [command_buffer waitUntilCompleted]; 368 const auto end = std::chrono::high_resolution_clock::now(); 369 const std::chrono::duration<double> diff = end - start; 370 const double execution_time_ms = diff.count() / static_cast<double>(iteration_size) * 1000.0; 371 const double fps = 1000.0 / execution_time_ms; 372 const double pers = gops_count * fps / alu_gflops_per_gpu * 100.0; 373 std::cout << execution_time_ms << " ms, " << gops_count * fps << " GFLOPS(" << pers << "%)" 374 << std::endl; 375 } 376 } 377 378 return absl::OkStatus(); 379} 380 381} // namespace metal 382} // namespace gpu 383} // namespace tflite 384 385- (void)testWinograd4x4To6x6 { 386 auto status = tflite::gpu::metal::Winograd4x4To6x6Test(&exec_env_); 387 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 388} 389 390- (void)testGroupedConvolution { 391 auto status = tflite::gpu::metal::ConvolutionGroupedTest(&exec_env_); 392 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 393} 394 395- (void)testConvGeneric1x1SimpleWeights { 396 const auto status = ConvGeneric1x1SimpleWeightsTest(&exec_env_); 397 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 398} 
399 400- (void)testConvGeneric1x1 { 401 const auto status = ConvGeneric1x1Test(&exec_env_); 402 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 403} 404 405- (void)testConvGenericSimpleWeights { 406 const auto status = ConvGenericSimpleWeightsTest(&exec_env_); 407 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 408} 409 410- (void)testConvGeneric { 411 const auto status = ConvGenericTest(&exec_env_); 412 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 413} 414 415- (void)testConvGenericGrouped { 416 const auto status = ConvGenericGroupedTest(&exec_env_); 417 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 418} 419 420- (void)testConvConstantsSimpleWeights { 421 const auto status = ConvConstantsSimpleWeightsTest(&exec_env_); 422 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 423} 424 425- (void)testConvConstants { 426 const auto status = ConvConstantsTest(&exec_env_); 427 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 428} 429 430- (void)testConvSimdMatrixMultiply { 431 const auto status = tflite::gpu::metal::ConvolutionSimdMatrixMultiplyTest(&exec_env_); 432 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 433} 434 435- (void)testConvSimdMatrixMultiplyBatch { 436 const auto status = tflite::gpu::metal::ConvolutionSimdMatrixMultiplyBatchTest(&exec_env_); 437 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 438} 439 440- (void)testConvSimdMatrixMultiplyPerf { 441 const auto status = tflite::gpu::metal::ConvolutionSimdMatrixMultiplyPerfTest(); 442 XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); 443} 444 445@end 446