/*
 * Copyright 2023-2024 Arm Limited and/or its affiliates.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/*
 * Arm backend for the Ethos-U baremetal driver stack; this relies on the
 * ethos-u-core-driver for hardware interaction.
 */

#include <cstring>
#include <memory>

#include <ethosu_driver.h>

#include <executorch/backends/arm/runtime/VelaBinStream.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

using namespace std;

using executorch::aten::ScalarType;
using executorch::runtime::ArrayRef;
using executorch::runtime::Backend;
using executorch::runtime::BackendExecutionContext;
using executorch::runtime::BackendInitContext;
using executorch::runtime::CompileSpec;
using executorch::runtime::DelegateHandle;
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::FreeableBuffer;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::Result;

namespace executorch {
namespace backends {
namespace arm {

typedef struct {
  FreeableBuffer* processed;
  bool permuted_io_flag;
} ExecutionHandle;

extern "C" {
void __attribute__((weak)) ArmBackend_execute_begin() {}
void __attribute__((weak)) ArmBackend_execute_end() {}
}

class ArmBackendExecuteCallbacks {
 public:
  ArmBackendExecuteCallbacks() {
    ArmBackend_execute_begin();
  }
  ~ArmBackendExecuteCallbacks() {
    ArmBackend_execute_end();
  }
};

class ArmBackend final : public ::executorch::runtime::BackendInterface {
 public:
  ArmBackend() {}

  ~ArmBackend() = default;

  virtual bool is_available() const override {
    // TODO: revise to use a register check/init function
    return true;
  }

  Result<DelegateHandle*> init(
      BackendInitContext& context,
      FreeableBuffer* processed,
      ArrayRef<CompileSpec> compile_specs) const override {
    ET_LOG(Info, "ArmBackend::init %p", processed->data());

    char* data = (char*)processed->data();
    size_t size = processed->size();

    // Verify format of vela_bin
    if (vela_bin_validate(data, size) == false) {
      ET_LOG(Error, "Malformed vela_bin_stream found");
      return Error::InvalidProgram;
    }

    MemoryAllocator* allocator = context.get_runtime_allocator();
    ExecutionHandle* handle =
        ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(allocator, ExecutionHandle);
    handle->processed = processed;

    handle->permuted_io_flag = false;
    for (auto& compile_spec : compile_specs) {
      if (0 == std::strcmp(compile_spec.key, "permute_memory_format") &&
          0 == std::memcmp(compile_spec.value.buffer, "nhwc", 4)) {
        handle->permuted_io_flag = true;
      }
    }

    // Return the same buffer we were passed - this data will be
    // executed directly
    return handle;
  }

  Error execute(
      BackendExecutionContext& context,
      DelegateHandle* input_handle,
      EValue** args) const override {
    ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle;
    VelaHandles handles;

    ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;

    // Command stream - we know at this point it's aligned
    char* data = (char*)execution_handle->processed->data();
    ET_LOG(Debug, "ArmBackend::execute %p", data);

    // Read key sections from the vela_bin_stream
    if (vela_bin_read(data, &handles, execution_handle->processed->size()) ==
        false) {
      ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
      return Error::InvalidProgram;
    }

    ET_LOG(
        Debug,
        "ArmBackend::execute: Running program data:\n cmd %p %zu\n weight %p %zu\n scratch %p %zu\n",
        handles.cmd_data,
        handles.cmd_data_size,
        handles.weight_data,
        handles.weight_data_size,
        handles.scratch_data,
        handles.scratch_data_size);

    // Write argument values (from EValue tensors) into the Ethos-U scratch
    // TODO(MLETORCH-123): Optimise into a direct write from Vela into the SRAM
    // or DRAM output for compatible data layouts.
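    // Illustrative sketch of the scratch layout assumed by the copy loop
    // below. The offsets come from the Vela binary header; the entries here
    // are examples, not taken from any real binary:
    //
    //   scratch_data + inputs->io[0].offset   -> input 0 payload
    //   scratch_data + inputs->io[1].offset   -> input 1 payload
    //   ...
    //   scratch_data + outputs->io[0].offset  -> output 0 payload
    //                                            (written by the NPU)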
    for (int i = 0; i < handles.inputs->count; i++) {
      auto tensor_in = args[i]->toTensor();
      char* scratch_addr = handles.scratch_data + handles.inputs->io[i].offset;

      // We accept:
      bool supported = false;
      // 32 bit int (simple non-quantised test cases)
      supported |=
          (tensor_in.scalar_type() == ScalarType::Int and
           handles.inputs->io[i].elem_size == 4);
      // 8 bit int (IOQDQ pass prepared networks)
      supported |=
          (tensor_in.scalar_type() == ScalarType::Char and
           handles.inputs->io[i].elem_size == 1);
      if (!supported) {
        ET_LOG(
            Error,
            "Input %d expected Int (4 byte) or Char (1 byte) input tensor, got ScalarType %s",
            i,
            executorch::runtime::toString(tensor_in.scalar_type()));
        return Error::InvalidProgram;
      }

      supported = executorch::runtime::is_contiguous_dim_order(
          tensor_in.dim_order().data(), tensor_in.dim());
      if (!supported) {
        ET_LOG(
            Error,
            "Input %d expected contiguous dim_order, but got non-contiguous dim_order",
            i);
        return Error::InvalidProgram;
      }

      // Select a compatible copy routine, including checking for input
      // layouts which require permutation.
      bool permuted_input_shape;
      ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute(
          i,
          tensor_in,
          &handles.inputs->io[i],
          execution_handle->permuted_io_flag,
          &permuted_input_shape));
      bool both_char = tensor_in.scalar_type() == ScalarType::Char and
          handles.inputs->io[i].elem_size == 1;
      bool both_int = tensor_in.scalar_type() == ScalarType::Int and
          handles.inputs->io[i].elem_size == 4;

      // Select a compatible copy routine
      if (both_char and permuted_input_shape) {
        // Permuted byte copy CHW to HWC
        permute_CHW_to_HWC(
            tensor_in.mutable_data_ptr<char>(),
            scratch_addr,
            tensor_in.size(1),
            tensor_in.size(2),
            tensor_in.size(3));
      } else if (both_char or both_int) {
        // Sizes and element sizes match, so a straight memcpy suffices
        memcpy(
            scratch_addr,
            tensor_in.mutable_data_ptr<char>(),
            tensor_in.nbytes());
      } else {
        ET_LOG(Error, "No matching input copy routine");
        return Error::InvalidProgram;
      }
    }

    // Allocate driver handle and synchronously invoke driver
    auto driver =
        std::unique_ptr<ethosu_driver, decltype(&ethosu_release_driver)>(
            ethosu_reserve_driver(), ethosu_release_driver);
    if (driver == nullptr) {
      ET_LOG(Error, "ArmBackend::execute: ethosu_reserve_driver failed");
      return Error::InvalidState;
    }

    // The Ethos-U low-level driver expects a fixed order for the Ethos-U55:
    // constant weight data first, then scratch (which contains the input and
    // output data). The scratch area is written above in this function.
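    // The two base pointers passed to ethosu_invoke_v3 below back the memory
    // regions referenced by the command stream. As a sketch (the exact region
    // assignment is fixed when Vela compiles the network, so treat this as an
    // assumption rather than driver documentation):
    //   bases[0] -> weight/constant data, read-only to the NPU
    //   bases[1] -> scratch, read-write (inputs, intermediates, outputs)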
    uint64_t bases[2] = {
        (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data};
    size_t bases_size[2] = {
        handles.weight_data_size, handles.scratch_data_size};
    int result = ethosu_invoke_v3(
        driver.get(),
        (void*)handles.cmd_data,
        handles.cmd_data_size,
        bases,
        bases_size,
        2, /* fixed array of pointers to binary interface */
        nullptr);

    if (result != 0) {
      ET_LOG(
          Error,
          "ArmBackend::execute: Ethos-U invocation failed error (%d)",
          result);
      return Error::InvalidProgram;
    }

    // Write outputs from scratch into the EValue pointers
    for (int i = 0; i < handles.outputs->count; i++) {
      const char* output_addr =
          handles.scratch_data + handles.outputs->io[i].offset;
      // Outputs are in the index immediately after inputs
      auto tensor_out = args[handles.inputs->count + i]->toTensor();
      bool permuted_output_shape;
      ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute(
          i,
          tensor_out,
          &handles.outputs->io[i],
          execution_handle->permuted_io_flag,
          &permuted_output_shape));
      if (tensor_out.scalar_type() == ScalarType::Char and
          permuted_output_shape) {
        // Permuted byte copy HWC to CHW
        char* output_address = (char*)output_addr;
        permute_HWC_to_CHW(
            output_address,
            tensor_out.mutable_data_ptr<char>(),
            tensor_out.size(1),
            tensor_out.size(2),
            tensor_out.size(3));
      } else {
        // Element-wise copy for non-permuted outputs
        for (int j = 0; j < tensor_out.numel(); j++) {
          if (tensor_out.scalar_type() == ScalarType::Char) {
            char* output_address = (char*)output_addr;
            tensor_out.mutable_data_ptr<char>()[j] = output_address[j];
          } else {
            int* output_address = (int*)output_addr;
            tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
          }
        }
      }
    }

    return Error::Ok;
  }

  void destroy(DelegateHandle* handle) const override {
    return;
  }

 private:
  Error check_requires_permute(
      int index,
      const executorch::aten::Tensor tensor,
      VelaIO* io,
      bool permuted_io_flag,
      bool* is_permuted) const {
    bool permuted_shape = false;
    if (tensor.dim() == 4) {
      // Special case for the NHWC workaround in AOT: as the compilation has
      // permuted to channels-last in an undetectable way, we assume here that
      // the application has similarly permuted any input/output tensors.
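      // The shape comparison below encodes the NCHW -> NHWC mapping: an
      // application tensor of shape (N, C, H, W) matches a Vela IO shape of
      // (N, H, W, C), i.e. dim 1 (C) lines up with io->shape[3], and dims
      // 2/3 (H/W) with io->shape[1]/io->shape[2].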
      permuted_shape = tensor.size(0) == io->shape[0] &&
          tensor.size(1) == io->shape[3] && tensor.size(2) == io->shape[1] &&
          tensor.size(3) == io->shape[2];
      if (permuted_shape) {
        ET_LOG(Debug, "Tensor input/output %d will be permuted", index);
      }
      if (permuted_io_flag != permuted_shape) {
        ET_LOG(
            Error,
            "Permute compile flag and permuted input/output don't agree");
        return Error::InvalidProgram;
      }
    }
    if (!permuted_shape) {
      // Check that the number of elements in each tensor matches
      int tensor_count = 1;
      int io_count = 1;
      for (int i = 0; i < tensor.dim(); i++) {
        tensor_count = tensor_count * tensor.size(i);
      }
      // The VelaIO type has a shape of fixed size 4
      for (int i = 0; i < 4; i++) {
        io_count = io_count * io->shape[i];
      }
      if (tensor_count != io_count) {
        ET_LOG(Error, "Input tensor sizes do not match");
        ET_LOG(
            Error,
            "Program expects %d elements but got %d",
            io_count,
            tensor_count);
        return Error::InvalidProgram;
      }
    }
    *is_permuted = permuted_shape;
    return Error::Ok;
  }

  void permute_CHW_to_HWC(char* input, char* output, int C, int H, int W)
      const {
    for (int i = 0; i != H * W; ++i) {
      for (int j = 0; j < C; ++j) {
        output[i * C + j] = input[i + j * W * H];
      }
    }
  }

  void permute_HWC_to_CHW(char* input, char* output, int C, int H, int W)
      const {
    for (int i = 0; i != H * W; ++i) {
      for (int j = 0; j < C; ++j) {
        output[i + j * W * H] = input[i * C + j];
      }
    }
  }
};

namespace {
// Static registration: constructing this object at namespace scope registers
// the backend with the runtime before main() runs.
auto backend = ArmBackend();
Backend backend_id{"ArmBackend", &backend};
static auto registered = register_backend(backend_id);
} // namespace

} // namespace arm
} // namespace backends
} // namespace executorch
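// A minimal sketch of how an application can override the weak
// ArmBackend_execute_begin/ArmBackend_execute_end hooks defined above, e.g.
// to time NPU execution. Hypothetical application code, not part of this
// file:
//
//   extern "C" void ArmBackend_execute_begin() {
//     // e.g. start a cycle counter before the Ethos-U run
//   }
//   extern "C" void ArmBackend_execute_end() {
//     // e.g. stop the counter and report the delegate's execution time
//   }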