/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
#include <executorch/backends/xnnpack/runtime/XNNHeader.h>
#include <executorch/backends/xnnpack/serialization/schema_generated.h>
#include <executorch/extension/threadpool/threadpool.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <cstring>
#include <memory>
#include <unordered_map>
#include <vector>

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wglobal-constructors"

namespace executorch {
namespace backends {
namespace xnnpack {
namespace delegate {

using executorch::runtime::Error;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::Result;

/*
 * Provides allocations that live for the duration of the compilation
 * process.
 */
class CompileAllocator {
 public:
  /*
   * Allocate memory which will be automatically freed at the end
   * of the compilation process.
   */
  void* allocateTemporary(size_t size) {
    auto mem = new uint8_t[size];
    temporaries_.emplace_back(mem);
    return mem;
  }

 private:
  std::vector<std::unique_ptr<uint8_t[]>> temporaries_;
};

// Flatbuffer types
using ValuePtr = const fb_xnnpack::XValue*;
using NodePtr = const fb_xnnpack::XNode*;
using GraphPtr = const fb_xnnpack::XNNGraph*;
using DataType = fb_xnnpack::XNNDatatype;

// Type for a define-node function. This is the function signature for any
// function that takes in a flatbuffer node and defines it into our
// xnn_subgraph.
using DefineNodeFunc = Error (*)(
    xnn_subgraph_t,
    const std::unordered_map<uint32_t, uint32_t>&,
    NodePtr,
    const fb_xnnpack::XNNGraph*) noexcept;

/*
Convert a tensor from fp32 to bf16.
*/
void convertF32TensorToBF16(
    const float* f32_data,
    uint16_t* bf16_data_out,
    size_t numel) {
  for (auto i = 0u; i < numel; i++) {
    // Adjust the f32 value such that it rounds properly after truncation.
    // The constant factor scales 1+2^-8 to 1+2^-7.
    float f32_adjusted = f32_data[i] * 1.00389105f;
    uint32_t f32_bits;
    memcpy(&f32_bits, &f32_adjusted, sizeof(float));
    bf16_data_out[i] = static_cast<uint16_t>(f32_bits >> 16);
  }
}
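// Worked example of the truncation above (illustrative, not from the
// original source): 1.75f has bit pattern 0x3FE00000; shifting off the low
// 16 bits leaves 0x3FE0, which is exactly 1.75 as bf16. The 1.00389105f
// pre-scale nudges values sitting just below a truncation boundary upward,
// so plain truncation behaves approximately like round-to-nearest.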
/*
Gets the output min and output max for a given node operator.
*/
std::pair<float, float> getOutputMinMax(const NodePtr node) noexcept {
  float output_min = -std::numeric_limits<float>::infinity();
  float output_max = std::numeric_limits<float>::infinity();
  auto output_min_max = node->output_min_max();
  if (output_min_max != nullptr) {
    output_min = output_min_max->output_min();
    output_max = output_min_max->output_max();
  }
  return {output_min, output_max};
}

/*
Converts the flatbuffer xnn data type to the XNNPACK data type.
*/
xnn_datatype getDataType(const DataType& data_type) {
  switch (data_type) {
    case DataType::xnn_datatype_fp32:
      return xnn_datatype::xnn_datatype_fp32;
    case DataType::xnn_datatype_fp16:
      return xnn_datatype::xnn_datatype_fp16;
    case DataType::xnn_datatype_qint8:
      return xnn_datatype::xnn_datatype_qint8;
    case DataType::xnn_datatype_quint8:
      return xnn_datatype::xnn_datatype_quint8;
    case DataType::xnn_datatype_qint32:
      return xnn_datatype::xnn_datatype_qint32;
    case DataType::xnn_datatype_qcint8:
      return xnn_datatype::xnn_datatype_qcint8;
    case DataType::xnn_datatype_qcint32:
      return xnn_datatype::xnn_datatype_qcint32;
    case DataType::xnn_datatype_qcint4:
      return xnn_datatype::xnn_datatype_qcint4;
    case DataType::xnn_datatype_qdint8:
      return xnn_datatype::xnn_datatype_qdint8;
    case DataType::xnn_datatype_qbint4:
      return xnn_datatype::xnn_datatype_qbint4;
    default:
      return xnn_datatype::xnn_datatype_invalid;
  }
}

bool isQuantizedDataType(const xnn_datatype data_type) {
  switch (data_type) {
    case xnn_datatype::xnn_datatype_qint8:
    case xnn_datatype::xnn_datatype_quint8:
    case xnn_datatype::xnn_datatype_qint32:
    case xnn_datatype::xnn_datatype_qcint8:
    case xnn_datatype::xnn_datatype_qcint32:
    case xnn_datatype::xnn_datatype_qcint4:
    case xnn_datatype::xnn_datatype_qdint8:
      return true;
    default:
      return false;
  }
}

/**
Converts dims from uint32 to size_t. Takes in a flatbuffer vector of uint32_t
and returns a std::vector of size_t. XNNPACK takes in dims as size_t*, but
tensor shapes are serialized in the flatbuffer as uint32_t. As a result, we
need to static_cast the shapes to size_t.
*/
template <typename T>
std::vector<size_t> flatbufferDimsToVector(
    const flatbuffers::Vector<T>* fb_dims) {
  std::vector<size_t> dims_data;
  dims_data.reserve(fb_dims->size());
  for (auto fb_dim : *fb_dims) {
    dims_data.push_back(static_cast<size_t>(fb_dim));
  }
  return dims_data;
}

/**
Gets the constant data pointer associated with the given tensor value.
The constant data pointer can come either from within the flatbuffer payload
(deprecated) or via an offset into constant_data_ptr. If there is no constant
data associated with the tensor value, this returns nullptr.
*/
const uint8_t* getConstantDataPtr(
    const fb_xnnpack::XNNTensorValue* tensor_value,
    GraphPtr flatbuffer_graph,
    const uint8_t* constant_data_ptr) {
  auto buffer_idx = tensor_value->constant_buffer_idx();
  if (buffer_idx) {
    if (!constant_data_ptr) {
      // TODO(T172265611): Remove constant_buffer in flatbuffer path after BC
      // window
      const auto& constant_buffer = *flatbuffer_graph->constant_buffer();
      return constant_buffer[buffer_idx]->storage()->data();
    } else {
      const auto& constant_data_offsets = *flatbuffer_graph->constant_data();
      uint64_t constant_data_offset =
          constant_data_offsets[buffer_idx]->offset();
      return constant_data_ptr + constant_data_offset;
    }
  }
  return nullptr;
}
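// Illustrative layout of the serialized payload that the two branches above
// assume (see XNNHeader parsing in XNNCompiler::compileModel below). When a
// header is present, constant data lives outside the flatbuffer and tensors
// reference it by offset:
//
//   +-----------+----------------------+------------------------+
//   | XNNHeader | flatbuffer (XN01)    | constant data segment  |
//   +-----------+----------------------+------------------------+
//               ^ flatbuffer_offset    ^ constant_data_offset
//
// Without a header (legacy path), tensors instead index into the
// flatbuffer's own constant_buffer table.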
/*
Defines a serialized tensor value into the subgraph, while keeping track of
the mapping from the serialized id to the newly generated id.
*/
Error defineTensor(
    xnn_subgraph_t subgraph_ptr,
    std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    ValuePtr value,
    GraphPtr flatbuffer_graph,
    const uint8_t* constant_data_ptr,
    std::vector<uint32_t>& input_ids,
    std::vector<uint32_t>& output_ids,
    CompileAllocator& allocator) {
  const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
  const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;

  switch (value->xvalue_union_type()) {
    case fb_xnnpack::XValueUnion::XNNTensorValue: {
      tensor_value = value->xvalue_union_as_XNNTensorValue();
      break;
    }
    case fb_xnnpack::XValueUnion::XNNQuantizedTensorValue: {
      qtensor_value = value->xvalue_union_as_XNNQuantizedTensorValue();
      tensor_value = qtensor_value->tensor_value();
      break;
    }
    default: {
      ET_CHECK_OR_RETURN_ERROR(
          false,
          NotImplemented,
          "Unhandled value type: %s",
          fb_xnnpack::EnumNameXValueUnion(value->xvalue_union_type()));
    }
  }

  ET_CHECK_OR_RETURN_ERROR(
      tensor_value != nullptr,
      Internal,
      "Deserialized tensor is null; this should never happen");

  // Get tensor dims; we need to use a vector in order to properly convert
  // the uint32_t* to size_t*.
  std::vector<size_t> dims_data = flatbufferDimsToVector(tensor_value->dims());

  // XNNPACK id
  uint32_t id = XNN_INVALID_VALUE_ID;

  // Pointer to constant data from the flatbuffer; nullptr if the tensor is
  // non-constant.
  const uint8_t* buffer_ptr =
      getConstantDataPtr(tensor_value, flatbuffer_graph, constant_data_ptr);

  xnn_status status;
  // The type we might have to convert to
  auto dq_datatype = getDataType(tensor_value->dq_datatype());

  if (dq_datatype != xnn_datatype::xnn_datatype_invalid) {
    if (dq_datatype != xnn_datatype::xnn_datatype_qint8) {
      ET_CHECK_OR_RETURN_ERROR(
          false,
          Internal,
          "Only int8_t is supported for dq_datatype for now, got: %d",
          dq_datatype);
    } else {
      ET_CHECK_OR_RETURN_ERROR(
          (tensor_value->flags() & XNN_VALUE_FLAG_EXTERNAL_INPUT),
          Internal,
          "Dynamic quantization of a tensor is only allowed for external input tensor values for now! Got flags: %u",
          tensor_value->flags());
    }
  }

  if (qtensor_value == nullptr) { // FP32 tensor
    if (!isQuantizedDataType(dq_datatype)) {
      // Define a non-quantized tensor.
      status = xnn_define_tensor_value(
          /*subgraph=*/subgraph_ptr,
          /*datatype=*/getDataType(tensor_value->datatype()),
          /*num_dims=*/tensor_value->num_dims(),
          /*dims=*/dims_data.data(),
          /*data=*/buffer_ptr,
          /*external_id=*/tensor_value->external_id(),
          /*flags=*/tensor_value->flags(),
          /*id_out=*/&id);
    } else if (dq_datatype != xnn_datatype::xnn_datatype_invalid) {
      ET_CHECK_OR_RETURN_ERROR(
          isQuantizedDataType(dq_datatype),
          Internal,
          "Dynamic quantization can only produce supported quantized dtypes");
      ET_CHECK_OR_RETURN_ERROR(
          tensor_value->external_id() != XNN_INVALID_VALUE_ID,
          Internal,
          "Dynamic quantization can only work with external inputs for now, got an internal ID");
      ET_CHECK_OR_RETURN_ERROR(
          buffer_ptr == nullptr,
          Internal,
          "Dynamic quantization can only work with external inputs for now, got const data");
      switch (dq_datatype) {
        case xnn_datatype::xnn_datatype_qint8: {
          // HACK to maintain FC/BC for ASR; this will be removed after
          // 01/2024. When encountering a dynamically quantized tensor via
          // dq_datatype (the old flow for serializing dynamically quantized
          // linear), we replace the definition of a single tensor with a new
          // dynamic quantization pattern. We change the pattern from:
          //     serialized_qd_input
          // to
          //     (fp32_input --> convert --> qdint8_input)
          status = xnn_define_dynamically_quantized_tensor_value(
              /*subgraph=*/subgraph_ptr,
              /*datatype=*/xnn_datatype_qdint8,
              /*num_dims=*/tensor_value->num_dims(),
              /*num_nonbatch_dims=*/1, // always do per-token quantization
              /*dims=*/dims_data.data(),
              /*external_id=*/XNN_INVALID_VALUE_ID, // always an internal value id
              /*flags=*/0, // this is neither an external input nor an output
              /*id_out=*/&id);

          // This is the FP16 or FP32 external value that is being dynamically
          // quantized.
          uint32_t float_id;
          enum xnn_datatype fp_datatype = getDataType(tensor_value->datatype());
          status = xnn_define_tensor_value(
              /*subgraph=*/subgraph_ptr,
              /*datatype=*/fp_datatype,
              /*num_dims=*/tensor_value->num_dims(),
              /*dims=*/dims_data.data(),
              /*data=*/buffer_ptr,
              /*external_id=*/tensor_value->external_id(),
              /*flags=*/tensor_value->flags(),
              /*id_out=*/&float_id);

          // Define the dynamic conversion from float to qdint8.
          status = xnn_define_convert(
              /*subgraph=*/subgraph_ptr,
              /*input_id=*/float_id,
              /*output_id=*/id,
              /*flags=*/0);
          break;
        }
        default:
          ET_CHECK_OR_RETURN_ERROR(
              false,
              NotImplemented,
              "Unhandled dynamic quantization dtype: %d",
              dq_datatype);
      }
    } else {
      ET_CHECK_OR_RETURN_ERROR(false, NotImplemented, "Unhandled fp32 tensor");
    }
  } else { // Define a quantized tensor.
    switch (qtensor_value->quant_params_type()) {
      case fb_xnnpack::XNNQuantParams::PerTensorQuant: {
        auto qparams = qtensor_value->quant_params_as_PerTensorQuant();
        ET_LOG(
            Debug,
            "define quant tensor (per tensor): buffer_ptr: %p, scale: %f, zp: %u\n",
            buffer_ptr,
            qparams->scale(),
            qparams->zero_point());
        status = xnn_define_quantized_tensor_value(
            /*subgraph=*/subgraph_ptr,
            /*datatype=*/getDataType(tensor_value->datatype()),
            /*zero_point=*/qparams->zero_point(),
            /*scale=*/qparams->scale(),
            /*num_dims=*/tensor_value->num_dims(),
            /*dims=*/dims_data.data(),
            /*data=*/buffer_ptr,
            /*external_id=*/tensor_value->external_id(),
            /*flags=*/tensor_value->flags(),
            /*id_out=*/&id);
        break;
      }
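      // Editorial note on the zero-point choice below: qcint4 values are
      // presumably stored as unsigned 4-bit integers in [0, 15], so a zero
      // point of 8 recenters them onto the signed range [-8, 7]; the other
      // channelwise dtypes keep a zero point of 0.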
      case fb_xnnpack::XNNQuantParams::PerChannelQuant: {
        auto qparams = qtensor_value->quant_params_as_PerChannelQuant();
        enum xnn_datatype dtype = getDataType(tensor_value->datatype());
        int32_t zero_point =
            (dtype == xnn_datatype::xnn_datatype_qcint4 ? 8 : 0);
        ET_LOG(
            Debug,
            "define quant tensor (per channel): buffer_ptr: %p, scale.numel(): %u, channel_dim: %u, dtype: %u, zero_point: %d\n",
            buffer_ptr,
            qparams->scale()->size(),
            qparams->channel_dim(),
            dtype,
            zero_point);
        status = xnn_define_channelwise_quantized_tensor_value_v2(
            /*subgraph=*/subgraph_ptr,
            /*datatype=*/dtype,
            /*zero_point=*/zero_point,
            /*scale=*/qparams->scale()->data(),
            /*num_dims=*/tensor_value->num_dims(),
            /*channel_dim=*/qparams->channel_dim(),
            /*dims=*/dims_data.data(),
            /*data=*/buffer_ptr,
            /*external_id=*/tensor_value->external_id(),
            /*flags=*/tensor_value->flags(),
            /*id_out=*/&id);
        break;
      }
      case fb_xnnpack::XNNQuantParams::PerChannelGroupQuant: {
        xnn_datatype datatype = getDataType(tensor_value->datatype());
        ET_CHECK_OR_RETURN_ERROR(
            datatype == xnn_datatype::xnn_datatype_qbint4,
            Internal,
            "Unsupported datatype for per channel group quantization: %d",
            datatype);
        auto qparams = qtensor_value->quant_params_as_PerChannelGroupQuant();
        size_t group_size = qparams->group_size();
        size_t output_channels = tensor_value->dims()->Get(0);
        size_t input_channels = tensor_value->dims()->Get(1);

        const uint16_t* scale_data = nullptr;
        uint32_t scale_numel = 0;

        // Block scales are preferably serialized as bf16, but they can also
        // be serialized as fp32 for backwards compatibility.
        if (qparams->scale_bf16() != nullptr) {
          scale_data =
              static_cast<const uint16_t*>(qparams->scale_bf16()->data());
          scale_numel = qparams->scale_bf16()->size();
        } else {
          // Read fp32 scales and convert them to bf16.
          auto conv_buffer = static_cast<uint16_t*>(allocator.allocateTemporary(
              qparams->scale()->size() * sizeof(uint16_t)));
          scale_numel = qparams->scale()->size();
          convertF32TensorToBF16(
              qparams->scale()->data(), conv_buffer, scale_numel);
          scale_data = conv_buffer;
        }

        ET_CHECK_OR_RETURN_ERROR(
            scale_numel == output_channels * input_channels / group_size,
            Internal,
            "scale size %zu != output channels %zu * input channels %zu / group size %zu",
            static_cast<size_t>(scale_numel),
            output_channels,
            input_channels,
            group_size);
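        // Worked example for the check above (illustrative): a [32 x 64]
        // qbint4 weight with group_size = 32 carries one scale per group of
        // 32 input channels per output channel, i.e. 32 * 64 / 32 = 64
        // scales in total.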
        int32_t zero_point =
            (datatype == xnn_datatype::xnn_datatype_qbint4 ? 8 : 0);
        ET_LOG(
            Debug,
            "define quant tensor (per channel group): buffer_ptr: %p, scale.numel(): %u, channel_dim: %u, group_size: %zu, output_channels: %zu, dtype: %d, zero_point: %d\n",
            buffer_ptr,
            scale_numel,
            qparams->channel_dim(),
            group_size,
            output_channels,
            datatype,
            zero_point);
        status = xnn_define_blockwise_quantized_tensor_value(
            /*subgraph=*/subgraph_ptr,
            /*datatype=*/datatype,
            /*zero_point=*/zero_point,
            /*scale=*/scale_data,
            /*num_dims=*/tensor_value->num_dims(),
            /*channel_dim=*/qparams->channel_dim(),
            /*block_size=*/qparams->group_size(),
            /*dims=*/dims_data.data(),
            /*data=*/buffer_ptr,
            /*external_id=*/tensor_value->external_id(),
            /*flags=*/tensor_value->flags(),
            /*id_out=*/&id);
        break;
      }
      case fb_xnnpack::XNNQuantParams::PerTokenDynamicQuant: {
        auto qparams = qtensor_value->quant_params_as_PerTokenDynamicQuant();
        ET_LOG(
            Debug,
            "define quant tensor (dynamic): num_dims: %i, num_nonbatch_dims: %i\n",
            tensor_value->num_dims(),
            qparams->num_nonbatch_dims());
        ET_CHECK_OR_RETURN_ERROR(
            buffer_ptr == nullptr,
            Internal,
            "Dynamically quantized tensor should not have constant data but found non-nullptr");
        // TODO(T179441835): Dynamic Quantization with num_nonbatch_dims > 1
        ET_CHECK_OR_RETURN_ERROR(
            qparams->num_nonbatch_dims() == 1,
            Internal,
            "Dynamically quantized tensors currently only support per-token quantization");
        status = xnn_define_dynamically_quantized_tensor_value(
            /*subgraph=*/subgraph_ptr,
            /*datatype=*/getDataType(tensor_value->datatype()),
            /*num_dims=*/tensor_value->num_dims(),
            /*num_nonbatch_dims=*/qparams->num_nonbatch_dims(),
            /*dims=*/dims_data.data(),
            /*external_id=*/tensor_value->external_id(),
            /*flags=*/tensor_value->flags(),
            /*id_out=*/&id);
        break;
      }
      default: {
        ET_CHECK_OR_RETURN_ERROR(
            false,
            NotImplemented,
            "Unhandled quantization parameters: %s",
            fb_xnnpack::EnumNameXNNQuantParams(
                qtensor_value->quant_params_type()));
      }
    }
  }

  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to define tensor %i with code: %s",
      tensor_value->id_out(),
      xnn_status_to_string(status));

  // Map the serialized id to the newly generated id.
  remapped_ids.emplace(std::make_pair(tensor_value->id_out(), id));

  // Add external ids to either the list of input or output ids.
  if (tensor_value->flags() & XNN_VALUE_FLAG_EXTERNAL_INPUT) {
    input_ids.push_back(tensor_value->external_id());
  }
  if (tensor_value->flags() & XNN_VALUE_FLAG_EXTERNAL_OUTPUT) {
    output_ids.push_back(tensor_value->external_id());
  }

  return Error::Ok;
}
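// Illustrative effect of the remapping performed in defineTensor: if a
// tensor was serialized with id_out == 3 and xnn_define_tensor_value hands
// back id 0, remapped_ids records 3 -> 0, and every define*Node function
// below resolves its serialized ids through remapped_ids.at(...) before
// passing them to XNNPACK.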
#define MAYBE_UNUSED(x) (void)(x)

/*
Define serialized add node into the subgraph, using the remapped ids to map
the serialized ids to the new ids generated when defining the tensor values.
*/
Error defineAddNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  std::pair<float, float> min_max = getOutputMinMax(node);
  auto graph_node = node->xnode_union_as_XNNAdd();
  xnn_status status = xnn_define_add2(
      subgraph_ptr,
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create add node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define minimum operator node into the subgraph.
*/
Error defineMinimumNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNMinimum();
  xnn_status status = xnn_define_minimum2(
      subgraph_ptr,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create minimum node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define subtract operator node into the subgraph.
*/
Error defineSubtractNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNSubtract();
  std::pair<float, float> min_max = getOutputMinMax(node);
  xnn_status status = xnn_define_subtract(
      subgraph_ptr,
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create subtract node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define multiply operator node into the subgraph.
*/
Error defineMultiplyNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNMultiply();
  std::pair<float, float> min_max = getOutputMinMax(node);
  xnn_status status = xnn_define_multiply2(
      subgraph_ptr,
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create multiply node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}
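// The helper below (compiled only under ENABLE_XNNPACK_KLEIDI) scans the
// serialized graph for the pattern
//
//   fp32 input --> convert --> qdint8 --> fully_connected (qbint4 filter)
//
// and reports whether the convert node's output may be packed with QP8.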
#ifdef ENABLE_XNNPACK_KLEIDI
bool isQP8(const fb_xnnpack::XNNGraph* graph, const NodePtr node) {
  assert(node->xnode_union_type() == fb_xnnpack::XNodeUnion::XNNConvert);
  auto graph_node = node->xnode_union_as_XNNConvert();
  auto cvt_output_id = graph_node->output_id();

  auto check_dtype = [graph](uint32_t id, DataType dtype) -> bool {
    assert(
        dtype == DataType::xnn_datatype_qdint8 ||
        dtype == DataType::xnn_datatype_qbint4);
    for (auto value : *graph->xvalues()) {
      if (value->xvalue_union_type() !=
          fb_xnnpack::XValueUnion::XNNQuantizedTensorValue) {
        continue;
      }
      auto tensor =
          value->xvalue_union_as_XNNQuantizedTensorValue()->tensor_value();
      if (tensor->id_out() == id) {
        return tensor->datatype() == dtype;
      }
    }
    return false;
  };

  // Check that the convert output tensor is qdint8, else bail early.
  if (!check_dtype(cvt_output_id, DataType::xnn_datatype_qdint8)) {
    return false;
  }

  // Find out whether the convert output feeds the right linear node.
  // Assuming that if we can find one valid linear node, then we can use QP8
  // for all the linear nodes consuming this convert output.
  for (auto node : *graph->xnodes()) {
    if (node->xnode_union_type() ==
        fb_xnnpack::XNodeUnion::XNNFullyConnected) {
      auto linear_node = node->xnode_union_as_XNNFullyConnected();
      if (linear_node->input1_id() == cvt_output_id) {
        if (check_dtype(
                linear_node->filter_id(), DataType::xnn_datatype_qbint4)) {
          return true;
        }
      }
    }
  }
  return false;
}
#endif // ENABLE_XNNPACK_KLEIDI

/*
Define convert operator node into the subgraph.
*/
Error defineConvertNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* flatbuffer_graph) noexcept {
  MAYBE_UNUSED(flatbuffer_graph);
  auto graph_node = node->xnode_union_as_XNNConvert();

  int32_t flags = graph_node->flags();
#ifdef ENABLE_XNNPACK_KLEIDI
// This flag is not currently exposed in include/xnnpack.h; once it is, we
// can remove this runtime logic and set it ahead-of-time.
#define XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM 0x00000100
  if (isQP8(flatbuffer_graph, node)) {
    flags |= XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM;
    ET_LOG(
        Debug,
        "Setting XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM flag for convert node %i",
        node->debug_handle());
  }
#endif

  xnn_status status = xnn_define_convert(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      flags);
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create convert node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized linear (fully-connected) node into the subgraph, using the
remapped ids to map the serialized ids to the new ids generated when defining
the tensor values.
*/
Error defineFullyConnectedNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNFullyConnected();
  std::pair<float, float> min_max = getOutputMinMax(node);
  xnn_status status = xnn_define_fully_connected(
      subgraph_ptr,
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->filter_id()),
      remapped_ids.at(graph_node->bias_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create linear node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized clamp node into the subgraph, using the remapped ids to map
the serialized ids to the new ids generated when defining the tensor values.
*/
Error defineClampNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  std::pair<float, float> min_max = getOutputMinMax(node);
  auto graph_node = node->xnode_union_as_XNNClamp();
  xnn_status status = xnn_define_clamp(
      subgraph_ptr,
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create clamp (hardtanh) node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}
/*
Define serialized softmax node into the subgraph, using the remapped ids to
map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineSoftmaxNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNSoftmax();
  xnn_status status = xnn_define_softmax(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create softmax node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized sigmoid node into the subgraph, using the remapped ids to
map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineSigmoidNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNSigmoid();
  xnn_status status = xnn_define_sigmoid(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create sigmoid node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized floor node into the subgraph, using the remapped ids to map
the serialized ids to the new ids generated when defining the tensor values.
*/
Error defineFloorNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNFloor();
  xnn_status status = xnn_define_floor(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create floor node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

Error defineGlobalAvgPooling2dNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNGlobalAvgPooling2d();
  std::pair<float, float> min_max = getOutputMinMax(node);
  xnn_status status = xnn_define_global_average_pooling_2d(
      subgraph_ptr,
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create global average pooling node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

Error defineAvgPooling2dNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNAvgPooling2d();
  std::pair<float, float> min_max = getOutputMinMax(node);
  xnn_status status = xnn_define_average_pooling_2d(
      subgraph_ptr,
      graph_node->padding_top(),
      graph_node->padding_right(),
      graph_node->padding_bottom(),
      graph_node->padding_left(),
      graph_node->pooling_height(),
      graph_node->pooling_width(),
      graph_node->stride_height(),
      graph_node->stride_width(),
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create average pooling node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}
/*
Define serialized conv2d node into the subgraph, using the remapped ids to
map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineConv2dNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNConv2d();
  std::pair<float, float> min_max = getOutputMinMax(node);
  xnn_status status = xnn_define_convolution_2d(
      subgraph_ptr,
      graph_node->padding_top(),
      graph_node->padding_right(),
      graph_node->padding_bottom(),
      graph_node->padding_left(),
      graph_node->kernel_height(),
      graph_node->kernel_width(),
      graph_node->subsampling_height(),
      graph_node->subsampling_width(),
      graph_node->dilation_height(),
      graph_node->dilation_width(),
      graph_node->groups(),
      graph_node->group_input_channels(),
      graph_node->group_output_channels(),
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->filter_id()),
      remapped_ids.at(graph_node->bias_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create convolution node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized maxpool2d node into the subgraph, using the remapped ids to
map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineMaxPooling2dNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNMaxPooling2d();
  std::pair<float, float> min_max = getOutputMinMax(node);
  xnn_status status = xnn_define_max_pooling_2d(
      subgraph_ptr,
      graph_node->padding_top(),
      graph_node->padding_right(),
      graph_node->padding_bottom(),
      graph_node->padding_left(),
      graph_node->pooling_height(),
      graph_node->pooling_width(),
      graph_node->stride_height(),
      graph_node->stride_width(),
      graph_node->dilation_height(),
      graph_node->dilation_width(),
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create maxpool2d node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized div node into the subgraph.
*/
Error defineDivNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNDiv();
  std::pair<float, float> min_max = getOutputMinMax(node);
  xnn_status status = xnn_define_divide(
      subgraph_ptr,
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create div node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}
/*
Define serialized static transpose node into the subgraph, using the remapped
ids to map the serialized ids to the new ids generated when defining the
tensor values.
*/
Error defineStaticTransposeNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNStaticTranspose();

  // Get the permutation dims; we need to convert the uint32_t* to size_t*.
  std::vector<size_t> dims_data = flatbufferDimsToVector(graph_node->perm());
  xnn_status status = xnn_define_static_transpose(
      subgraph_ptr,
      graph_node->num_dims(),
      dims_data.data(),
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create static transpose node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized static resize bilinear 2d node into the subgraph, using the
remapped ids to map the serialized ids to the new ids generated when defining
the tensor values.
*/
Error defineStaticResizeBilinear2DNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  const fb_xnnpack::XNNStaticResizeBilinear2D* graph_node =
      node->xnode_union_as_XNNStaticResizeBilinear2D();
  xnn_status status = xnn_define_static_resize_bilinear_2d(
      subgraph_ptr,
      graph_node->new_height(),
      graph_node->new_width(),
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create StaticResizeBilinear2D node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized static constant pad node into the subgraph, using the
remapped ids to map the serialized ids to the new ids generated when defining
the tensor values.
*/
Error defineStaticConstantPadNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  const fb_xnnpack::XNNStaticConstantPad* graph_node =
      node->xnode_union_as_XNNStaticConstantPad();
  std::vector<size_t> pre_paddings_dims =
      flatbufferDimsToVector(graph_node->pre_paddings());
  std::vector<size_t> post_paddings_dims =
      flatbufferDimsToVector(graph_node->post_paddings());
  xnn_status status = xnn_define_static_constant_pad(
      subgraph_ptr,
      pre_paddings_dims.data(),
      post_paddings_dims.data(),
      graph_node->padding_value(),
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create StaticConstantPad node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized depthwise conv2d node into the subgraph, using the remapped
ids to map the serialized ids to the new ids generated when defining the
tensor values.
*/
Error defineDepthwiseConv2dNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNDepthwiseConv2d();
  std::pair<float, float> min_max = getOutputMinMax(node);
  xnn_status status = xnn_define_depthwise_convolution_2d(
      subgraph_ptr,
      graph_node->padding_top(),
      graph_node->padding_right(),
      graph_node->padding_bottom(),
      graph_node->padding_left(),
      graph_node->kernel_height(),
      graph_node->kernel_width(),
      graph_node->subsampling_height(),
      graph_node->subsampling_width(),
      graph_node->dilation_height(),
      graph_node->dilation_width(),
      graph_node->group_output_channels() /
          graph_node->group_input_channels(), // depth_multiplier
      graph_node->groups(), // input_channels = groups for depthwise conv
      min_max.first,
      min_max.second,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->filter_id()),
      remapped_ids.at(graph_node->bias_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create depthwise convolution node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}
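// Worked example for the depth multiplier above (illustrative): a depthwise
// convolution serialized with groups = 32, group_input_channels = 1 and
// group_output_channels = 2 yields depth_multiplier = 2 / 1 = 2, i.e. two
// filters per input channel.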
Error defineStaticReshapeNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNStaticReshape();

  // Get the new shape; we need to convert the uint32_t* to size_t*.
  std::vector<size_t> dims_data =
      flatbufferDimsToVector(graph_node->new_shape());
  xnn_status status = xnn_define_static_reshape(
      subgraph_ptr,
      graph_node->num_dims(),
      dims_data.data(),
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create static reshape node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized argmax pooling 2d node into the subgraph, using the
remapped ids to map the serialized ids to the new ids generated when defining
the tensor values.
*/
Error defineArgMaxPooling2dNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNArgMaxPooling2d();
  xnn_status status = xnn_define_argmax_pooling_2d(
      subgraph_ptr,
      graph_node->padding_top(),
      graph_node->padding_right(),
      graph_node->padding_bottom(),
      graph_node->padding_left(),
      graph_node->pooling_height(),
      graph_node->pooling_width(),
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_value_id()),
      remapped_ids.at(graph_node->output_index_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create argmaxpool2d node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized square root node into the subgraph, using the remapped ids
to map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineSquareRootNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNSquareRoot();
  xnn_status status = xnn_define_square_root(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create square root node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized ceiling node into the subgraph, using the remapped ids to
map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineCeilingNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNCeiling();
  xnn_status status = xnn_define_ceiling(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create ceiling node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}
/*
Define serialized hardswish node into the subgraph, using the remapped ids to
map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineHardswishNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNHardswish();
  xnn_status status = xnn_define_hardswish(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create hardswish node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized leaky relu node into the subgraph, using the remapped ids
to map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineLeakyReLUNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNLeakyReLU();
  xnn_status status = xnn_define_leaky_relu(
      subgraph_ptr,
      graph_node->negative_slope(),
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create leaky relu node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define serialized maximum node into the subgraph, using the remapped ids to
map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineMaximumNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNMaximum();
  xnn_status status = xnn_define_maximum2(
      subgraph_ptr,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create maximum node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Define negate node into the subgraph, using the remapped ids to map the
serialized ids to the new ids generated when defining the tensor values.
*/
Error defineNegateNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNNegate();
  xnn_status status = xnn_define_negate(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create negate node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}
/*
Defines square node into the subgraph, using the remapped ids to map the
serialized ids to the new ids generated when defining the tensor values.
*/
Error defineSquareNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNSquare();
  xnn_status status = xnn_define_square(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create square node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Defines ELU node into the subgraph, using the remapped ids to map the
serialized ids to the new ids generated when defining the tensor values.
*/
Error defineELUNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNELU();
  xnn_status status = xnn_define_elu(
      subgraph_ptr,
      graph_node->alpha(),
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create ELU node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Defines absolute value node into the subgraph, using the remapped ids to map
the serialized ids to the new ids generated when defining the tensor values.
*/
Error defineAbsNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNAbs();
  xnn_status status = xnn_define_abs(
      subgraph_ptr,
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create abs node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Defines serialized prelu node into the subgraph, using the remapped ids to
map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error definePReLUNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNPReLU();
  xnn_status status = xnn_define_prelu(
      subgraph_ptr,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create prelu node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Defines serialized concatenate2 node into the subgraph, using the remapped
ids to map the serialized ids to the new ids generated when defining the
tensor values.
*/
Error defineConcatenate2Node(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNConcatenate2();
  xnn_status status = xnn_define_concatenate2(
      subgraph_ptr,
      graph_node->axis(),
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create cat2 node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}
/*
Defines serialized concatenate3 node into the subgraph, using the remapped
ids to map the serialized ids to the new ids generated when defining the
tensor values.
*/
Error defineConcatenate3Node(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNConcatenate3();
  xnn_status status = xnn_define_concatenate3(
      subgraph_ptr,
      graph_node->axis(),
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->input3_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create cat3 node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Defines serialized concatenate4 node into the subgraph, using the remapped
ids to map the serialized ids to the new ids generated when defining the
tensor values.
*/
Error defineConcatenate4Node(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNConcatenate4();
  xnn_status status = xnn_define_concatenate4(
      subgraph_ptr,
      graph_node->axis(),
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->input3_id()),
      remapped_ids.at(graph_node->input4_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create cat4 node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Defines serialized static_slice node into the subgraph, using the remapped
ids to map the serialized ids to the new ids generated when defining the
tensor values.
*/
Error defineStaticSliceNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNStaticSlice();
  std::vector<size_t> offsets = flatbufferDimsToVector(graph_node->offsets());
  std::vector<size_t> sizes = flatbufferDimsToVector(graph_node->sizes());

  xnn_status status = xnn_define_static_slice(
      subgraph_ptr,
      graph_node->num_dims(),
      offsets.data(),
      sizes.data(),
      remapped_ids.at(graph_node->input_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create static slice node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}
/*
Defines Scaled Dot Product Attention (SDPA) node into the subgraph, using the
remapped ids to map the serialized ids to the new ids generated when defining
the tensor values.
*/
Error defineScaledDotProductAttentionNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNScaledDotProductAttention();

  xnn_status status = xnn_define_scaled_dot_product_attention(
      subgraph_ptr,
      xnn_attention_logits_cap_type_none, // cap_type
      nullptr, // cap_value - not used
      remapped_ids.at(graph_node->query_id()),
      remapped_ids.at(graph_node->key_id()),
      remapped_ids.at(graph_node->value_id()),
      remapped_ids.at(graph_node->scale_id()),
      remapped_ids.at(graph_node->mask_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());

  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create SDPA node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Defines batch matrix multiply node into the subgraph, using the remapped ids
to map the serialized ids to the new ids generated when defining the tensor
values.
*/
Error defineBatchMatrixMultiplyNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  auto graph_node = node->xnode_union_as_XNNBatchMatrixMultiply();
  xnn_status status = xnn_define_batch_matrix_multiply(
      subgraph_ptr,
      remapped_ids.at(graph_node->input1_id()),
      remapped_ids.at(graph_node->input2_id()),
      remapped_ids.at(graph_node->output_id()),
      graph_node->flags());
  ET_CHECK_OR_RETURN_ERROR(
      status == xnn_status_success,
      Internal,
      "Failed to create BMM node %i with code: %s",
      node->debug_handle(),
      xnn_status_to_string(status));
  return Error::Ok;
}

/*
Returns a NotImplemented error code. This function is meant to be called when
the compiler encounters an XNodeType from the flatbuffer that has not yet
been implemented.
*/
Error defineNotImplementedNode(
    xnn_subgraph_t subgraph_ptr,
    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
    const NodePtr node,
    const fb_xnnpack::XNNGraph* graph) noexcept {
  MAYBE_UNUSED(graph);
  ET_CHECK_OR_RETURN_ERROR(
      false,
      NotImplemented,
      "Unhandled node type: %s",
      fb_xnnpack::EnumNameXNodeUnion(node->xnode_union_type()));
}

/*
Returns the pointer to the defineNode function that handles the given XNode
type.
*/
#define _DEFINE(name)                     \
  case fb_xnnpack::XNodeUnion::XNN##name: \
    return &define##name##Node;

DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) {
  switch (nodeType) {
    _DEFINE(Add)
    _DEFINE(FullyConnected)
    _DEFINE(Softmax)
    _DEFINE(Sigmoid)
    _DEFINE(StaticTranspose)
    _DEFINE(Clamp)
    _DEFINE(Conv2d)
    _DEFINE(Div)
    _DEFINE(StaticResizeBilinear2D)
    _DEFINE(StaticConstantPad)
    _DEFINE(AvgPooling2d)
    _DEFINE(Minimum)
    _DEFINE(DepthwiseConv2d)
    _DEFINE(MaxPooling2d)
    _DEFINE(Multiply)
    _DEFINE(Subtract)
    _DEFINE(Floor)
    _DEFINE(Convert)
    _DEFINE(GlobalAvgPooling2d)
    _DEFINE(StaticReshape)
    _DEFINE(ArgMaxPooling2d)
    _DEFINE(SquareRoot)
    _DEFINE(Ceiling)
    _DEFINE(Hardswish)
    _DEFINE(LeakyReLU)
    _DEFINE(Maximum)
    _DEFINE(Negate)
    _DEFINE(Square)
    _DEFINE(ELU)
    _DEFINE(Abs)
    _DEFINE(PReLU)
    _DEFINE(Concatenate2)
    _DEFINE(Concatenate3)
    _DEFINE(Concatenate4)
    _DEFINE(StaticSlice)
    _DEFINE(ScaledDotProductAttention)
    _DEFINE(BatchMatrixMultiply)
    case fb_xnnpack::XNodeUnion::NONE:
    default: // Adding this as a catch-all, just in case
      return &defineNotImplementedNode;
  }
}
#undef _DEFINE
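// Dispatch shape (mirrors the node-definition loop in compileModel below):
//
//   DefineNodeFunc func = getDefineNodeFunc(node->xnode_union_type());
//   Error err = func(subgraph.get(), remapped_ids, node, flatbuffer_graph);
//
// Unrecognized node types fall through to defineNotImplementedNode, which
// surfaces a NotImplemented error to the caller.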
/*
Builds the XNNPACK runtime object using the buffer pointer. The buffer
pointer must be a valid pointer to the serialized XNNPACK object. It also
fills the XNNExecutor object with the built xnn_runtime and the input/output
ids.
*/
ET_NODISCARD Error XNNCompiler::compileModel(
    const void* buffer_pointer,
    size_t num_bytes,
    XNNExecutor* executor,
    MemoryAllocator* runtime_allocator,
    xnn_workspace_t workspace) {
  Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
  const uint8_t* flatbuffer_data = nullptr;
  const uint8_t* constant_data = nullptr;
  CompileAllocator compile_allocator;

  // Header status can only be either Error::Ok or Error::NotFound
  if (header.ok()) {
    flatbuffer_data = reinterpret_cast<const uint8_t*>(buffer_pointer) +
        header->flatbuffer_offset;
    constant_data = reinterpret_cast<const uint8_t*>(buffer_pointer) +
        header->constant_data_offset;
  } else if (header.error() == Error::NotFound) {
    flatbuffer_data = reinterpret_cast<const uint8_t*>(buffer_pointer);
  } else {
    ET_LOG(Error, "XNNHeader may be corrupt");
    return header.error();
  }

  // Temporarily support identifiers XN00 and XN01
  bool is_supported_version =
      strncmp(flatbuffers::GetBufferIdentifier(flatbuffer_data), "XN00", 4) ==
          0 ||
      strncmp(flatbuffers::GetBufferIdentifier(flatbuffer_data), "XN01", 4) ==
          0;
  ET_CHECK_OR_RETURN_ERROR(
      is_supported_version,
      DelegateInvalidCompatibility,
      "XNNPACK Delegate Serialization Format version identifier '%.4s' != expected 'XN00' or 'XN01'",
      flatbuffers::GetBufferIdentifier(flatbuffer_data));

  auto flatbuffer_graph = fb_xnnpack::GetXNNGraph(flatbuffer_data);
  // Initialize XNNPACK
  xnn_status status = xnn_initialize(/*allocator=*/nullptr);
  ET_CHECK_OR_RETURN_ERROR(
      xnn_status_success == status,
      Internal,
      "XNN initialize failed with code: %s",
      xnn_status_to_string(status));

  // Create the XNNPACK subgraph
  xnn_subgraph_t subgraph_ptr = nullptr;
  status = xnn_create_subgraph(
      /*external_value_ids=*/flatbuffer_graph->num_externs(),
      /*flags=*/0,
      &subgraph_ptr);
  ET_CHECK_OR_RETURN_ERROR(
      xnn_status_success == status,
      Internal,
      "XNN subgraph creation failed with code: %s",
      xnn_status_to_string(status));

  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
      subgraph_ptr, &xnn_delete_subgraph);

  // Mapping from old ids to newly created value ids. The old ids that were
  // serialized were generated AoT; since we are re-defining tensor values,
  // the defined ids could differ from the ones generated AoT. As a result,
  // we need a new mapping from the old ids to the newly created ones.
  std::unordered_map<uint32_t, uint32_t> remapped_ids;
  // Invalid ids do not need to be remapped
  remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID);

  // External ids for inputs and outputs
  std::vector<uint32_t> input_ids;
  std::vector<uint32_t> output_ids;
  Error err = Error::Ok;
  for (auto value : *flatbuffer_graph->xvalues()) {
    err = defineTensor(
        subgraph.get(),
        remapped_ids,
        value,
        flatbuffer_graph,
        constant_data,
        input_ids,
        output_ids,
        compile_allocator);

    if (err != Error::Ok) {
      return err;
    }
  }

  for (auto node : *flatbuffer_graph->xnodes()) {
    err = getDefineNodeFunc(node->xnode_union_type())(
        subgraph.get(), remapped_ids, node, flatbuffer_graph);
    if (err != Error::Ok) {
      return err;
    }
  }
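  // Two runtime-creation paths follow: with ENABLE_XNNPACK_SHARED_WORKSPACE,
  // runtimes can share the caller-provided xnn_workspace_t via
  // xnn_create_runtime_v4; otherwise each runtime owns its own scratch
  // memory and is created with xnn_create_runtime_v3.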
  uint32_t runtime_flags = 0;

#if defined(ENABLE_XNNPACK_PROFILING) || defined(ET_EVENT_TRACER_ENABLED)
  runtime_flags |= XNN_FLAG_BASIC_PROFILING;
#endif

  xnn_runtime_t runtime_ptr = nullptr;

#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
  ET_CHECK_OR_RETURN_ERROR(
      workspace != nullptr,
      Internal,
      "Failed to initialize XNNPACK workspace");
  status = xnn_create_runtime_v4(
      subgraph.get(),
      /*weight_cache=*/nullptr, // TODO - support weight cache
      workspace,
      ::executorch::extension::threadpool::get_pthreadpool(),
      runtime_flags,
      &runtime_ptr);
#else
  status = xnn_create_runtime_v3(
      subgraph.get(),
      /*weight_cache=*/nullptr, // TODO - support weight cache
      ::executorch::extension::threadpool::get_pthreadpool(),
      runtime_flags,
      &runtime_ptr);
#endif

  ET_CHECK_OR_RETURN_ERROR(
      xnn_status_success == status,
      Internal,
      "XNN runtime creation failed with code: %s",
      xnn_status_to_string(status));

  err = executor->initialize( // NOLINT: runtime_ptr is non-null
      runtime_ptr,
      std::move(input_ids),
      std::move(output_ids));

  return err;
}

} // namespace delegate
} // namespace xnnpack
} // namespace backends
} // namespace executorch