//============================================================================
//
// Copyright (c) Qualcomm Innovation Center, Inc.
// All rights reserved
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
//============================================================================

namespace qnn_delegate;

/// Defines the HTP hardware architectures available to the HTP backend.
enum HtpArch: int {
  NONE = 0,
  V68 = 68,
  V69 = 69,
  V73 = 73,
  V75 = 75,
}

table HtpInfo {
  /// The HTP hardware architecture of the SoC.
  htp_arch:HtpArch;

  /// The VTCM size to use for graphs, in MB.
  vtcm_size_in_mb:uint;
}

/// Refer to the Qualcomm AI Engine Direct SDK for the SoC model of supported
/// Snapdragon devices.
enum QcomChipset: int {
  UNKNOWN_SM = 0,
  SM8450 = 36,
  SA8295 = 39,
  SM8475 = 42,
  SM8550 = 43,
  SSG2115P = 46,
  SM8650 = 57,
}

/// Information about the specified SoC.
table SocInfo {
  /// Identifies the SoC model.
  soc_model:QcomChipset;

  /// The HTP information of the specified SoC.
  htp_info:HtpInfo;
}

/// Defines the performance modes available for the HTP backend.
enum QnnExecuTorchHtpPerformanceMode: int {
  kHtpDefault = 0,
  kHtpSustainedHighPerformance,
  kHtpBurst,
  kHtpHighPerformance,
  kHtpPowerSaver,
  kHtpLowPowerSaver,
  kHtpHighPowerSaver,
  kHtpLowBalanced,
  kHtpBalanced,
}

/// Defines the precision used for graph tensors that are neither input nor
/// output tensors. This enum controls the trade-off between performance and
/// accuracy.
enum QnnExecuTorchHtpPrecision: int {
  kHtpQuantized = 0,
  kHtpFp16,
}

/// The QNN backend used to delegate the model's nodes. Each backend has
/// its own set of supported ops and tensor types.
enum QnnExecuTorchBackendType: int {
  kUndefinedBackend = 0,
  kGpuBackend,
  kHtpBackend,
  kDspBackend,
}

/// Defines the PD (protection domain) sessions available for the HTP backend.
enum QnnExecuTorchHtpPdSession: int {
  kHtpUnsignedPd = 0,
  kHtpSignedPd,
}

/// Specifies the backend options for the HTP backend.
table QnnExecuTorchHtpBackendOptions {
  /// Maximum spill-fill buffer size across contexts.
  max_sf_buf_size:int;

  /// The default performance mode sets no configurations on the HTP.
  performance_mode:QnnExecuTorchHtpPerformanceMode;

  /// The default precision mode supports quantized networks. Other precision
  /// modes may only be supported on certain SoCs.
  precision:QnnExecuTorchHtpPrecision;

  /// Signed or unsigned HTP PD session. The default PD session is unsigned.
  pd_session:QnnExecuTorchHtpPdSession;

  /// Optional parameter specifying the directory of the QNN Skel library.
  /// Only useful for backends that have a Skel library.
  skel_library_dir:string;

  /// Using conv HMX with short depths may improve performance, but
  /// convolutions with short depths and/or weights that are not symmetric
  /// could exhibit inaccurate results.
  use_conv_hmx:bool;

  /// Deep Learning Bandwidth Compression allows inputs to be compressed,
  /// so that the processing bandwidth can be lowered.
  use_dlbc:bool;

  /// Folding ReLU may improve performance. This optimization is only correct
  /// when the quantization ranges of the convolution are equal to, or a
  /// subset of, those of the ReLU operation.
  use_fold_relu:bool;

  /// When multiple contexts are generated inside the same .pte file, a
  /// single spill-fill allocation can be reserved and reused across all
  /// the splits.
  use_multi_contexts:bool;

  /// When multiple graphs appear inside the same context, weights can be
  /// shared across all graphs.
  use_weight_sharing:bool;
}

/// Logging level of the delegate and QNN backend.
enum QnnExecuTorchLogLevel: int {
  kLogOff = 0,
  kLogLevelError,
  kLogLevelWarn,
  kLogLevelInfo,
  kLogLevelVerbose,
  kLogLevelDebug,
}

/// Profiling level of the delegate and QNN backend.
enum QnnExecuTorchProfileLevel: int {
  kProfileOff = 0,
  kProfileBasic,
  kProfileDetailed,
  kProfileOptrace,
}

/// QNN backends currently supported.
table QnnExecuTorchBackendOptions {
  /// The QNN backend library to open and execute the graph with. This is a
  /// required argument; supplying kUndefinedBackend is an error.
  backend_type:QnnExecuTorchBackendType;

  /// Backend options for the HTP backend.
  htp_options:QnnExecuTorchHtpBackendOptions;
}

table QnnExecuTorchOptions {
  /// Specifies the SoC to compile for or execute on.
  soc_info:SocInfo;

  /// Optional backend-specific options, e.g. for the HTP backend.
  backend_options:QnnExecuTorchBackendOptions;

  /// Optional parameter naming the QNN graph to create when no QNN context
  /// blob is given.
  graph_name:string;

  /// Optional parameter to override the QNN backend library.
  library_path:string;

  /// Logging level of the delegate and the backend. Default is off.
  log_level:QnnExecuTorchLogLevel;

  /// If true, the graph is constructed on device (online prepare). Default
  /// is false.
  online_prepare:bool;

  /// If enabled, all intermediate tensor outputs will be dumped. This option
  /// exists for debugging accuracy issues. Default is off.
  dump_intermediate_outputs:bool;

  /// Profiling level of the delegate and the backend. Default is off.
  profile_level:QnnExecuTorchProfileLevel;

  /// Enables use of a shared buffer between the application and the backend
  /// for graph I/O.
  shared_buffer:bool;

  /// True if the model comes from a QNN context binary.
  is_from_context_binary:bool;

  /// True if multiple graphs exist in one .pte file.
  multiple_graphs:bool;
}

root_type QnnExecuTorchOptions;
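
// Example (an illustrative sketch, not part of the schema): a JSON payload
// matching this schema. flatc can convert such a file into a serialized
// QnnExecuTorchOptions blob, e.g. `flatc -b <this schema> options.json`.
// The concrete values below (SM8550 paired with a V73 HTP and 8 MB of VTCM,
// and the graph name "forward") are assumptions chosen for illustration.
//
// {
//   "soc_info": {
//     "soc_model": "SM8550",
//     "htp_info": { "htp_arch": "V73", "vtcm_size_in_mb": 8 }
//   },
//   "backend_options": {
//     "backend_type": "kHtpBackend",
//     "htp_options": {
//       "performance_mode": "kHtpBurst",
//       "precision": "kHtpQuantized"
//     }
//   },
//   "graph_name": "forward",
//   "log_level": "kLogLevelWarn"
// }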
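
// A minimal Python sketch of building the same options programmatically,
// assuming this schema has been compiled with `flatc --python` into a
// `qnn_delegate` package. The generated names below follow flatc's classic
// Python codegen conventions and are assumptions, not part of this schema.
//
//   import flatbuffers
//   from qnn_delegate import QnnExecuTorchOptions
//   from qnn_delegate.QnnExecuTorchLogLevel import QnnExecuTorchLogLevel
//
//   builder = flatbuffers.Builder(0)
//   graph_name = builder.CreateString("forward")
//
//   QnnExecuTorchOptions.QnnExecuTorchOptionsStart(builder)
//   QnnExecuTorchOptions.QnnExecuTorchOptionsAddGraphName(builder, graph_name)
//   QnnExecuTorchOptions.QnnExecuTorchOptionsAddLogLevel(
//       builder, QnnExecuTorchLogLevel.kLogLevelWarn)
//   options = QnnExecuTorchOptions.QnnExecuTorchOptionsEnd(builder)
//   builder.Finish(options)
//   blob = builder.Output()  # bytes of a serialized QnnExecuTorchOptions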