// torch/csrc/autograd/profiler_kineto.cpp
#include <cstring>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <torch/csrc/autograd/profiler_kineto.h>

#include <c10/macros/Export.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <c10/util/overloaded.h>

#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/collection.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/observer.h>
#include <torch/csrc/profiler/perf.h>
#include <torch/csrc/profiler/standalone/itt_observer.h>
#include <torch/csrc/profiler/standalone/nvtx_observer.h>
#include <torch/csrc/profiler/standalone/privateuse1_observer.h>
#include <torch/csrc/profiler/util.h>

#include <ATen/Context.h>

#include <stdexcept>
#include <utility>

#ifdef USE_KINETO
#include <ApproximateClock.h>
#include <libkineto.h>
#include <time_since_epoch.h>

#ifndef _MSC_VER
// TODO: To be removed once this properly works from libkineto.
// Literal copy-n-paste from third_party/kineto/libkineto/src/WeakSymbols.cpp
extern "C" {
// This function is needed to avoid a superfluous dependency on the GNU OpenMP
// library when cuPTI is linked statically. For more details see
// https://github.com/pytorch/pytorch/issues/51026
__attribute__((weak)) int acc_get_device_type();
__attribute__((weak)) int acc_get_device_type() {
  throw std::runtime_error(
      "Dummy implementation of acc_get_device_type is not supposed to be called!");
}
} // extern "C"
#endif // _MSC_VER
#endif // USE_KINETO

namespace torch {
namespace autograd::profiler {

namespace {
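// When Kineto is available, use its epoch clock so PyTorch-side timestamps
// line up with the timestamps Kineto records; otherwise fall back to
// c10::getTime().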
inline int64_t getTimeNs() {
#ifdef USE_KINETO
  return libkineto::timeSinceEpoch(std::chrono::system_clock::now());
#else
  return c10::getTime();
#endif // USE_KINETO
}

using torch::profiler::impl::ActiveProfilerType;
using torch::profiler::impl::EventType;
using torch::profiler::impl::ExtraFields;
using torch::profiler::impl::get_record_concrete_inputs_enabled;
using torch::profiler::impl::ivalueListToStr;
using torch::profiler::impl::ivalueToStr;
using torch::profiler::impl::op_input_t;
using torch::profiler::impl::ProfilerStateBase;
using torch::profiler::impl::PyExtraFieldsBase;
using torch::profiler::impl::Result;
using torch::profiler::impl::shape;
using torch::profiler::impl::shapesToStr;
using torch::profiler::impl::stacksToStr;
using torch::profiler::impl::strListToStr;
using torch::profiler::impl::TensorMetadata;
using torch::profiler::impl::variantShapesToStr;

struct OpArgData {
  bool hasData;
  std::vector<shape> shapes;
  std::vector<std::string> dtypes;
  std::vector<c10::IValue> concreteInputs;
  std::vector<std::vector<int64_t>> shapesForKinetoEvent;
  std::vector<shape> strides;
};

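// Flattens the recorded per-input metadata (tensors, tensor lists, and
// scalars) into parallel vectors of shapes, strides, and dtype strings, plus
// the concrete IValues when concrete-input recording was enabled.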
auto parseArgData(
    const std::vector<op_input_t>& input_shapes,
    const std::vector<op_input_t>& concreteInputs) {
  if (input_shapes.empty()) {
    return OpArgData{false, {}, {}, {}, {}, {}};
  }

  std::vector<shape> shapes(input_shapes.size());
  std::vector<shape> strides(input_shapes.size());
  std::vector<std::vector<int64_t>> shapesForKinetoEvent(input_shapes.size());

  std::vector<std::string> dtypes(input_shapes.size());
  std::vector<c10::IValue> concrete_inputs_list;

  for (const auto& i : c10::irange(input_shapes.size())) {
    std::visit(
        c10::overloaded(
            [&](const TensorMetadata& t) {
              shapes[i] = t.sizes_;
              shapesForKinetoEvent[i] = t.sizes_;
              dtypes[i] = std::string(scalarTypeToTypeMeta(t.dtype_).name());
              strides[i] = t.strides_;
            },
            [&](const std::vector<TensorMetadata>& l) {
              std::vector<std::vector<int64_t>> shape;
              shape.reserve(l.size());
              std::vector<std::vector<int64_t>> stride;
              stride.reserve(l.size());
              for (const auto& t : l) {
                shape.emplace_back(t.sizes_);
                stride.emplace_back(t.strides_);
              }
              shapes[i] = shape;
              strides[i] = stride;
              dtypes[i] = "TensorList";
            },
            [&](const c10::IValue&) { dtypes[i] = "Scalar"; },
            [&](const auto&) {}),
        input_shapes[i]);
  }

  // If we recorded concrete inputs, then parse them
  if (input_shapes.size() == concreteInputs.size() && !concreteInputs.empty()) {
    concrete_inputs_list.resize(input_shapes.size());

    for (const auto& i : c10::irange(input_shapes.size())) {
      std::visit(
          c10::overloaded(
              [&](const c10::IValue& val) { concrete_inputs_list[i] = val; },
              [&](const auto&) {}),
          input_shapes[i]);
      std::visit(
          c10::overloaded(
              [&](const c10::IValue& val) {
                concrete_inputs_list[i] = val;
                dtypes[i] = "ScalarList";
              },
              [&](const auto&) {}),
          concreteInputs[i]);
    }
  }

  return OpArgData{
      true,
      shapes,
      dtypes,
      concrete_inputs_list,
      shapesForKinetoEvent,
      strides};
}

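// Base class for the metadata visitors below. It resolves the Kineto activity
// that backs a `Result` (if any) and provides `addMetadata` to attach
// key/value pairs to it.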
struct MetadataBase {
  /* implicit */ MetadataBase(const std::shared_ptr<Result>& result)
      : kinetoActivity_{result->kineto_activity_} {
    if (std::holds_alternative<ExtraFields<EventType::Kineto>>(
            result->extra_fields_)) {
      // In order to add metadata we have to downcast from
      // `libkineto::ITraceActivity` to `libkineto::GenericTraceActivity`. We
      // know that all activities provided by PyTorch are of the correct type,
      // however Kineto profilers can (and do) add events that inherit directly
      // from ITraceActivity. As a result, any Result which was constructed from
      // an event that Kineto provided is unsafe to cast.
      if (!(SOFT_ASSERT(!hasKinetoActivity()))) {
        result->kineto_activity_ = nullptr;
      }
      kinetoActivity_ = result->kineto_activity_;
    }
  }

  void addMetadata(const std::string& key, const std::string& value) {
    if (kinetoActivity_ && !value.empty() && value != "\"\"") {
      torch::profiler::impl::kineto::addMetadata(
          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
          const_cast<torch::profiler::impl::kineto::activity_t*>(
              kinetoActivity_),
          key,
          value);
    }
  }

  bool hasKinetoActivity() const {
    return kinetoActivity_ != nullptr;
  }

 private:
  const torch::profiler::impl::kineto::activity_t* kinetoActivity_{nullptr};
};

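// Visitor that attaches the metadata TensorBoard consumes: module hierarchy,
// call stack, and the ids that link Python events to their parents.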
struct AddTensorboardFields : public MetadataBase {
  AddTensorboardFields(
      const std::shared_ptr<Result>& result,
      KinetoEvent& kineto_event)
      : MetadataBase(result) {
    result->visit(*this);
    const auto module_hierarchy = kineto_event.moduleHierarchy();
    addMetadata("Module Hierarchy", stacksToStr(module_hierarchy.vec(), "."));
    addMetadata("Call stack", stacksToStr(kineto_event.stack().vec(), ";"));

    result->visit_if_base<PyExtraFieldsBase>([&, this](const auto& i) -> void {
      this->addMetadata("Python id", std::to_string(i.id_));

      std::optional<std::string> parent_id;
      std::shared_ptr<Result> parent = result->parent_.lock();
      while (parent && !parent_id.has_value()) {
        parent->visit_if_base<PyExtraFieldsBase>(
            [&](const auto& j) { parent_id = std::to_string(j.id_); });
        parent = parent->parent_.lock();
      }
      this->addMetadata("Python parent id", parent_id.value_or("null"));
    });
  }

  void operator()(const ExtraFields<EventType::PyCall>& py_call) {
    if (py_call.module_.has_value()) {
      addMetadata("Python module id", std::to_string(py_call.module_->id_));
    }
  }

  template <typename T>
  void operator()(const T&) {}
};

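// Visitor that attaches the general-purpose metadata (input shapes, strides,
// dtypes, allocation details, sequence numbers, etc.) for each event type.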
struct AddGenericMetadata : public MetadataBase {
  AddGenericMetadata(
      std::shared_ptr<Result>& result,
      const torch::profiler::impl::ProfilerConfig* config)
      : MetadataBase(result), config_(config) {
    result->visit(*this);
    if (config->experimental_config.verbose) {
      result->visit_if_base<PyExtraFieldsBase>(
          [&, this](const auto& i) -> void {
            this->addMetadata("Python thread", std::to_string(i.python_tid_));
          });
    }
  }

  void operator()(ExtraFields<EventType::TorchOp>& op_event) {
    const auto arg_data =
        parseArgData(op_event.inputs_, op_event.concrete_inputs_);

    if (arg_data.hasData) {
      if (get_record_concrete_inputs_enabled()) {
        addMetadata("Input Dims", variantShapesToStr(arg_data.shapes));
        addMetadata("Input Strides", variantShapesToStr(arg_data.strides));
      } else {
        addMetadata("Input Dims", shapesToStr(arg_data.shapesForKinetoEvent));
      }
      addMetadata("Input type", strListToStr(arg_data.dtypes));
      if (!arg_data.concreteInputs.empty()) {
        addMetadata(
            "Concrete Inputs", ivalueListToStr(arg_data.concreteInputs));
      }
    }

    // Add metadata for kwinputs, if any exist
    for (const auto& [key, val] : op_event.kwinputs_) {
      bool isString = val.isString();
      addMetadata(key, ivalueToStr(val, isString));
    }
    // Add any extra metadata
    for (const auto& [key, val] : op_event.extra_meta_) {
      addMetadata(key, val);
    }

    if (config_ && !config_->experimental_config.performance_events.empty()) {
      auto& event_names = config_->experimental_config.performance_events;
      for (const auto i : c10::irange(op_event.perf_event_counters_->size())) {
        addMetadata(
            event_names[i],
            std::to_string((*op_event.perf_event_counters_)[i]));
      }
    }

    // Add information about an associated forward op, if a sequence number
    // is available (e.g. during training)
    if (op_event.sequence_number_ >= 0) {
      addMetadata("Fwd thread id", std::to_string(op_event.forward_tid_));
      addMetadata("Sequence number", std::to_string(op_event.sequence_number_));
    }
    addMetadata(
        "Record function id", std::to_string(op_event.record_function_id_));
  }

  void operator()(ExtraFields<EventType::Backend>& backend_event) {
    if (!backend_event.backend_.empty()) {
      addMetadata("Backend", "\"" + backend_event.backend_ + "\"");
    }
  }

  void operator()(const ExtraFields<EventType::Allocation>& alloc) {
    addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_));
    addMetadata("Device Id", std::to_string(alloc.device_index_));
    addMetadata("Addr", std::to_string(reinterpret_cast<intptr_t>(alloc.ptr_)));
    addMetadata("Bytes", std::to_string(alloc.alloc_size_));
    addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
    addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
  }

  void operator()(const ExtraFields<EventType::OutOfMemory>& alloc) {
    addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_));
    addMetadata("Device Id", std::to_string(alloc.device_index_));
    addMetadata("Bytes", std::to_string(alloc.alloc_size_));
    addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
    addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
  }

  template <typename T>
  void operator()(const T&) {}

 private:
  /* To get names of the performance events */
  const torch::profiler::impl::ProfilerConfig* config_;
};

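// Thread-local (or, for on-demand/global profiling, process-wide) profiler
// state. It owns the RecordQueue that collects events and is responsible for
// materializing them into KinetoEvents when the trace is finalized.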
struct KinetoThreadLocalState : public ProfilerStateBase {
  explicit KinetoThreadLocalState(
      const ProfilerConfig& config,
      std::set<torch::profiler::impl::ActivityType> activities)
      : ProfilerStateBase(config),
        startTime(getTimeNs()),
        recordQueue(config, std::move(activities)) {}
  ~KinetoThreadLocalState() override = default;

  static KinetoThreadLocalState* get(bool global) {
    auto* state = ProfilerStateBase::get(/*global=*/global);
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        state == nullptr ||
        state->profilerType() == ActiveProfilerType::KINETO);
    return static_cast<KinetoThreadLocalState*>(state);
  }

  ActiveProfilerType profilerType() override {
    return ActiveProfilerType::KINETO;
  }

  void reportVulkanEventToProfiler(torch::profiler::impl::vulkan_id_t id) {
    if (!config_.disabled()) {
      recordQueue.getSubqueue()->emplace_vulkan_event(
          c10::getApproximateTime(), id);
    }
  }

  void reportMemoryUsage(
      void* ptr,
      int64_t alloc_size,
      size_t total_allocated,
      size_t total_reserved,
      c10::Device device) override {
    if (config_.profile_memory && !config_.disabled()) {
      recordQueue.getSubqueue()->emplace_allocation_event(
          c10::getApproximateTime(),
          ptr,
          alloc_size,
          total_allocated,
          total_reserved,
          device.type(),
          device.index());
    }
  }

  void reportOutOfMemory(
      int64_t alloc_size,
      size_t total_allocated,
      size_t total_reserved,
      c10::Device device) override {
    if (config_.profile_memory && !config_.disabled()) {
      recordQueue.getSubqueue()->emplace_ooms_event(
          c10::getApproximateTime(),
          alloc_size,
          total_allocated,
          total_reserved,
          device.type(),
          device.index());
    }
  }

  void setEventPostProcessingCallback(post_process_t&& cb) {
    eventPostProcessCb = std::move(cb);
  }

  void pausePython() {
    recordQueue.stop();
  }

  void resumePython() {
    recordQueue.restart();
  }

  std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>
  finalizeTrace() {
    auto end_time = getTimeNs();
    recordQueue.stop();

    std::lock_guard<std::mutex> guard(state_mutex_);
    auto converter = clockConverter.makeConverter();
#ifdef USE_KINETO
    libkineto::get_time_converter() = converter;
#endif
    auto records_and_trace =
        recordQueue.getRecords(std::move(converter), startTime, end_time);

    materializeOpEvents(records_and_trace.first);

    // `kinetoEvents` does not include Python events. Instead it exposes them
    // via the `stacks` property.
    kinetoEvents.erase(
        std::remove_if(
            kinetoEvents.begin(),
            kinetoEvents.end(),
            [](const auto& i) { return i.isPythonFunction(); }),
        kinetoEvents.end());

    return std::move(records_and_trace.second);
  }

  template <typename T>
  void invokeCallback(T& t) {
    if (eventPostProcessCb) {
      eventPostProcessCb(t.debug_handle_, t.jit_stack_, t.jit_modules_);
    }
  }

  void materializeOpEvents(std::vector<std::shared_ptr<Result>>& events) {
    for (auto& e : events) {
      if (e->parent_.expired() && e->deviceType() == c10::DeviceType::CPU) {
        eventTree.push_back(e);
      }

      if (e->finished_) {
        e->visit(c10::overloaded(
            [this](ExtraFields<EventType::TorchOp>& i) { invokeCallback(i); },
            [this](ExtraFields<EventType::Backend>& i) { invokeCallback(i); },
            [](auto&) {}));

        kinetoEvents.emplace_back(e, config_.experimental_config.verbose);
        AddTensorboardFields add_tb(e, kinetoEvents.back());
        AddGenericMetadata add_generic(e, &config_);

        // It is not safe to use the activity after post processing.
        e->kineto_activity_ = nullptr;
      }
    }
  }

  uint64_t startTime;
  c10::ApproximateClockToUnixTimeConverter clockConverter;
  torch::profiler::impl::RecordQueue recordQueue;
  std::vector<KinetoEvent> kinetoEvents;
  std::vector<experimental_event_t> eventTree;
  // Optional, if event post-processing is enabled.
  post_process_t eventPostProcessCb;
};

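// RecordFunction enter/exit callbacks. `onFunctionEnter` opens an op event in
// the calling thread's subqueue; `onFunctionExit` stamps the end time, stops
// perf counters if requested, and pops the Kineto correlation id.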
template <bool use_global_state_ptr = false>
std::unique_ptr<at::ObserverContext> onFunctionEnter(
    const at::RecordFunction& fn) {
  auto state_ptr = KinetoThreadLocalState::get(use_global_state_ptr);
  if (!state_ptr) {
    return nullptr;
  }
  return state_ptr->recordQueue.getSubqueue()->begin_op(fn);
}

// @lint-ignore CLANGTIDY clang-diagnostic-unused-parameter
template <bool use_global_state_ptr = false>
void onFunctionExit(
    const at::RecordFunction& fn,
    at::ObserverContext* ctx_ptr) {
  auto state_ptr = KinetoThreadLocalState::get(use_global_state_ptr);
  if (!state_ptr) {
    return;
  }
  const auto& config = state_ptr->config();
  auto* kineto_ctx_ptr =
      static_cast<torch::profiler::impl::KinetoObserverContext*>(ctx_ptr);
  TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
  kineto_ctx_ptr->event_->end_time_ = c10::getApproximateTime();
  if (!config.experimental_config.performance_events.empty()) {
    state_ptr->recordQueue.getSubqueue()->disable_perf_profiler(
        *kineto_ctx_ptr->event_->counters_);
  }
  kineto_ctx_ptr->event_->basic_fields_.end_tid_ =
      at::RecordFunction::currentThreadId();
  if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
    try {
      auto fallback = kineto_ctx_ptr->fallback_;
      TORCH_INTERNAL_ASSERT(fallback != nullptr);
      torch::profiler::impl::cudaStubs()->record(
          nullptr, &fallback->device_event_end_, nullptr);
    } catch (const std::exception& e) {
      LOG(WARNING) << "Failed to record CUDA event. " << e.what();
    }
  } else if (config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK) {
    auto fallback = kineto_ctx_ptr->fallback_;
    TORCH_INTERNAL_ASSERT(fallback != nullptr);
    torch::profiler::impl::privateuse1Stubs()->record(
        nullptr, &fallback->device_event_end_, nullptr);
  }

  if (fn.scope() == at::RecordScope::USER_SCOPE) {
    torch::profiler::impl::kineto::popUserCorrelationId();
  } else {
    torch::profiler::impl::kineto::popCorrelationId();
  }
}

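// Registers the enter/exit callbacks above with RecordFunction, either
// globally (on-demand profiling) or for the current thread only.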
template <bool use_global_callback = false>
void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
  auto registration_state_ptr =
      KinetoThreadLocalState::get(use_global_callback);
  TORCH_INTERNAL_ASSERT(registration_state_ptr, "Expected profiler state set");
  auto recordFunctionCallback =
      at::RecordFunctionCallback(
          onFunctionEnter<use_global_callback>,
          onFunctionExit<use_global_callback>)
          .needsInputs(registration_state_ptr->config().report_input_shapes)
          .scopes(scopes);

  if constexpr (use_global_callback) {
    registration_state_ptr->setCallbackHandle(
        at::addGlobalCallback(recordFunctionCallback));
  } else {
    registration_state_ptr->setCallbackHandle(
        at::addThreadLocalCallback(recordFunctionCallback));
  }
}

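// Snapshot of the main thread's profiler state and scopes, published so that
// child threads can join the active profiling session (see
// enableProfilerInChildThread below).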
struct ProfilerStateInfo {
  std::shared_ptr<KinetoThreadLocalState> state_ptr;
  std::unordered_set<at::RecordScope> scopes;
};
std::shared_ptr<ProfilerStateInfo> profiler_state_info_ptr{nullptr};

} // namespace

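// Allows out-of-tree backends to inject externally timed events into the
// active profile. A minimal illustrative call (the event and backend names
// are hypothetical, and the scope choice is just an example):
//
//   reportBackendEventToActiveKinetoProfiler(
//       /*start_time_us=*/start_us,
//       /*end_time_us=*/end_us,
//       /*debug_handle=*/-1,
//       at::RecordScope::FUNCTION,
//       /*event_name=*/"my_delegate::conv2d",
//       /*backend_name=*/"my_backend");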
void reportBackendEventToActiveKinetoProfiler(
    const int64_t start_time_us,
    const int64_t end_time_us,
    const int64_t debug_handle,
    const at::RecordScope scope,
    const std::string& event_name,
    const std::string& backend_name) {
  TORCH_INTERNAL_ASSERT(
      KinetoThreadLocalState::get(/*global=*/true) == nullptr,
      "On-demand profiling does not support post processing callback");

  auto state_ptr = KinetoThreadLocalState::get(/*global=*/false);
  if (!state_ptr) {
    return;
  }

  state_ptr->recordQueue.getSubqueue()->emplace_backend_event(
      start_time_us,
      end_time_us,
      debug_handle,
      scope,
      event_name,
      backend_name);

  /* no support for input shapes now?
  if (config.report_input_shapes) {
    ctx_ptr->shapes = inputSizes(fn);
    ctx_ptr->dtypes = inputTypes(fn);
  }
  */
}

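// Warms up the tracing backends (e.g. CUPTI) before the profiler actually
// starts, so that the first profiled iteration is not skewed by lazy
// initialization.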
void prepareProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities) {
  if (config.state == ProfilerState::NVTX ||
      config.state == ProfilerState::ITT) {
    return;
  }
  TORCH_CHECK(
      config.state == ProfilerState::KINETO ||
          config.state == ProfilerState::KINETO_GPU_FALLBACK ||
          config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK,
      "Supported only in Kineto profiler");
  torch::profiler::impl::kineto::prepareTrace(
      /*cpuOnly=*/!(
          at::hasCUDA() || at::hasXPU() || at::hasMTIA() ||
          c10::get_privateuse1_backend() != "privateuseone"),
      activities,
      config.experimental_config);

  if (!config.experimental_config.performance_events.empty()) {
    /* For now only CPU activity is supported */
    TORCH_CHECK(
        activities.count(torch::autograd::profiler::ActivityType::CPU),
        "Cannot run cpu hardware profiler without CPU activities, please only use CPU activity type");
    /*
     * Send a warning and pass the non-standard event to the backend; the
     * backend can abort if the event is not supported.
     * TODO: Should we gracefully drop the invalid event if we have at least
     * one valid one?
     */
    auto is_standard_event = [](const std::string& event) -> bool {
      for (auto e : torch::profiler::ProfilerPerfEvents) {
        if (!std::strcmp(event.c_str(), e)) {
          return true;
        }
      }
      return false;
    };

    for (const auto& e : config.experimental_config.performance_events) {
      if (!is_standard_event(e)) {
        TORCH_WARN("Forwarding a non-standard CPU performance event: ", e);
      }
    }
  }
}

static void toggleTorchOpCollectionDynamic(bool enable) {
  auto state_ptr = ProfilerStateBase::get();
  if (state_ptr) {
    const auto& config = state_ptr->config();
    if (enable) {
      auto scopes = profiler_state_info_ptr->scopes;
      config.global() ? pushProfilingCallbacks</*global=*/true>(scopes)
                      : pushProfilingCallbacks</*global=*/false>(scopes);
    } else {
      state_ptr->removeCallback();
    }
  }
}

// Mark this function as unused: the profiler implementation needs more
// refactoring before dynamic toggling of Python op collection can be
// supported.
#ifdef _MSC_VER
#define UNUSED
#else
#define UNUSED __attribute__((unused))
#endif
static UNUSED void togglePythonCollectionDynamic(bool enable) {
  auto state_ptr = ProfilerStateBase::get();
  if (state_ptr) {
    auto global = state_ptr->config().global();
    KinetoThreadLocalState* kineto_thread_local_state_ptr =
        KinetoThreadLocalState::get(global);
    if (enable) {
      kineto_thread_local_state_ptr->resumePython();
    } else {
      kineto_thread_local_state_ptr->pausePython();
    }
  }
}

static void toggleCPUCollectionDynamic(bool enable) {
  toggleTorchOpCollectionDynamic(enable);
  // For now we only support dynamic toggling of Torch op collection. Toggling
  // Python ops would require string parsing to strip out the toggling events
  // (and any other unfinished events), as well as changes to the stack logic.
  // togglePythonCollectionDynamic(enable);
}

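// Pauses or resumes collection for the given activities while the profiler
// stays attached; CUDA toggling is delegated to Kineto.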
void toggleCollectionDynamic(
    const bool enable,
    const std::set<torch::profiler::impl::ActivityType>& activities) {
  if (activities.count(torch::autograd::profiler::ActivityType::CPU) > 0 &&
      activities.count(torch::autograd::profiler::ActivityType::CUDA) == 0) {
    LOG(WARNING)
        << "Toggling CPU activity with CUDA activity on may result in traces with CUDA events on arbitrary tracks";
  }
  for (auto act : activities) {
    if (act == torch::autograd::profiler::ActivityType::CUDA) {
      torch::profiler::impl::kineto::toggleCollectionDynamic(enable);
    } else if (act == torch::autograd::profiler::ActivityType::CPU) {
      toggleCPUCollectionDynamic(enable);
    } else {
      LOG(WARNING)
          << "Dynamic toggle is only supported for CPU/GPU activity, skipping toggling of "
          << actToString(act);
      continue;
    }
  }
}

void enableProfilerWithEventPostProcess(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    post_process_t&& cb,
    const std::unordered_set<at::RecordScope>& scopes) {
  TORCH_CHECK(
      config.state != ProfilerState::NVTX,
      "NVTX does not support post processing callback.");
  TORCH_CHECK(
      config.state != ProfilerState::ITT,
      "ITT does not support post processing callback.");
  TORCH_INTERNAL_ASSERT(
      KinetoThreadLocalState::get(/*global=*/true) == nullptr,
      "On-demand profiling does not support post processing callback");

  enableProfiler(config, activities, scopes);
  auto state_ptr = KinetoThreadLocalState::get(config.global());
  state_ptr->setEventPostProcessingCallback(std::move(cb));
}

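// Typical C++ lifecycle, as a minimal illustrative sketch (the ProfilerConfig
// construction and scope set below are assumptions for the example; the usual
// entry point is torch.profiler on the Python side):
//
//   using namespace torch::autograd::profiler;
//   torch::profiler::impl::ProfilerConfig config(ProfilerState::KINETO);
//   std::set<torch::profiler::impl::ActivityType> acts{
//       torch::profiler::impl::ActivityType::CPU};
//   prepareProfiler(config, acts);
//   enableProfiler(config, acts, {at::RecordScope::FUNCTION});
//   // ... run the workload to be profiled ...
//   auto result = disableProfiler();
//   result->save("trace.json");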
void enableProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    const std::unordered_set<at::RecordScope>& scopes) {
  const auto has_cpu = activities.count(ActivityType::CPU);
  TORCH_CHECK(
      KinetoThreadLocalState::get(/*global=*/config.global()) == nullptr,
      "Profiler is already enabled",
      (config.global() ? "." : " on this thread."));

  if (config.state == ProfilerState::NVTX) {
    torch::profiler::impl::pushNVTXCallbacks(config, scopes);
    return;
  } else if (config.state == ProfilerState::ITT) {
    torch::profiler::impl::pushITTCallbacks(config, scopes);
    return;
  } else if (config.state == ProfilerState::PRIVATEUSE1) {
    torch::profiler::impl::pushPRIVATEUSE1CallbacksStub(config, scopes);
    return;
  }

  TORCH_CHECK(
      config.state == ProfilerState::KINETO ||
      config.state == ProfilerState::KINETO_GPU_FALLBACK ||
      config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK ||
      config.global());
  TORCH_CHECK(!activities.empty(), "No activities specified.");
  TORCH_INTERNAL_ASSERT(
      has_cpu || !config.global(),
      "Ondemand profiling must enable CPU tracing");

  auto state_ptr = std::make_shared<KinetoThreadLocalState>(config, activities);
  KinetoThreadLocalState::push(state_ptr);

  if (has_cpu) {
    config.global() ? pushProfilingCallbacks</*global=*/true>(scopes)
                    : pushProfilingCallbacks</*global=*/false>(scopes);
  }

  if (!config.global()) {
    torch::profiler::impl::kineto::startTrace();
  }

  if (has_cpu) {
    auto state_info_ptr = std::make_shared<ProfilerStateInfo>();
    state_info_ptr->state_ptr = state_ptr;
    state_info_ptr->scopes = scopes;
    profiler_state_info_ptr = state_info_ptr;
  }
}

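// Child threads spawned after profiling starts do not inherit the main
// thread's callbacks; they can opt in explicitly. A minimal illustrative
// sketch of a worker thread's body:
//
//   if (isProfilerEnabledInMainThread()) {
//     enableProfilerInChildThread();
//   }
//   // ... run work to be profiled ...
//   disableProfilerInChildThread();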
bool isProfilerEnabledInMainThread() {
  return profiler_state_info_ptr != nullptr;
}

void enableProfilerInChildThread() {
  auto state_info_ptr = profiler_state_info_ptr;
  TORCH_CHECK(state_info_ptr, "Profiler is not enabled in main thread.");
  TORCH_CHECK(
      KinetoThreadLocalState::get(/*global=*/false) == nullptr,
      "Profiler is already enabled in this thread.");

  KinetoThreadLocalState::push(state_info_ptr->state_ptr);
  pushProfilingCallbacks</*global=*/false>(state_info_ptr->scopes);
}

void disableProfilerInChildThread() {
  auto state_ptr = ProfilerStateBase::pop();
  TORCH_CHECK(
      state_ptr,
      "Can't disable Kineto profiler when it's not running in this thread");
  state_ptr->removeCallback();
}

std::unique_ptr<ProfilerResult> disableProfiler() {
  // Release the state info to inform child threads to stop profiling.
  profiler_state_info_ptr = nullptr;

  auto state_ptr = ProfilerStateBase::pop();
  TORCH_CHECK(
      state_ptr, "Can't disable Kineto profiler when it's not running");
  const auto& config = state_ptr->config();
  TORCH_CHECK(
      config.state == ProfilerState::KINETO ||
          config.state == ProfilerState::KINETO_GPU_FALLBACK ||
          config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK ||
          config.state == ProfilerState::KINETO_ONDEMAND ||
          config.state == ProfilerState::NVTX ||
          config.state == ProfilerState::ITT ||
          config.state == ProfilerState::PRIVATEUSE1,
      "Can't disable Kineto profiler when it's not running");

  state_ptr->removeCallback();

  // Traces are converged via libkineto automatically for the on-demand flow.
  if (state_ptr->config().global()) {
    (void)std::static_pointer_cast<KinetoThreadLocalState>(state_ptr)
        ->finalizeTrace();
    return std::make_unique<ProfilerResult>();
  }

  // Shared among NVTX, PRIVATEUSE1, KINETO, KINETO_GPU_FALLBACK,
  // KINETO_PRIVATEUSE1_FALLBACK
  std::unique_ptr<ProfilerResult> result;
  if (state_ptr->config().state == ProfilerState::NVTX ||
      state_ptr->config().state == ProfilerState::PRIVATEUSE1) {
    result = std::make_unique<ProfilerResult>();
  }

  if (config.state == ProfilerState::KINETO ||
      config.state == ProfilerState::KINETO_GPU_FALLBACK ||
      config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK) {
    auto kineto_state_ptr =
        std::static_pointer_cast<KinetoThreadLocalState>(state_ptr);
    auto trace = kineto_state_ptr->finalizeTrace();
    result = std::make_unique<ProfilerResult>(
        kineto_state_ptr->startTime,
        std::move(kineto_state_ptr->kinetoEvents),
        std::move(trace),
        std::move(kineto_state_ptr->eventTree));
  }

  return result;
}

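// KinetoEvent is a thin, user-facing wrapper over a profiler `Result`. Most
// accessors forward to the underlying Result; argument data and the Python
// stack are parsed eagerly in the constructor.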
KinetoEvent::KinetoEvent(
    const std::shared_ptr<const torch::profiler::impl::Result>& result,
    const bool verbose)
    : result_{result} {
  TORCH_INTERNAL_ASSERT(result != nullptr);

  if (verbose) {
    // Populate Python stack
    auto parent = result_->parent_.lock();
    while (parent != nullptr) {
      parent->visit_if_base<PyExtraFieldsBase>(
          [&](const auto&) { python_stack_.push_back(parent->name()); });
      parent = parent->parent_.lock();
    }
  }

  result->visit_if_base<ExtraFields<EventType::TorchOp>>([&](const auto& op) {
    auto arg_data = parseArgData(op.inputs_, op.concrete_inputs_);
    shapes_ = std::move(arg_data.shapesForKinetoEvent);
    dtypes_ = std::move(arg_data.dtypes);
    concrete_inputs_ = std::move(arg_data.concreteInputs);
    kwinputs_ = std::move(op.kwinputs_);
  });
}

bool KinetoEvent::isPythonFunction() const {
  bool out{false};
  result_->visit_if_base<PyExtraFieldsBase>([&](const auto&) { out = true; });
  return out;
}

bool KinetoEvent::hasShapes() const {
  return !shapes_.empty();
}

const c10::ArrayRef<std::vector<int64_t>> KinetoEvent::shapes() const {
  return shapes_;
}

bool KinetoEvent::hasTypes() const {
  return !dtypes_.empty();
}

const c10::ArrayRef<std::string> KinetoEvent::dtypes() const {
  return dtypes_;
}

bool KinetoEvent::hasConcreteInputs() const {
  return !concrete_inputs_.empty();
}

const c10::ArrayRef<c10::IValue> KinetoEvent::concreteInputs() const {
  return concrete_inputs_;
}

bool KinetoEvent::hasKwinputs() const {
  return !kwinputs_.empty();
}

const std::unordered_map<std::string, c10::IValue> KinetoEvent::kwinputs()
    const {
  return kwinputs_;
}

const c10::ArrayRef<std::string> KinetoEvent::stack() const {
  auto get = [&](const auto& i) -> auto& {
    return !i.jit_stack_.empty() ? i.jit_stack_ : python_stack_;
  };

  auto const& extra_fields = result_->extra_fields_;
  if (auto p = std::get_if<ExtraFields<EventType::TorchOp>>(&extra_fields)) {
    return get(*p);
  }
  if (auto p = std::get_if<ExtraFields<EventType::Backend>>(&extra_fields)) {
    return get(*p);
  }
  return python_stack_;
}

const c10::ArrayRef<std::string> KinetoEvent::moduleHierarchy() const {
  auto const& extra_fields = result_->extra_fields_;
  if (auto p = std::get_if<ExtraFields<EventType::TorchOp>>(&extra_fields)) {
    return p->jit_modules_;
  }
  if (auto p = std::get_if<ExtraFields<EventType::Backend>>(&extra_fields)) {
    return p->jit_modules_;
  }
  return {};
}

uint64_t KinetoEvent::endNs() const {
  return result_->endTimeNS();
}

uint64_t KinetoEvent::durationNs() const {
  return (result_->endTimeNS() - result_->start_time_ns_);
}

int64_t KinetoEvent::debugHandle() const {
  return result_->visit(c10::overloaded(
      [](const ExtraFields<EventType::TorchOp>& i) { return i.debug_handle_; },
      [](const ExtraFields<EventType::Backend>& i) { return i.debug_handle_; },
      [](const auto&) -> int64_t { return -1; }));
}

int KinetoEvent::deviceIndex() const {
  return result_->visit(c10::overloaded(
      [](const ExtraFields<EventType::Allocation>& i) {
        return static_cast<int>(i.device_index_);
      },
      [](const ExtraFields<EventType::OutOfMemory>& i) {
        return static_cast<int>(i.device_index_);
      },
      [&](const auto&) {
        return static_cast<int>(result_->kineto_info_.device);
      }));
}

bool KinetoEvent::hasStack() const {
  return !stack().empty();
}

int64_t KinetoEvent::cudaElapsedUs() const {
  auto cuda_event_start = fallbackStart();
  auto cuda_event_end = fallbackEnd();
  if (!cuda_event_start || !cuda_event_end) {
    return -1;
  }
  try {
    return (int64_t)torch::profiler::impl::cudaStubs()->elapsed(
        &cuda_event_start, &cuda_event_end);
  } catch (std::exception& e) {
    LOG(WARNING) << "Failed to measure time between two CUDA events. "
                 << e.what();
  }
  return -1;
}

int64_t KinetoEvent::privateuse1ElapsedUs() const {
  auto privateuse1_event_start = fallbackStart();
  auto privateuse1_event_end = fallbackEnd();
  if (!privateuse1_event_start || !privateuse1_event_end) {
    return -1;
  }
  return (int64_t)torch::profiler::impl::privateuse1Stubs()->elapsed(
      &privateuse1_event_start, &privateuse1_event_end);
}

void KinetoEvent::getPerfEventCounters(std::vector<uint64_t>& in) const {
  return result_->visit(c10::overloaded(
      [&in](const ExtraFields<EventType::TorchOp>& e) -> void {
        const size_t n = e.perf_event_counters_->size();
        // should be rare
        if (in.size() < n) {
          in.resize(n, 0);
        }
        for (size_t i = 0; i < n; ++i) {
          in[i] = (*e.perf_event_counters_)[i];
        }
      },
      [](const auto&) -> void { return; }));
}

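// Forwards a KinetoEvent accessor to a field or method on the underlying
// `Result`. For example, FORWARD_FROM_RESULT(startNs, start_time_ns_) expands
// (roughly) to:
//   uint64_t KinetoEvent::startNs() const {
//     return static_cast<uint64_t>(result_->start_time_ns_);
//   }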
#define FORWARD_FROM_RESULT(method_name, result_expr)                        \
  decltype(std::declval<KinetoEvent>().method_name())                        \
  KinetoEvent::method_name() const {                                         \
    return static_cast<decltype(std::declval<KinetoEvent>().method_name())>( \
        result_->result_expr);                                               \
  }

FORWARD_FROM_RESULT(startThreadId, start_tid_)
FORWARD_FROM_RESULT(endThreadId, endTID())
FORWARD_FROM_RESULT(activityType, kinetoType())
FORWARD_FROM_RESULT(name, name())
FORWARD_FROM_RESULT(deviceType, deviceType())
FORWARD_FROM_RESULT(startNs, start_time_ns_)
FORWARD_FROM_RESULT(correlationId, correlationID())
FORWARD_FROM_RESULT(deviceResourceId, kineto_info_.resource)
#undef FORWARD_FROM_RESULT

// Most of the fields in `KinetoEvent` only make sense for a single event type.
// (Generally TorchOp.) For all other types they simply return the default
// value. This macro provides a succinct way of expressing this behavior.
#define TYPED_ATTR_WITH_DEFAULT(                                       \
    event_type, method_name, expression, default_value)                \
  decltype(std::declval<KinetoEvent>().method_name())                  \
  KinetoEvent::method_name() const {                                   \
    using out_t = decltype(std::declval<KinetoEvent>().method_name()); \
    return result_->visit(c10::overloaded(                             \
        [](const ExtraFields<EventType::event_type>& e) -> out_t {     \
          return expression;                                           \
        },                                                             \
        [](const auto&) -> out_t { return default_value; }));          \
  }

#define TYPED_ATTR(event_type, method_name, expression) \
  TYPED_ATTR_WITH_DEFAULT(event_type, method_name, expression, {})

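// For instance, TYPED_ATTR(TorchOp, isAsync, e.is_async_) defines an accessor
// that returns `e.is_async_` for torch-op events and a value-initialized
// default (false) for every other event type.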
TYPED_ATTR_WITH_DEFAULT(TorchOp, sequenceNr, e.sequence_number_, -1)
TYPED_ATTR(TorchOp, fwdThreadId, e.sequence_number_ >= 0 ? e.forward_tid_ : 0)
TYPED_ATTR(TorchOp, scope, static_cast<uint8_t>(e.scope_))
TYPED_ATTR(TorchOp, hasModuleHierarchy, !e.jit_modules_.empty())
TYPED_ATTR(TorchOp, isAsync, e.is_async_)
TYPED_ATTR(TorchOp, extraMeta, e.extra_meta_)
TYPED_ATTR(TorchOp, fallbackStart, e.device_fallback_.device_event_start_)
TYPED_ATTR(TorchOp, fallbackEnd, e.device_fallback_.device_event_end_)
TYPED_ATTR(
    TorchOp,
    flops,
    !e.extra_args_.empty()
        ? torch::profiler::impl::computeFlops(e.name_, e.extra_args_)
        : 0)
TYPED_ATTR(Backend, backend, e.backend_)
TYPED_ATTR(Allocation, nBytes, e.alloc_size_)
TYPED_ATTR(Kineto, linkedCorrelationId, [&]() {
  const auto linked = e.linked_activity_.lock();
  return linked ? linked->correlationID() : 0;
}())
#undef TYPED_ATTR
#undef TYPED_ATTR_WITH_DEFAULT

ProfilerResult::ProfilerResult(
    uint64_t start_time,
    std::vector<KinetoEvent> events,
    std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
        trace,
    std::vector<experimental_event_t>&& event_tree)
    : trace_start_ns_(start_time),
      events_(std::move(events)),
      trace_(std::move(trace)),
      event_tree_(std::move(event_tree)) {}
ProfilerResult::ProfilerResult() = default;
ProfilerResult::~ProfilerResult() = default;

void ProfilerResult::save(const std::string& path) {
  trace_->save(path);
}

} // namespace autograd::profiler

namespace profiler::impl {
void _reportVulkanEventToProfiler(vulkan_id_t id) {
  auto state_ptr = ::torch::autograd::profiler::KinetoThreadLocalState::get(
      /*global=*/false);
  if (state_ptr) {
    state_ptr->reportVulkanEventToProfiler(id);
  }
}
} // namespace profiler::impl

} // namespace torch