xref: /aosp_15_r20/external/pytorch/c10/cuda/CUDADeviceAssertion.h (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #pragma once
2 
3 #include <c10/cuda/CUDAException.h>
4 #include <c10/macros/Macros.h>
5 
6 namespace c10::cuda {
7 
8 #ifdef TORCH_USE_CUDA_DSA
9 // Copy string from `src` to `dst`
// Bounded string copy for device code: copies `src` into `dst`,
// writing at most `C10_CUDA_DSA_MAX_STR_LEN - 1` characters followed
// by a terminating NUL. Longer inputs are silently truncated.
static __device__ void dstrcpy(char* dst, const char* src) {
  for (int copied = 0; copied < C10_CUDA_DSA_MAX_STR_LEN - 1 && *src != '\0';
       ++copied) {
    *dst++ = *src++;
  }
  *dst = '\0';
}
19 
// Records a single device-side assertion failure into the shared
// assertions buffer that the host inspects (via UVM).
//
// Safe for concurrent callers: each thread claims a slot with an atomic
// increment of `assertion_count`. Note that the increment itself is what
// signals the host that a failure happened, so the host may begin
// responding before the slot contents below are fully written. When the
// buffer is full, further failures are dropped silently. `assertions_data`
// may be nullptr when device-side assertion checking is disabled at
// run time, in which case this is a no-op; if it is disabled at compile
// time this function is never called at all.
static __device__ void dsa_add_new_assertion_failure(
    DeviceAssertionsData* assertions_data,
    const char* assertion_msg,
    const char* filename,
    const char* function_name,
    const int line_number,
    const uint32_t caller,
    const dim3 block_id,
    const dim3 thread_id) {
  // Run-time disabled: nothing to record.
  if (assertions_data == nullptr) {
    return;
  }

  // Claim a slot atomically so multiple threads can fail at the same
  // time. The host can observe the bumped count before the slot below
  // has been populated.
  const auto slot = atomicAdd(&(assertions_data->assertion_count), 1);

  if (slot >= C10_CUDA_DSA_ASSERTION_COUNT) {
    // We've run out of assertion buffer space. Printing here would get
    // spammy if many threads hit it, so extra failures are dropped
    // silently; in most cases they'd be analogous to the ones already
    // recorded anyway.
    return;
  }

  // Populate the claimed slot. This happens only after the
  // `assertion_count` increment has broadcast that there's a problem.
  auto& record = assertions_data->assertions[slot];
  dstrcpy(record.assertion_msg, assertion_msg);
  dstrcpy(record.filename, filename);
  dstrcpy(record.function_name, function_name);
  record.line_number = line_number;
  record.caller = caller;
  record.block_id[0] = block_id.x;
  record.block_id[1] = block_id.y;
  record.block_id[2] = block_id.z;
  record.thread_id[0] = thread_id.x;
  record.thread_id[1] = thread_id.y;
  record.thread_id[2] = thread_id.z;
}
67 
// Emulates a kernel assertion. The assertion won't stop the kernel's progress,
// so you should assume everything the kernel produces is garbage if there's an
// assertion failure.
// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
//       arguments of the kernel and therefore accessible.
#define CUDA_KERNEL_ASSERT2(condition)                                   \
  do {                                                                   \
    if (C10_UNLIKELY(!(condition))) {                                    \
      /* Has an atomic element so threads can fail at the same time */   \
      c10::cuda::dsa_add_new_assertion_failure(                          \
          assertions_data,                                               \
          C10_STRINGIZE(condition),                                      \
          __FILE__,                                                      \
          __FUNCTION__,                                                  \
          __LINE__,                                                      \
          assertion_caller_id,                                           \
          blockIdx,                                                      \
          threadIdx);                                                    \
      /* Now that the kernel has failed we early exit the kernel, but */ \
      /* otherwise keep going and rely on the host to check UVM and */   \
      /* determine we've had a problem */                                \
      return;                                                            \
    }                                                                    \
  } while (false)
#else
/* Device-side assertion machinery disabled at compile time: fall back */
/* to the standard CUDA device-side assert. */
#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
#endif
95 
96 } // namespace c10::cuda
97