1 #pragma once 2 #include <c10/macros/Macros.h> 3 4 // Marks a lambda as executable on both the host and device. The __host__ 5 // attribute is important so that we can access static type information from 6 // the host, even if the function is typically only executed on the device. 7 #ifndef GPU_LAMBDA 8 #define GPU_LAMBDA __host__ __device__ 9 #endif 10 11 #if defined(USE_ROCM) num_threads()12constexpr int num_threads() { 13 return 256; 14 } 15 #else num_threads()16constexpr uint32_t num_threads() { 17 return C10_WARP_SIZE * 4; 18 } 19 #endif 20 thread_work_size()21constexpr int thread_work_size() { return 4; } block_work_size()22constexpr int block_work_size() { return thread_work_size() * num_threads(); } 23