#pragma once #include #include #include #if defined(__CUDA_ARCH__) #include #define C10_COMPAT_COPYSIGN c10::cuda::compat::copysign #elif defined(__HIPCC__) #include #define C10_COMPAT_COPYSIGN c10::hip::compat::copysign #else #include #define C10_COMPAT_COPYSIGN c10::copysign #endif // The functions in this file should be header-only as it is used under // ABI-compatibility mode. namespace c10 { // NOTE: [Floor Division in Python] // Python's __floordiv__ operator is more complicated than just floor(a / b). // It aims to maintain the property: a == (a // b) * b + remainder(a, b) // which can otherwise fail due to rounding errors in the remainder. // So, instead it is calculated as: a // b = (a - remainder(a, b)) / b // With some additional fix-ups added to the result. // // For reference, see CPython's implementation: // https://github.com/python/cpython/blob/ace008c531dd685a30c1dd68f9b5ba35f20171cf/Objects/floatobject.c#L636 template inline C10_HOST_DEVICE scalar_t div_floor_floating(scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ { if (C10_UNLIKELY(b == 0)) { // Divide by zero: return standard IEEE result return a / b; } auto mod = std::fmod(a, b); auto div = (a - mod) / b; if ((mod != 0) && (b < 0) != (mod < 0)) { div -= scalar_t(1); } scalar_t floordiv; if (div != 0) { floordiv = std::floor(div); if (div - floordiv > scalar_t(0.5)) { floordiv += scalar_t(1.0); } } else { floordiv = C10_COMPAT_COPYSIGN(scalar_t(0), a / b); } return floordiv; } template inline C10_HOST_DEVICE scalar_t div_floor_integer(scalar_t a, scalar_t b) { if (c10::signs_differ(a, b)) { // Subtracts one from the results of truncation division if the // divisor and dividend have different sign(bit)s and the remainder of // the division is nonzero const auto quot = a / b; const auto rem = a % b; return rem ? quot - 1 : quot; } return a / b; } } // namespace c10