xref: /aosp_15_r20/external/tensorflow/tensorflow/core/platform/denormal.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/platform/denormal.h"
17 
18 #include "tensorflow/core/platform/cpu_info.h"
19 #include "tensorflow/core/platform/platform.h"
20 
// GCC 4.8 and older has a known bug that prevents the use of intrinsics when
// the target architecture is not enabled through compiler flags. See
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57202
// The check is skipped for clang and whenever __SSE3__ is already defined.
#if !defined(__SSE3__) && !defined(__clang__)
#if (defined(__GNUC__) && (__GNUC__ < 4)) || \
    ((__GNUC__ == 4) && (__GNUC_MINOR__ < 9))
#define GCC_WITHOUT_INTRINSICS
#endif
#endif
// SSE3 intrinsics are attempted only for x86 builds that are not mobile and
// that were not flagged with GCC_WITHOUT_INTRINSICS (a known bad gcc version).
#if defined(PLATFORM_IS_X86)
#if !defined(IS_MOBILE_PLATFORM) && !defined(GCC_WITHOUT_INTRINSICS)
#define X86_DENORM_USE_INTRINSICS
#endif
#endif
35 
36 #ifdef X86_DENORM_USE_INTRINSICS
37 #include <pmmintrin.h>
38 #endif
39 
// On ARM, the floating-point control register is accessed only when hardware
// floating-point support is available (__ARM_FP defined and greater than 0).
#if defined(PLATFORM_IS_ARM)
#if defined(__ARM_FP) && (__ARM_FP > 0)
#define ARM_DENORM_AVAILABLE
// Flush-to-zero bit (bit 24) of the ARM floating-point control register.
#define ARM_FPCR_FZ (1 << 24)
#endif
#endif
47 
48 namespace tensorflow {
49 namespace port {
50 
operator ==(const DenormalState & other) const51 bool DenormalState::operator==(const DenormalState& other) const {
52   return flush_to_zero() == other.flush_to_zero() &&
53          denormals_are_zero() == other.denormals_are_zero();
54 }
55 
operator !=(const DenormalState & other) const56 bool DenormalState::operator!=(const DenormalState& other) const {
57   return !(this->operator==(other));
58 }
59 
60 #ifdef ARM_DENORM_AVAILABLE
// The ARM ACLE specifies __arm_rsr/__arm_wsr for reading and writing the
// status registers, but GCC does not implement them, so inline assembly is
// used instead.
//
// Writes `fpcr` to FPCR when PLATFORM_IS_ARM64 is defined and to FPSCR
// otherwise.
static inline void ArmSetFloatingPointControlRegister(uint32_t fpcr) {
#if defined(PLATFORM_IS_ARM64)
  // The value is widened to 64 bits before being passed as the msr operand.
  const uint64_t fpcr64 = static_cast<uint64_t>(fpcr);
  __asm__ __volatile__("msr fpcr, %[fpcr]" : : [fpcr] "r"(fpcr64));
#else
  __asm__ __volatile__("vmsr fpscr, %[fpcr]" : : [fpcr] "r"(fpcr));
#endif
}
73 
// Reads the floating-point control register (FPCR when PLATFORM_IS_ARM64 is
// defined, FPSCR otherwise) and returns it as a 32-bit value.
static inline uint32_t ArmGetFloatingPointControlRegister() {
#if defined(PLATFORM_IS_ARM64)
  // mrs writes a 64-bit operand; the result is narrowed to 32 bits.
  uint64_t raw;
  __asm__ __volatile__("mrs %[fpcr], fpcr" : [fpcr] "=r"(raw));
  return static_cast<uint32_t>(raw);
#else
  uint32_t raw;
  __asm__ __volatile__("vmrs %[fpcr], fpscr" : [fpcr] "=r"(raw));
  return raw;
#endif
}
85 #endif  // ARM_DENORM_AVAILABLE
86 
// Applies `state` to the floating-point control settings of the calling
// context.
//
// Returns true when the requested state was applied, and false when none of
// the compiled-in code paths could apply it.
bool SetDenormalState(const DenormalState& state) {
  // Denormal flushing is currently handled only for SSE 3 and ARM. Other
  // architectures can be added as needed.

#ifdef X86_DENORM_USE_INTRINSICS
  if (TestCPUFeature(SSE3)) {
    // Program the flush-to-zero mode first, then denormals-are-zero.
    if (state.flush_to_zero()) {
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    } else {
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
    }
    if (state.denormals_are_zero()) {
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    } else {
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
    }
    return true;
  }
#endif

#ifdef ARM_DENORM_AVAILABLE
  // ARM exposes a single setting that covers both denormal inputs and
  // outputs, so only states whose two flags agree can be applied.
  if (state.flush_to_zero() == state.denormals_are_zero()) {
    const uint32_t current = ArmGetFloatingPointControlRegister();
    const uint32_t updated = state.flush_to_zero()
                                 ? (current | ARM_FPCR_FZ)
                                 : (current & ~ARM_FPCR_FZ);
    ArmSetFloatingPointControlRegister(updated);
    return true;
  }
#endif

  // The provided state cannot be applied on this build/CPU.
  return false;
}
120 
GetDenormalState()121 DenormalState GetDenormalState() {
122 #ifdef X86_DENORM_USE_INTRINSICS
123   if (TestCPUFeature(SSE3)) {
124     // Save existing flags
125     bool flush_zero_mode = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
126     bool denormals_zero_mode =
127         _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
128     return DenormalState(flush_zero_mode, denormals_zero_mode);
129   }
130 #endif
131 
132 #ifdef ARM_DENORM_AVAILABLE
133   uint32_t fpcr = ArmGetFloatingPointControlRegister();
134   if ((fpcr & ARM_FPCR_FZ) != 0) {
135     return DenormalState(true, true);
136   }
137 #endif
138 
139   return DenormalState(false, false);
140 }
141 
ScopedRestoreFlushDenormalState()142 ScopedRestoreFlushDenormalState::ScopedRestoreFlushDenormalState()
143     : denormal_state_(GetDenormalState()) {}
144 
~ScopedRestoreFlushDenormalState()145 ScopedRestoreFlushDenormalState::~ScopedRestoreFlushDenormalState() {
146   SetDenormalState(denormal_state_);
147 }
148 
ScopedFlushDenormal()149 ScopedFlushDenormal::ScopedFlushDenormal() {
150   SetDenormalState(
151       DenormalState(/*flush_to_zero=*/true, /*denormals_are_zero=*/true));
152 }
153 
ScopedDontFlushDenormal()154 ScopedDontFlushDenormal::ScopedDontFlushDenormal() {
155   SetDenormalState(
156       DenormalState(/*flush_to_zero=*/false, /*denormals_are_zero=*/false));
157 }
158 
159 }  // namespace port
160 }  // namespace tensorflow
161