/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/process_state.h"

#include <atomic>
#include <cstring>
#include <vector>

#include "absl/base/call_once.h"
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/env_var.h"

namespace tensorflow {

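// Returns the process-wide ProcessState singleton. On first use it also
// registers the instance with AllocatorFactoryRegistry, so the registry can
// hand out allocators that are owned by this ProcessState.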
/*static*/ ProcessState* ProcessState::singleton() {
  static ProcessState* instance = new ProcessState;
  static absl::once_flag f;
  absl::call_once(f, []() {
    AllocatorFactoryRegistry::singleton()->process_state_ = instance;
  });

  return instance;
}

ProcessState::ProcessState()
    : numa_enabled_(false), cpu_allocators_cached_(0) {}

string ProcessState::MemDesc::DebugString() {
  return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index,
                         ", dma: ", gpu_registered, ", nic: ", nic_registered);
}

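// Returns the recorded memory descriptor for `ptr` when memory-type
// recording is enabled (FLAGS_brain_gpu_record_mem_types); otherwise
// returns a default-constructed MemDesc.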
ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
  if (FLAGS_brain_gpu_record_mem_types) {
    auto iter = mem_desc_map_.find(ptr);
    if (iter != mem_desc_map_.end()) {
      return iter->second;
    }
  }
  return MemDesc();
}

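// Returns the CPU Allocator for `numa_node`, creating it on first use.
// Fast path: a lock-free cache (`cpu_allocators_cache_`) indexed by NUMA
// node, guarded by an acquire load of `cpu_allocators_cached_`. Slow path:
// takes `mu_` and builds allocators up to and including `numa_node`.
//
// Example use (illustrative sketch, not part of this file):
//   Allocator* a = ProcessState::singleton()->GetCPUAllocator(/*numa_node=*/0);
//   void* buf = a->AllocateRaw(Allocator::kAllocatorAlignment, 1024);
//   a->DeallocateRaw(buf);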
Allocator* ProcessState::GetCPUAllocator(int numa_node) {
  if (!numa_enabled_ || numa_node == port::kNUMANoAffinity) numa_node = 0;

  // Check if allocator for the numa node is in lock-free cache.
  if (numa_node < cpu_allocators_cached_.load(std::memory_order_acquire)) {
    return cpu_allocators_cache_[numa_node];
  }

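  // Slow path: serialize construction and create any missing allocators,
  // one per NUMA node, up to and including `numa_node`.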
  mutex_lock lock(mu_);
  while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
    // If visitors have been defined we need an Allocator built from
    // a SubAllocator.  Prefer BFCAllocator, but fall back to PoolAllocator
    // depending on env var setting.
    const bool alloc_visitors_defined =
        (!cpu_alloc_visitors_.empty() || !cpu_free_visitors_.empty());
    bool use_bfc_allocator = false;
    Status status = ReadBoolFromEnvVar(
        "TF_CPU_ALLOCATOR_USE_BFC", alloc_visitors_defined, &use_bfc_allocator);
    if (!status.ok()) {
      LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
    }
    Allocator* allocator = nullptr;
    SubAllocator* sub_allocator =
        (numa_enabled_ || alloc_visitors_defined || use_bfc_allocator)
            ? new BasicCPUAllocator(
                  numa_enabled_ ? numa_node : port::kNUMANoAffinity,
                  cpu_alloc_visitors_, cpu_free_visitors_)
            : nullptr;
    if (use_bfc_allocator) {
      // TODO(reedwm): evaluate whether 64GB by default is the best choice.
      int64_t cpu_mem_limit_in_mb = -1;
      Status status = ReadInt64FromEnvVar("TF_CPU_BFC_MEM_LIMIT_IN_MB",
                                          1LL << 16 /*64GB max by default*/,
                                          &cpu_mem_limit_in_mb);
      if (!status.ok()) {
        LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
      }
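      // Convert the limit from MiB to bytes (1 MiB == 2^20 bytes); the
      // default of 1LL << 16 MiB works out to 64 GiB.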
      int64_t cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20);
      DCHECK(sub_allocator);

      BFCAllocator::Options allocator_opts;
      allocator_opts.allow_growth = true;
      allocator = new BFCAllocator(
          absl::WrapUnique(sub_allocator), cpu_mem_limit,
          /*name=*/"bfc_cpu_allocator_for_gpu", allocator_opts);

      VLOG(2) << "Using BFCAllocator with memory limit of "
              << cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator";
    } else if (sub_allocator) {
      allocator =
          new PoolAllocator(/*pool_size_limit=*/100, /*auto_resize=*/true,
                            sub_allocator, new NoopRounder, "cpu_pool");
      VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator "
              << "numa_enabled_=" << numa_enabled_
              << " numa_node=" << numa_node;
    } else {
      DCHECK(!sub_allocator);
      allocator = cpu_allocator_base();
    }
    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
      // Wrap the allocator to track allocation ids for better logging
      // at the cost of performance.
      allocator = new TrackingAllocator(allocator, true);
    }
    cpu_allocators_.push_back(allocator);
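    // Publish the new allocator to the lock-free cache. The release
    // increment pairs with the acquire load on the fast path above; the
    // fixed-size cache array is used there because the `cpu_allocators_`
    // vector may reallocate its storage while being grown under the lock.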
    if (cpu_allocators_.size() < cpu_allocators_cache_.max_size()) {
      cpu_allocators_cache_[cpu_allocators_.size() - 1] = allocator;
      cpu_allocators_cached_.fetch_add(1, std::memory_order_release);
    }
    if (!sub_allocator) {
      DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty());
    }
  }
  return cpu_allocators_[numa_node];
}

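// Registers a visitor invoked on each region the CPU SubAllocator
// allocates. It must be called before the first GetCPUAllocator() call,
// because visitors are baked into the SubAllocator at construction time.
//
// Illustrative registration (a sketch, not part of this file; the visitor
// signature is std::function<void(void*, int, size_t)>):
//   ProcessState::singleton()->AddCPUAllocVisitor(
//       [](void* ptr, int numa_node, size_t num_bytes) {
//         VLOG(1) << "alloc " << num_bytes << " bytes on node " << numa_node;
//       });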
void ProcessState::AddCPUAllocVisitor(SubAllocator::Visitor visitor) {
  VLOG(1) << "AddCPUAllocVisitor";
  mutex_lock lock(mu_);
  CHECK_EQ(0, cpu_allocators_.size())  // Crash OK
      << "AddCPUAllocVisitor must be called prior to first call to "
         "ProcessState::GetCPUAllocator";
  cpu_alloc_visitors_.push_back(std::move(visitor));
}

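// Registers a visitor invoked when the CPU SubAllocator frees a region.
// Like AddCPUAllocVisitor, it must precede the first GetCPUAllocator() call.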
149 
AddCPUFreeVisitor(SubAllocator::Visitor visitor)150 void ProcessState::AddCPUFreeVisitor(SubAllocator::Visitor visitor) {
151   mutex_lock lock(mu_);
152   CHECK_EQ(0, cpu_allocators_.size())  // Crash OK
153       << "AddCPUFreeVisitor must be called prior to first call to "
154          "ProcessState::GetCPUAllocator";
155   cpu_free_visitors_.push_back(std::move(visitor));
156 }
157 
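// Test-only: tears down the allocators created so far so tests can start
// from a clean slate. The static default CPU allocator is not owned by
// ProcessState and is deliberately left alive.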
void ProcessState::TestOnlyReset() {
  mutex_lock lock(mu_);
  // Don't delete this value because it's static.
  Allocator* default_cpu_allocator = cpu_allocator_base();
  mem_desc_map_.clear();
  for (Allocator* a : cpu_allocators_) {
    if (a != default_cpu_allocator) delete a;
  }
  cpu_allocators_.clear();
  for (Allocator* a : cpu_al_) {
    delete a;
  }
  cpu_al_.clear();
}

}  // namespace tensorflow