xref: /aosp_15_r20/external/tensorflow/tensorflow/core/common_runtime/dynamic_device_mgr.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <atomic>
17 #include <iterator>
18 #include <memory>
19 #include <vector>
20 
21 #include "tensorflow/core/common_runtime/device_mgr.h"
22 #include "tensorflow/core/common_runtime/local_device.h"
23 #include "tensorflow/core/framework/device_attributes.pb.h"
24 #include "tensorflow/core/lib/core/errors.h"
25 #include "tensorflow/core/platform/logging.h"
26 #include "tensorflow/core/util/device_name_utils.h"
27 
28 namespace tensorflow {
29 
DynamicDeviceMgr()30 DynamicDeviceMgr::DynamicDeviceMgr() : cpu_device_(nullptr) {}
31 
DynamicDeviceMgr(std::vector<std::unique_ptr<Device>> devices)32 DynamicDeviceMgr::DynamicDeviceMgr(
33     std::vector<std::unique_ptr<Device>> devices) {
34   Status status = AddDevices(std::move(devices));
35   CHECK(status.ok());  // Crash OK
36   mutex_lock l(devices_mu_);
37   // Initialize cpu_device_.
38   for (int i = 0; i < dynamic_devices_.size(); ++i) {
39     auto* d = dynamic_devices_[i].get();
40     if (d->device_type() == DEVICE_CPU && d->parsed_name().id == 0) {
41       cpu_device_ = d;
42       break;
43     }
44   }
45 }
46 
~DynamicDeviceMgr()47 DynamicDeviceMgr::~DynamicDeviceMgr() {
48   // Release resources ahead of destroying the device manager as the resource
49   // destructors (e.g. ~IteratorResource) assume devices still exist.
50   mutex_lock l(devices_mu_);
51   for (const auto& d : dynamic_devices_) {
52     // TODO(tf-runtime-team): clear devices' resource mgr in devices'
53     // destructor.
54     d->ClearResourceMgr();
55   }
56 }
57 
ListDeviceAttributes(std::vector<DeviceAttributes> * devices) const58 void DynamicDeviceMgr::ListDeviceAttributes(
59     std::vector<DeviceAttributes>* devices) const {
60   tf_shared_lock l(devices_mu_);
61   devices->reserve(dynamic_devices_.size());
62   for (const auto& d : dynamic_devices_) {
63     devices->emplace_back(d->attributes());
64   }
65 }
66 
ListDevices() const67 std::vector<Device*> DynamicDeviceMgr::ListDevices() const {
68   tf_shared_lock l(devices_mu_);
69   std::vector<Device*> devices;
70   devices.reserve(dynamic_devices_.size());
71   for (const auto& d : dynamic_devices_) {
72     devices.emplace_back(d.get());
73   }
74   return devices;
75 }
76 
DebugString() const77 string DynamicDeviceMgr::DebugString() const {
78   string out;
79   tf_shared_lock l(devices_mu_);
80   for (const auto& d : dynamic_devices_) {
81     strings::StrAppend(&out, d->name(), "\n");
82   }
83   return out;
84 }
85 
DeviceMappingString() const86 string DynamicDeviceMgr::DeviceMappingString() const {
87   string out;
88   tf_shared_lock l(devices_mu_);
89   for (const auto& d : dynamic_devices_) {
90     if (!d->attributes().physical_device_desc().empty()) {
91       strings::StrAppend(&out, d->name(), " -> ",
92                          d->attributes().physical_device_desc(), "\n");
93     }
94   }
95   return out;
96 }
97 
LookupDevice(StringPiece name,Device ** device) const98 Status DynamicDeviceMgr::LookupDevice(StringPiece name, Device** device) const {
99   tf_shared_lock l(devices_mu_);
100   auto iter = device_map_.find(string(name));
101   if (iter == device_map_.end()) {
102     std::vector<StringPiece> device_names;
103     for (auto&& itr : device_map_) {
104       device_names.push_back(itr.first);
105     }
106     VLOG(1) << "Unknown device: " << name
107             << " all devices: " << absl::StrJoin(device_names, ", ");
108     return errors::InvalidArgument(name, " unknown device.");
109   }
110   *device = iter->second;
111   return OkStatus();
112 }
113 
ContainsDevice(int64_t device_incarnation) const114 bool DynamicDeviceMgr::ContainsDevice(int64_t device_incarnation) const {
115   tf_shared_lock l(devices_mu_);
116   return device_incarnation_set_.contains(device_incarnation);
117 }
118 
ClearContainers(gtl::ArraySlice<string> containers) const119 void DynamicDeviceMgr::ClearContainers(
120     gtl::ArraySlice<string> containers) const {
121   Status s;
122   tf_shared_lock l(devices_mu_);
123   for (const auto& d : dynamic_devices_) {
124     if (containers.empty()) {
125       s.Update(d->resource_manager()->Cleanup(
126           d->resource_manager()->default_container()));
127     } else {
128       for (const string& c : containers) {
129         s.Update(d->resource_manager()->Cleanup(c));
130       }
131     }
132     if (!s.ok()) {
133       LOG(WARNING) << s;
134     }
135   }
136 }
137 
NumDeviceType(const string & type) const138 int DynamicDeviceMgr::NumDeviceType(const string& type) const {
139   tf_shared_lock l(devices_mu_);
140   auto iter = device_type_counts_.find(type);
141   if (iter != device_type_counts_.end()) return iter->second;
142   return 0;
143 }
144 
AddDevices(std::vector<std::unique_ptr<Device>> devices)145 Status DynamicDeviceMgr::AddDevices(
146     std::vector<std::unique_ptr<Device>> devices) {
147   mutex_lock l(devices_mu_);
148   for (auto& d : devices) {
149     if (device_map_.find(d->name()) != device_map_.end()) {
150       return errors::InvalidArgument(
151           "Trying to add device ", d->name(),
152           " to manager but its name conflicts with an existing device.");
153     }
154     // Register under the (1) full name and (2) canonical name.
155     for (const string& name :
156          DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
157       device_map_[name] = d.get();
158     }
159     // Register under the (3) local name and (4) legacy local name.
160     for (const string& name :
161          DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
162       device_map_[name] = d.get();
163     }
164     device_type_counts_[d->device_type()]++;
165     device_incarnation_set_.insert(d->attributes().incarnation());
166     dynamic_devices_.push_back(std::move(d));
167   }
168   return OkStatus();
169 }
170 
RemoveDevices(const std::vector<Device * > & devices)171 Status DynamicDeviceMgr::RemoveDevices(const std::vector<Device*>& devices) {
172   mutex_lock l(devices_mu_);
173 
174   for (const auto& d : devices) {
175     if (d == cpu_device_) {
176       TF_RETURN_IF_ERROR(
177           errors::InvalidArgument("Can not remove HostCPU device ", d->name()));
178     }
179     int i = 0;
180     for (; i < dynamic_devices_.size(); ++i) {
181       if (d == dynamic_devices_[i].get()) break;
182     }
183     if (i >= dynamic_devices_.size()) {
184       return errors::InvalidArgument("Unknown device ", d->name());
185     }
186   }
187 
188   for (const auto& d : devices) {
189     // Clear registration of (1) full name and (2) canonical name
190     for (const string& name :
191          DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
192       device_map_.erase(name);
193     }
194     // Clear registration of (3) local name and (4) legacy local name
195     for (const string& name :
196          DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
197       device_map_.erase(name);
198     }
199     device_type_counts_[d->device_type()]--;
200     device_incarnation_set_.erase(d->attributes().incarnation());
201 
202     int i = 0;
203     for (; i < dynamic_devices_.size(); ++i) {
204       if (d == dynamic_devices_[i].get()) break;
205     }
206     // There shouldn't be unknown devices at this point.
207     CHECK(i < dynamic_devices_.size());  // Crash OK
208     stale_devices_.add(std::move(dynamic_devices_[i]));
209     dynamic_devices_.erase(dynamic_devices_.begin() + i);
210   }
211   return OkStatus();
212 }
213 
RemoveDevicesByName(const std::vector<string> & device_names)214 Status DynamicDeviceMgr::RemoveDevicesByName(
215     const std::vector<string>& device_names) {
216   std::vector<Device*> devices_to_remove;
217   for (const string& name : device_names) {
218     Device* device;
219     TF_RETURN_IF_ERROR(LookupDevice(name, &device));
220     devices_to_remove.emplace_back(device);
221   }
222   return RemoveDevices(devices_to_remove);
223 }
224 
HostCPU() const225 Device* DynamicDeviceMgr::HostCPU() const {
226   Device* device = cpu_device_.load(std::memory_order_relaxed);
227 
228   // Host CPU device can't be removed, so if we found valid device once, we
229   // do not need to check that it is still in the device list.
230   if (device != nullptr) return device;
231 
232   mutex_lock l(devices_mu_);
233   for (int i = 0; i < dynamic_devices_.size(); ++i) {
234     Device* d = dynamic_devices_[i].get();
235     if (d->device_type() == DEVICE_CPU && d->parsed_name().id == 0) {
236       cpu_device_ = d;
237       break;
238     }
239   }
240 
241   return cpu_device_.load(std::memory_order_relaxed);
242 }
243 
244 }  // namespace tensorflow
245