1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_TF_ALLOCATOR_ADAPTER_H_ 17 #define TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_TF_ALLOCATOR_ADAPTER_H_ 18 19 #include <memory> 20 #include <utility> 21 #include <vector> 22 23 #include "tensorflow/compiler/xla/stream_executor/device_memory.h" 24 #include "tensorflow/compiler/xla/stream_executor/device_memory_allocator.h" 25 #include "tensorflow/compiler/xla/stream_executor/lib/statusor.h" 26 #include "tensorflow/compiler/xla/stream_executor/platform.h" 27 #include "tensorflow/compiler/xla/stream_executor/stream.h" 28 #include "tensorflow/compiler/xla/stream_executor/stream_executor.h" 29 #include "tensorflow/core/framework/allocator.h" 30 31 namespace stream_executor { 32 33 // Adapter class that wraps a Tensorflow allocator. 34 // 35 // Assumes that the Tensorflow allocator permits asynchronous deallocation: 36 // see comment on `AllowsAsynchronousDeallocation()`. 37 class TfAllocatorAdapter : public DeviceMemoryAllocator { 38 public: 39 // stream: a Stream on which the allocator can only be used. If non-null, the 40 // allocator can not be used on any other stream. 41 TfAllocatorAdapter(tensorflow::Allocator *wrapped, Stream *stream); 42 43 // Constructor for the cases where `stream` can not be provided. 44 TfAllocatorAdapter(tensorflow::Allocator *wrapped, Platform *platform); 45 46 ~TfAllocatorAdapter() override; 47 48 port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64_t size, 49 bool retry_on_failure, 50 int64_t memory_space) override; 51 52 port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) override; 53 54 // The Tensorflow BFC allocator used on GPU allows host-side deallocation 55 // before GPU execution takes place. Tensorflow uses the ordering of the main 56 // compute stream to enforce a happens-before relationship between a memory 57 // allocation and code that reuses the same memory. If Tensorflow adds 58 // support for multiple GPU streams or allocators with different ordering 59 // requirements, this code may need to change. 60 // (This attribute has no effect on CPU.) AllowsAsynchronousDeallocation()61 bool AllowsAsynchronousDeallocation() const override { return true; } 62 63 port::StatusOr<Stream *> GetStream(int device_ordinal) override; 64 65 private: 66 tensorflow::Allocator *wrapped_; 67 Stream *stream_; 68 }; 69 70 // Adapter class that wraps per-device TF allocators with corresponding streams 71 // as a TfAllocatorAdapter. Assumes that the Tensorflow allocator permits 72 // asynchronous deallocation; see comment on `AllowsAsynchronousDeallocation()`. 73 class MultiDeviceAdapter : public DeviceMemoryAllocator { 74 public: 75 using AllocatorWithStream = 76 std::pair<std::unique_ptr<tensorflow::Allocator>, Stream *>; MultiDeviceAdapter(const Platform * platform,std::vector<AllocatorWithStream> tf_allocators)77 MultiDeviceAdapter(const Platform *platform, 78 std::vector<AllocatorWithStream> tf_allocators) 79 : DeviceMemoryAllocator(platform) { 80 tf_allocators_.reserve(tf_allocators.size()); 81 for (AllocatorWithStream &p : tf_allocators) { 82 int device_ordinal = p.second->parent()->device_ordinal(); 83 if (per_device_allocators_.size() <= device_ordinal) { 84 per_device_allocators_.resize(device_ordinal + 1); 85 } 86 CHECK(!per_device_allocators_[device_ordinal]); 87 per_device_allocators_[device_ordinal] = 88 std::make_unique<TfAllocatorAdapter>(p.first.get(), p.second); 89 tf_allocators_.push_back(std::move(p.first)); 90 } 91 } 92 Allocate(int device_ordinal,uint64_t size,bool retry_on_failure,int64_t memory_space)93 port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64_t size, 94 bool retry_on_failure, 95 int64_t memory_space) override { 96 CHECK_LT(device_ordinal, per_device_allocators_.size()); 97 return per_device_allocators_[device_ordinal]->Allocate( 98 device_ordinal, size, retry_on_failure, memory_space); 99 } 100 Deallocate(int device_ordinal,DeviceMemoryBase mem)101 port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) override { 102 CHECK_LT(device_ordinal, per_device_allocators_.size()); 103 return per_device_allocators_[device_ordinal]->Deallocate(device_ordinal, 104 mem); 105 } 106 107 // The Tensorflow BFC allocator used on GPU allows host-side deallocation 108 // before GPU execution takes place. Tensorflow uses the ordering of the main 109 // compute stream to enforce a happens-before relationship between a memory 110 // allocation and code that reuses the same memory. If Tensorflow adds 111 // support for multiple GPU streams or allocators with different ordering 112 // requirements, this code may need to change. 113 // (This attribute has no effect on CPU.) AllowsAsynchronousDeallocation()114 bool AllowsAsynchronousDeallocation() const override { return true; } 115 GetStream(int device_ordinal)116 port::StatusOr<Stream *> GetStream(int device_ordinal) override { 117 return per_device_allocators_[device_ordinal]->GetStream(device_ordinal); 118 } 119 120 private: 121 std::vector<std::unique_ptr<TfAllocatorAdapter>> per_device_allocators_; 122 // The wrapped TF allocators backing per_device_allocators_ 123 // (TfAllocatorAdapter does not take ownership of its underlying Allocator). 124 std::vector<std::unique_ptr<tensorflow::Allocator>> tf_allocators_; 125 }; 126 127 } // namespace stream_executor 128 129 #endif // TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_TF_ALLOCATOR_ADAPTER_H_ 130