xref: /aosp_15_r20/external/tensorflow/tensorflow/compiler/jit/xla_launch_util.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Contains utilities for launching compiled XLA kernels for a KernelContext.
17 
18 #ifndef TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
19 #define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
20 
21 #include "tensorflow/compiler/jit/xla_compilation_cache.h"
22 #include "tensorflow/compiler/jit/xla_tensor.h"
23 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
24 #include "tensorflow/compiler/xla/client/local_client.h"
25 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
26 #include "tensorflow/core/framework/allocation_description.pb.h"
27 #include "tensorflow/core/framework/resource_var.h"
28 #include "tensorflow/core/framework/tensor.h"
29 #include "tensorflow/core/framework/types.h"
30 #include "tensorflow/core/lib/core/status.h"
31 #include "tensorflow/core/lib/gtl/array_slice.h"
32 #include "tensorflow/core/platform/thread_annotations.h"
33 #include "tensorflow/stream_executor/device_memory_allocator.h"
34 
35 namespace tensorflow {
36 
37 // Snapshot of resource variables for a TF kernel invocation, mapping from
38 // parameter number to values at execution time. If the resource variable is not
39 // initialized, the value will not be present.
40 using ResourceVarsSnapshot = absl::flat_hash_map<int, std::optional<Tensor>>;
41 
42 // Information about the state of a variable passed as input to the _XlaCompile
43 // and _XlaRun operators.  Unlocks the resource variable and decrements its
44 // refcount on destruction.
45 class VariableInfo {
46  public:
47   explicit VariableInfo(int index, absl::string_view name, Var* var,
48                         const std::optional<ManagedStackTrace>&
49                             definition_stack_trace = std::nullopt);
50   VariableInfo(VariableInfo&& other);
51 
52   VariableInfo& operator=(VariableInfo&& other);
53 
54   VariableInfo(const VariableInfo&) = delete;
55   VariableInfo& operator=(const VariableInfo&) = delete;
56 
57   // The index of the DT_RESOURCE input to the _XlaCompile/_XlaRun operator.
58   // Note that the indices can be different between _XlaCompile and _XlaRun.
index()59   int index() const { return index_; }
60 
61   // A pointer to the resource variable.  May be null if this VariableInfo is
62   // "empty", i.e. it does not track a resource variable.
var()63   Var* var() const { return var_; }
64 
65   // Returns the variable name.
name()66   absl::string_view name() const { return name_; }
67 
68   // Returns true if the resource variable lock was successfully acquired by
69   // this thread.
lock_held()70   bool lock_held() const { return lock_held_; }
set_lock_held()71   void set_lock_held() { lock_held_ = true; }
72 
definition_stack_trace()73   const std::optional<ManagedStackTrace>& definition_stack_trace() const {
74     return definition_stack_trace_;
75   }
76 
77   ~VariableInfo();
78 
79  private:
80   int index_;
81   std::string name_;
82   Var* var_;
83   std::optional<ManagedStackTrace> definition_stack_trace_;
84 
85   // We can't use a optional<mutex_lock> here because it confuses the compiler's
86   // thread safety analysis. Instead we use a boolean flag and release the lock
87   // in the VariableInfo destructor.
88   bool lock_held_ = false;
89 };
90 
91 // Creates a list of updated resource variables.
92 StatusOr<std::vector<VariableInfo>> GatherVariableInfo(
93     OpKernelContext* ctx,
94     const XlaCompiler::CompilationResult& compilation_result,
95     int missing_ctx_input_prefix);
96 
97 // Takes a snapshot of the values of resource variable arguments, whose indices
98 // are specified in `variable_indices` argument. We snapshot tensors that back
99 // resource variables since concurrent updates may modify the shape, and it is
100 // important that the shapes used for compilation match the true shapes of the
101 // buffers.
102 //
103 // We snapshot the entire set of resource variables as one atomic operation.
104 // This models Read->* dependencies between resource variable operations.  See
105 // jit/resource_operation_safety_analysis for details.
106 Status SnapshotResourceVariables(OpKernelContext* ctx,
107                                  absl::Span<const int> variable_indices,
108                                  absl::Span<VariableInfo const> variable_infos,
109                                  ResourceVarsSnapshot* result);
110 
111 // Acquires the mutexes for all the variables in `variables` using a
112 // deadlock-safe protocol (acquire the mutexes in increasing-address order).
113 //
114 // `variables` is allowed to contain instances that don't track a resource
115 // variable (i.e. variables[i].var() can be null for some i).
116 Status LockVariables(absl::Span<VariableInfo*> variables)
117     TF_EXCLUSIVE_LOCK_FUNCTION();
118 Status LockVariables(absl::Span<VariableInfo> variables)
119     TF_EXCLUSIVE_LOCK_FUNCTION();
120 
121 // Returns a vector of VariableInfo instances for the resource variable inputs,
122 // given that *all* inputs are in `inputs`. The input indices for the resource
123 // variable inputs are in `variable_indices`.
124 Status GetVariableInfosFromInputs(ResourceMgr* rm, DeviceBase* dev,
125                                   absl::Span<const Tensor* const> inputs,
126                                   absl::Span<const int> variable_indices,
127                                   std::vector<VariableInfo>* result);
128 
129 // Returns pointers to inputs stored in `ctx`.
130 std::vector<const Tensor*> InputsFromContext(OpKernelContext* ctx);
131 
132 // Helper class to perform the marshalling of TensorFlow inputs and outputs to
133 // ShapedBuffers suitable for passing to an XLA computation.
134 class XlaComputationLaunchContext {
135  public:
136   // Create a new launch context. 'allocate_xla_tensors' is true if allocated
137   // output tensors and variables are always XlaTensors. If false they are
138   // assumed to be "normal" device pointers.
139   // If 'use_multiple_streams' is true, tensors may be defined and used on
140   // multiple streams and so se::Events must be defined and waited for. If
141   // 'use_multiple_streams' is true, 'allocate_xla_tensors' must also be true
142   // because we track inter-stream dependencies through events inside XlaTensor
143   // objects.
144   XlaComputationLaunchContext(xla::LocalClient* client,
145                               se::DeviceMemoryAllocator* xla_allocator,
146                               int device_ordinal, bool allocate_xla_tensors,
147                               bool use_multiple_streams);
148 
149   // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch
150   // op.
151   // Precondition: variables in `variable_args` are locked.
152   static StatusOr<std::vector<XlaCompiler::Argument>> BuildXlaCompilerArguments(
153       absl::Span<int const> must_be_constant_idxs,
154       absl::Span<const Tensor* const> inputs,
155       absl::Span<VariableInfo const> variable_args, Device* device);
156 
157   // Add all inputs within `ctx` as XLA arguments (returned by arguments()).
158   // `variables` is a map from TensorFlow argument number to resource variable.
159   //
160   // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are
161   // missing and adjusts input indices accordingly.  All elements in kernel's
162   // input_mapping must be greater than or equal to `missing_ctx_input_prefix`
163   // (in other words, no inputs actually required by the kernel can be missing).
164   StatusOr<std::vector<xla::ExecutionInput>> PopulateInputs(
165       OpKernelContext* ctx,
166       const XlaCompiler::CompilationResult* compilation_result,
167       const std::map<int, const Tensor*>& resource_vars,
168       int missing_ctx_input_prefix,
169       const xla::HloInputOutputAliasConfig& input_output_alias);
170 
171   // Given the XLA output in `output`, populate all outputs of `ctx`.  Also
172   // writes out the resource variable updates.
173   //
174   // Updates to all resource variables are written in a single atomic operation.
175   // This models *->Write dependencies between resource variable operations.
176   // See jit/resource_operation_safety_analysis for details.
177   //
178   //
179   // Assumes that the first `missing_ctx_input_prefix` inputs to the
180   // compilation_result are missing and adjusts input indices accordingly.
181   Status PopulateOutputs(
182       OpKernelContext* ctx,
183       const XlaCompiler::CompilationResult* compilation_result,
184       xla::ScopedShapedBuffer output, int missing_ctx_input_prefix,
185       absl::Span<VariableInfo> variable_infos,
186       const xla::HloInputOutputAliasConfig& input_output_alias,
187       const std::map<int, const Tensor*>& resource_vars);
188 
189  private:
190   xla::LocalClient* client_;
191   se::DeviceMemoryAllocator* xla_allocator_;
192   bool allocate_xla_tensors_;
193   bool use_multiple_streams_;
194   int device_ordinal_;
195 };
196 
197 // A simple TensorBuffer implementation that allows us to create Tensors that
198 // take ownership of pre-allocated memory.
199 class XlaTensorBuffer : public TensorBuffer {
200  public:
XlaTensorBuffer(const void * ptr,size_t expected_size,size_t actual_size,Allocator * allocator)201   XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size,
202                   Allocator* allocator)
203       : TensorBuffer(const_cast<void*>(ptr)),
204         expected_size_(expected_size),
205         actual_size_(actual_size),
206         allocator_(allocator) {}
207 
~XlaTensorBuffer()208   ~XlaTensorBuffer() override {
209     if (data()) {
210       allocator_->DeallocateRaw(data());
211     }
212   }
213 
size()214   size_t size() const override { return expected_size_; }
215 
root_buffer()216   TensorBuffer* root_buffer() override { return this; }
217 
FillAllocationDescription(AllocationDescription * proto)218   void FillAllocationDescription(AllocationDescription* proto) const override {
219     proto->set_requested_bytes(static_cast<int64_t>(expected_size_));
220     proto->set_allocator_name(allocator_->Name());
221     proto->set_ptr(reinterpret_cast<uintptr_t>(data()));
222     if (allocator_->TracksAllocationSizes()) {
223       auto ab = static_cast<int64_t>(allocator_->AllocatedSize(data()));
224       proto->set_allocated_bytes(ab);
225       int64_t id = allocator_->AllocationId(data());
226       if (id > 0) {
227         proto->set_allocation_id(id);
228       }
229       if (RefCountIsOne()) {
230         proto->set_has_single_reference(true);
231       }
232     }
233   }
234 
235  private:
236   size_t expected_size_;
237   size_t actual_size_;
238   Allocator* allocator_;
239 };
240 
241 }  // namespace tensorflow
242 
243 #endif  // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
244