xref: /aosp_15_r20/external/tensorflow/tensorflow/compiler/xla/service/cpu/cpu_runtime.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // This header declares functions which may be called by the generated code on
17 // the CPU. Calls to these functions must be resolved explicitly in the JIT in
18 // xla::cpu::SimpleResolver.  It also defines a per-CpuExecutable context
19 // which is used to cache expensive state and resources utilized by the
20 // aforementioned functions.
21 //
22 // Other functions are declared in individual libraries as well, such as
23 // runtime_conv2d and runtime_matmul. As individual libraries, callers for
24 // ahead-of-time compilation can link only the required subset.
25 
26 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
27 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
28 
29 #include "tensorflow/compiler/xla/executable_run_options.h"
30 #include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h"
31 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
32 #include "tensorflow/compiler/xla/types.h"
33 
namespace xla {
namespace cpu {
namespace runtime {

// Names of runtime functions. These get resolved from the generated code to
// the right symbol at link time in one of two ways:
// 1. When using the JIT, the symbol resolver (SimpleResolver in
//    third_party/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc) maps
//    this symbol name to the actual symbol.
// 2. When using ahead-of-time compilation, the linker can resolve the name
//    because it is a symbol in the cpu_runtime library.

// Multi-threaded Eigen matmul kernels, one symbol per element type.
extern const char* const kEigenMatMulF16SymbolName;
extern const char* const kEigenMatMulF32SymbolName;
extern const char* const kEigenMatMulF64SymbolName;
extern const char* const kEigenMatMulC64SymbolName;
extern const char* const kEigenMatMulC128SymbolName;
extern const char* const kEigenMatMulS32SymbolName;
extern const char* const kEigenBatchMatMulF32SymbolName;

// Vendor-accelerated kernels (MKL / ACL backends).
extern const char* const kMKLConv2DF32SymbolName;
extern const char* const kACLConv2DF32SymbolName;
extern const char* const kMKLMatMulF32SymbolName;
extern const char* const kMKLMatMulF64SymbolName;
extern const char* const kACLMatMulF32SymbolName;
extern const char* const kACLBatchMatMulF32SymbolName;
extern const char* const kMKLSingleThreadedMatMulF32SymbolName;
extern const char* const kMKLSingleThreadedMatMulF64SymbolName;

// Multi-threaded Eigen convolution and FFT kernels.
extern const char* const kEigenConv2DF16SymbolName;
extern const char* const kEigenConv2DF32SymbolName;
extern const char* const kEigenConv3DF16SymbolName;
extern const char* const kEigenConv3DF32SymbolName;
extern const char* const kEigenFftSymbolName;

// Single-threaded Eigen kernel variants (no intra-op thread pool).
extern const char* const kEigenSingleThreadedFftSymbolName;
extern const char* const kEigenSingleThreadedMatMulF16SymbolName;
extern const char* const kEigenSingleThreadedMatMulF32SymbolName;
extern const char* const kEigenSingleThreadedMatMulF64SymbolName;
extern const char* const kEigenSingleThreadedMatMulC64SymbolName;
extern const char* const kEigenSingleThreadedMatMulC128SymbolName;
extern const char* const kEigenSingleThreadedMatMulS32SymbolName;
extern const char* const kEigenSingleThreadedConv2DF16SymbolName;
extern const char* const kEigenSingleThreadedConv2DF32SymbolName;
extern const char* const kEigenSingleThreadedConv3DF16SymbolName;
extern const char* const kEigenSingleThreadedConv3DF32SymbolName;

// Infeed/outfeed buffer-management entry points (declared below in the
// extern "C" section).
extern const char* const kAcquireInfeedBufferForDequeueSymbolName;
extern const char* const kReleaseInfeedBufferAfterDequeueSymbolName;
extern const char* const kAcquireOutfeedBufferForPopulationSymbolName;
extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName;

// Miscellaneous helpers: parallel loops, logging, status checks, sorting.
extern const char* const kParallelForkJoinSymbolName;
extern const char* const kPrintfToStderrSymbolName;
extern const char* const kStatusIsSuccessSymbolName;
extern const char* const kKeyValueSortSymbolName;
extern const char* const kTopKF32SymbolName;

// Collectives, partition/replica IDs, and tracing hooks.
extern const char* const kAllReduceSymbolName;
extern const char* const kCollectivePermuteSymbolName;
extern const char* const kPartitionIdSymbolName;
extern const char* const kReplicaIdSymbolName;
extern const char* const kTracingStartSymbolName;
extern const char* const kTracingEndSymbolName;
extern const char* const kAllToAllSymbolName;

// All symbol names for XLA CPU runtime functions need to start with this
// prefix.
extern const char* const kXlaCpuRuntimeSymbolNamePrefix;

// Returns the infeed manager used by the CPU runtime for the CPU device
// `device_ordinal`.
// NOTE(review): the original comment ends mid-sentence ("Note the device
// ordinal does not name a CPU"); presumably it means the ordinal selects a
// per-device xfeed manager rather than a physical CPU — confirm against the
// definition in cpu_runtime.cc.
XfeedManager* GetXfeedManager(int device_ordinal);

}  // namespace runtime
}  // namespace cpu
}  // namespace xla
105 
extern "C" {

// Writes a printf-formatted message to stderr. The int return value follows
// the printf-family convention — see the definition in cpu_runtime.cc for
// the exact semantics.
extern int __xla_cpu_runtime_PrintfToStderr(const char* format, ...);

// Starts a tracing activity labeled `name` and returns an id; the id must be
// passed to __xla_cpu_runtime_TracingEnd to close the activity.
extern int64_t __xla_cpu_runtime_TracingStart(
    const void* /* xla::ExecutableRunOptions* */ run_options_ptr,
    const char* name);
// Ends the tracing activity identified by `id`, as returned by
// __xla_cpu_runtime_TracingStart.
extern void __xla_cpu_runtime_TracingEnd(
    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, int64_t id);

// Some things common to all of the runtime entry points below:
//
//  * The shape pointer and shape_length reflect values that can be deserialized
//    via llvm_ir::DecodeSelfDescribingShapeConstant. This is the way we pass
//    reified type information from the generated program to the runtime, which
//    helps check the type safety and contract for the emitted-code/runtime
//    communication.
//
//  * run_options is used to look up the device ordinal for the stream executor
//    we're executing under.  If it is null the device ordinal is assumed to be
//    0 (this behavior helps in writing tests).

// Blocks until the next infeed buffer is ready to be dequeued, then
// returns it. Fails catastrophically if the next enqueued buffer is
// not of the correct length in bytes. Checking the shape rather than
// the length would be more exact, but the length check is chosen as a
// tradeoff between error checking and speed/simplicity.
extern void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
    const xla::ExecutableRunOptions* run_options, int32_t buffer_length,
    const void* shape, int32_t shape_length);

// Relinquishes the next infeed buffer that was returned by
// __xla_cpu_runtime_AcquireInfeedBufferForDequeue. Once this call
// completes the data at buffer_ptr may no longer be
// accessed. buffer_length must match the length passed to the call to
// __xla_cpu_runtime_AcquireInfeedBufferForDequeue that returned
// buffer_ptr. This function must be called before the next buffer is
// acquired, i.e., there may only be one outstanding infeed buffer in
// use by the runtime.  TODO(b/31340454) investigate whether or not it
// is worth supporting zero-copy infeed where the buffer is retained
// by the compiled code until it has been used. If zero-copy infeed is
// implemented we will add support for multiple outstanding buffers
// that can be returned out of order.
extern void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(
    const xla::ExecutableRunOptions* run_options, int32_t buffer_length,
    void* buffer_ptr, const void* shape_ptr, int32_t shape_length);

// Blocks until the next outfeed buffer is available to be populated, then
// returns it.
extern void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(
    const xla::ExecutableRunOptions* run_options, int32_t buffer_length,
    const void* shape_ptr, int32_t shape_length);

// Relinquishes the outfeed buffer after it has been populated.
// buffer_ptr must have been previously returned by
// __xla_cpu_runtime_AcquireOutfeedBufferForPopulation.
// Once this call completes, buffer_ptr may no longer be accessed.
// buffer_length must match the length passed to the call to
// __xla_cpu_runtime_AcquireOutfeedBufferForPopulation that returned
// buffer_ptr. This function must be called before the next buffer is
// acquired, i.e., there may only be one outstanding outfeed buffer in
// use by the runtime.
extern void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(
    const xla::ExecutableRunOptions* run_options, int32_t buffer_length,
    void* buffer_ptr, const void* shape_ptr, int32_t shape_length);

// Perform all reduce on a CPU.
//
// participating_replicas: array of replica IDs participating in the reduction,
// cf. GetParticipatingIDs.
// channel_id_present, op_id: whether op_id is a channel ID or a module ID.
// reduction_kind: operator used for a reduction, cf. ReductionKind.
// shape_ptr: shape of all input/output buffers.
extern void __xla_cpu_runtime_AllReduce(
    const xla::ExecutableRunOptions* run_options,
    const void* replica_groups_str, int32_t replica_groups_str_size,
    int32_t channel_id_present, int32_t use_global_device_ids, int64_t op_id,
    int32_t reduction_kind, const void* shape_ptr, int32_t shape_length,
    int32_t num_buffers, void** input_buffers, void** output_buffers);

// Performs a collective permute: sends byte_size bytes from input_buffer and
// receives into output_buffer according to the (source, target) pairs encoded
// in source_target_pairs.
// channel_id_present, op_id: whether op_id is a channel ID or a module ID, as
// for AllReduce above.
// NOTE(review): the exact encoding of source_target_pairs is defined by the
// implementation in cpu_runtime.cc — confirm before constructing it by hand.
extern void __xla_cpu_runtime_CollectivePermute(
    const xla::ExecutableRunOptions* run_options, int32_t channel_id_present,
    int64_t op_id, int32_t byte_size, void* input_buffer, void* output_buffer,
    const void* source_target_pairs, int32_t source_target_pairs_size);

// Performs an all-to-all exchange of num_buffers buffers, each of buffer_size
// bytes, among the participants described by replica_groups_str (same
// serialized form as for AllReduce above).
// channel_id_present, op_id: whether op_id is a channel ID or a module ID.
extern void __xla_cpu_runtime_AllToAll(
    const xla::ExecutableRunOptions* run_options, int32_t channel_id_present,
    int64_t op_id, const void* replica_groups_str,
    int32_t replica_groups_str_size, int32_t num_buffers, int64_t buffer_size,
    void** source_buffers, void** destination_buffers);

// Write the partition ID into the output buffer.
extern void __xla_cpu_runtime_PartitionId(
    const xla::ExecutableRunOptions* run_options, void* output_buffer);
// Write the replica ID into the output buffer.
extern void __xla_cpu_runtime_ReplicaId(
    const xla::ExecutableRunOptions* run_options, void* output_buffer);

}  // extern "C"
211 
212 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
213