/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <algorithm>
#include <string>

#include "tensorflow/compiler/tf2xla/kernels/light_outside_compilation.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"

// Sample kernels for the light outside compilation test.

namespace tensorflow {
namespace {

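// Each test op below follows the same pattern: a shape function, a plain TF
// GPU kernel registered with REGISTER_KERNEL_BUILDER, and an XLA registration
// that routes the op through LightOutsideCompilationOp (or a subclass), which
// invokes the TF kernel at runtime instead of compiling the op.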
// Just copy the input.
REGISTER_OP("TestStaticTf")
    .Input("input: float")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      return OkStatus();
    });

class TestStaticTfOp : public OpKernel {
 public:
  explicit TestStaticTfOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
  void Compute(OpKernelContext* ctx) override {
    Tensor* out_tensor = nullptr;
    const Tensor& input = ctx->input(0);
    OP_REQUIRES_OK(ctx, ctx->allocate_output("output", ctx->input(0).shape(),
                                             &out_tensor));

    // Just pass the value through.
    uint64_t size = input.AllocatedBytes();
    se::DeviceMemoryBase gpu_dst{out_tensor->data(), size};
    se::Stream* stream = ctx->op_device_context()->stream();

    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst,
        /*gpu_src=*/se::DeviceMemoryBase{input.data(), size},
        /*size=*/input.AllocatedBytes());
  }
};

REGISTER_KERNEL_BUILDER(Name("TestStaticTf").Device(DEVICE_GPU),
                        TestStaticTfOp);
REGISTER_XLA_OP(Name("TestStaticTf").Device(DEVICE_GPU_XLA_JIT),
                LightOutsideCompilationOp);

REGISTER_OP("TestStaticMultipleOutputTf")
    .Input("input: float")
    .Output("output1: float")
    .Output("output2: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      c->set_output(1, c->input(0));
      return OkStatus();
    });

class TestStaticMultipleOutputTfOp : public OpKernel {
 public:
  explicit TestStaticMultipleOutputTfOp(OpKernelConstruction* ctx)
      : OpKernel(ctx) {}
  void Compute(OpKernelContext* ctx) override {
    Tensor* out_tensor1 = nullptr;
    Tensor* out_tensor2 = nullptr;
    const Tensor& input = ctx->input(0);
    OP_REQUIRES_OK(ctx, ctx->allocate_output("output1", ctx->input(0).shape(),
                                             &out_tensor1));
    OP_REQUIRES_OK(ctx, ctx->allocate_output("output2", ctx->input(0).shape(),
                                             &out_tensor2));

    // Just pass the value through to both outputs.
    uint64_t size = input.AllocatedBytes();
    se::DeviceMemoryBase gpu_dst1{out_tensor1->data(), size};
    se::DeviceMemoryBase gpu_dst2{out_tensor2->data(), size};
    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;

    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst1,
        /*gpu_src=*/se::DeviceMemoryBase{input.data(), size},
        /*size=*/input.AllocatedBytes());
    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst2,
        /*gpu_src=*/se::DeviceMemoryBase{input.data(), size},
        /*size=*/input.AllocatedBytes());
  }
};

REGISTER_KERNEL_BUILDER(Name("TestStaticMultipleOutputTf").Device(DEVICE_GPU),
                        TestStaticMultipleOutputTfOp);
REGISTER_XLA_OP(Name("TestStaticMultipleOutputTf").Device(DEVICE_GPU_XLA_JIT),
                LightOutsideCompilationOp);

// Copy the input up to `max_size`.
REGISTER_OP("TestDynamicTf")
    .Input("input: float")
    .Attr("max_size: int")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(0))));
      return OkStatus();
    });

// Same as TestStaticTfOp, but copies only half of `min(input size, max_size)`
// bytes, so the actual output is smaller than the bound declared below.
class TestDynamicTfOp : public OpKernel {
 public:
  explicit TestDynamicTfOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_size", &max_size_));
  }
  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);

    // Pass through the part of the value specified by the `max_size`
    // attribute.
    int64_t size = input.AllocatedBytes();
    CHECK_LE(max_size_, size);
    uint64_t size_to_cpy = std::min(size, max_size_) / 2;

    TensorShape allocated_shape;
    OP_REQUIRES_OK(ctx,
                   TensorShapeUtils::MakeShape(
                       absl::Span<const int>{static_cast<int>(size_to_cpy)},
                       &allocated_shape));

    Tensor* out_tensor = nullptr;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output("output", allocated_shape, &out_tensor));

    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;

    se::DeviceMemoryBase gpu_dst{out_tensor->data(), size_to_cpy};
    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst,
        /*gpu_src=*/
        se::DeviceMemoryBase{input.data(), static_cast<uint64_t>(size)},
        /*size=*/size_to_cpy);
  }

 private:
  int64_t max_size_;
};
REGISTER_KERNEL_BUILDER(Name("TestDynamicTf").Device(DEVICE_GPU),
                        TestDynamicTfOp);

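// The XLA-side kernel overrides DynamicOutputDimensions to declare which
// output dimensions are dynamic and what their upper bounds are: the returned
// map is keyed by output index, then by dimension index. Here output 0,
// dimension 0 is bounded by the `max_size` attribute.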
class TestDynamicTfXlaOp : public LightOutsideCompilationOp {
 public:
  explicit TestDynamicTfXlaOp(OpKernelConstruction* context)
      : LightOutsideCompilationOp(context) {}
  StatusOr<OutputDimensionBoundsMap> DynamicOutputDimensions(
      const NodeDef& ndef, XlaOpKernelContext* ctx) const override {
    OutputDimensionBoundsMap out;
    TF_ASSIGN_OR_RETURN(auto max_bound, GetNodeAttr<int64_t>(ndef, "max_size"));
    out[0][0] = max_bound;
    return out;
  }
};

REGISTER_XLA_OP(Name("TestDynamicTf").Device(DEVICE_GPU_XLA_JIT),
                TestDynamicTfXlaOp);

REGISTER_OP("DynamicMultidim")
    .Input("output_shape: int32")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->UnknownShapeOfRank(5));
      return OkStatus();
    });

// Just fill in the data with ones for a given shape.
class DynamicMultidimOp : public OpKernel {
 public:
  explicit DynamicMultidimOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    // Build the output shape from the host-memory `output_shape` input.
    TensorShape output_shape;
    auto vec = ctx->input(0).flat<int32>();
    for (int i = 0; i < vec.size(); i++) {
      output_shape.AddDim(vec(i));
    }
    Tensor* out_tensor = nullptr;
    OP_REQUIRES_OK(ctx,
                   ctx->allocate_output("output", output_shape, &out_tensor));

    // Fill in the value with ones.
    int32_t num_elements = output_shape.num_elements();
    std::vector<float> host_data(num_elements, 1.0f);
    se::DeviceMemoryBase gpu_dst{
        out_tensor->data(),
        static_cast<uint64_t>(num_elements) * sizeof(float)};

    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;
    stream->ThenMemcpy(
        /*gpu_dst=*/&gpu_dst, /*host_src=*/host_data.data(),
        /*size=*/num_elements * sizeof(float));
  }
};

REGISTER_KERNEL_BUILDER(
    Name("DynamicMultidim").Device(DEVICE_GPU).HostMemory("output_shape"),
    DynamicMultidimOp);

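// Declares every one of the five output dimensions as dynamic, each with an
// upper bound of 20.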
class DynamicMultidimXlaOp : public LightOutsideCompilationOp {
 public:
  explicit DynamicMultidimXlaOp(OpKernelConstruction* context)
      : LightOutsideCompilationOp(context) {}
  StatusOr<OutputDimensionBoundsMap> DynamicOutputDimensions(
      const NodeDef& ndef, XlaOpKernelContext* ctx) const override {
    OutputDimensionBoundsMap out;
    for (int i = 0; i < 5; i++) {
      out[0][i] = 20;
    }
    return out;
  }
};

REGISTER_XLA_OP(Name("DynamicMultidim")
                    .Device(DEVICE_GPU_XLA_JIT)
                    .CompileTimeConstantInput("output_shape"),
                DynamicMultidimXlaOp);

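// DynamicUnranked has no TF kernel and a fully unknown output shape; it is
// presumably here to exercise how the light outside compilation bridge
// handles unranked outputs.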
REGISTER_OP("DynamicUnranked")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->UnknownShape());
      return OkStatus();
    });

REGISTER_XLA_OP(Name("DynamicUnranked").Device(DEVICE_GPU_XLA_JIT),
                LightOutsideCompilationOp);

// Adds the compile-time constant `constant_to_add` to every element of the
// input: tests constant storage.
REGISTER_OP("TestTfMustBeConstant")
    .Input("input: float")
    .Input("constant_to_add: int32")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      return OkStatus();
    });

class TestTfMustBeConstantOp : public OpKernel {
 public:
  explicit TestTfMustBeConstantOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);

    int constant_to_add = ctx->input(1).scalar<int>()();
    size_t allocated_size = input.AllocatedBytes();

    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;

    // Stage the input through pinned (GPU-compatible) host memory so the
    // constant can be added on the host.
    Tensor tmp;
    AllocatorAttributes pinned_alloc_attrs;
    pinned_alloc_attrs.set_on_host(true);
    pinned_alloc_attrs.set_gpu_compatible(true);
    OP_REQUIRES_OK(ctx, ctx->allocate_temp(input.dtype(), input.shape(), &tmp,
                                           pinned_alloc_attrs));

    stream->ThenMemcpy(tmp.data(),
                       se::DeviceMemoryBase{input.data(), allocated_size},
                       allocated_size);

    OP_REQUIRES_OK(ctx, stream->BlockHostUntilDone());

    for (int i = 0; i < input.NumElements(); i++) {
      tmp.flat<float>().data()[i] += constant_to_add;
    }

    // Copy the modified values back to the device output.
    Tensor* out_tensor = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output("output", ctx->input(0).shape(),
                                             &out_tensor));
    se::DeviceMemoryBase gpu_dst{out_tensor->data(),
                                 static_cast<uint64_t>(allocated_size)};
    stream->ThenMemcpy(&gpu_dst, tmp.data(), allocated_size);
  }
};

REGISTER_KERNEL_BUILDER(Name("TestTfMustBeConstant").Device(DEVICE_GPU),
                        TestTfMustBeConstantOp);

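// Marking `constant_to_add` as a compile-time constant makes XLA resolve it
// during compilation, which is presumably how the TF kernel above can read it
// directly with `scalar<int>()()`.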
REGISTER_XLA_OP(Name("TestTfMustBeConstant")
                    .Device(DEVICE_GPU_XLA_JIT)
                    .CompileTimeConstantInput("constant_to_add"),
                LightOutsideCompilationOp);

REGISTER_OP("TestDynamicTfWithBound")
    .Input("input: float")
    .Attr("max_size: int")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      return OkStatus();
    });

// Copies at most `max_size` bytes of the input.
class TestDynamicTfWithBoundOp : public OpKernel {
 public:
  explicit TestDynamicTfWithBoundOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_size", &max_size_));
  }
  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);
    uint64_t size_to_cpy =
        std::min(input.AllocatedBytes(), static_cast<size_t>(max_size_));

    TensorShape allocated_shape;
    OP_REQUIRES_OK(ctx,
                   TensorShapeUtils::MakeShape(
                       absl::Span<const int>{static_cast<int>(size_to_cpy)},
                       &allocated_shape));

    Tensor* out_tensor = nullptr;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output("output", allocated_shape, &out_tensor));

    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;
    se::DeviceMemoryBase gpu_dst{out_tensor->data(), size_to_cpy};
    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst,
        /*gpu_src=*/se::DeviceMemoryBase{input.data(), size_to_cpy},
        /*size=*/size_to_cpy);
  }

 private:
  int64_t max_size_;
};

REGISTER_KERNEL_BUILDER(Name("TestDynamicTfWithBound").Device(DEVICE_GPU),
                        TestDynamicTfWithBoundOp);

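// Unlike TestDynamicTf, the shape function for this op reports the static
// input shape, while the XLA kernel below still declares `max_size` as the
// upper bound for output 0, dimension 0.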
class TestDynamicTfWithBoundXlaOp : public LightOutsideCompilationOp {
 public:
  explicit TestDynamicTfWithBoundXlaOp(OpKernelConstruction* context)
      : LightOutsideCompilationOp(context) {}

  StatusOr<OutputDimensionBoundsMap> DynamicOutputDimensions(
      const NodeDef& ndef, XlaOpKernelContext* ctx) const override {
    OutputDimensionBoundsMap out;
    TF_ASSIGN_OR_RETURN(auto max_bound, GetNodeAttr<int64_t>(ndef, "max_size"));
    out[0][0] = max_bound;
    return out;
  }
};

REGISTER_XLA_OP(Name("TestDynamicTfWithBound").Device(DEVICE_GPU_XLA_JIT),
                TestDynamicTfWithBoundXlaOp);

}  // namespace
}  // namespace tensorflow