/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <algorithm>
#include <string>
#include <vector>

#include "tensorflow/compiler/tf2xla/kernels/light_outside_compilation.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"

// Sample kernels for the light outside compilation test.

namespace tensorflow {
namespace {

// Just copy the input.
REGISTER_OP("TestStaticTf")
    .Input("input: float")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      return OkStatus();
    });

class TestStaticTfOp : public OpKernel {
 public:
  explicit TestStaticTfOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
  void Compute(OpKernelContext* ctx) override {
    Tensor* out_tensor = nullptr;
    const Tensor& input = ctx->input(0);
    OP_REQUIRES_OK(ctx, ctx->allocate_output("output", ctx->input(0).shape(),
                                             &out_tensor));

    // Just pass the value through.
    uint64_t size = input.AllocatedBytes();
    se::DeviceMemoryBase gpu_dst{out_tensor->data(), size};
    se::Stream* stream = ctx->op_device_context()->stream();

    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst,
        /*gpu_src=*/se::DeviceMemoryBase{input.data(), size},
        /*size=*/input.AllocatedBytes());
  }
};

REGISTER_KERNEL_BUILDER(Name("TestStaticTf").Device(DEVICE_GPU),
                        TestStaticTfOp);
REGISTER_XLA_OP(Name("TestStaticTf").Device(DEVICE_GPU_XLA_JIT),
                LightOutsideCompilationOp);
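
// Each op in this file follows the same pattern as TestStaticTf above: a
// regular TF kernel is registered for DEVICE_GPU with
// REGISTER_KERNEL_BUILDER, and the op is additionally registered for the XLA
// GPU JIT via LightOutsideCompilationOp (or a subclass of it). In rough
// terms, this makes the compiled XLA program call back into the plain TF
// kernel for this op instead of compiling it to HLO.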

REGISTER_OP("TestStaticMultipleOutputTf")
    .Input("input: float")
    .Output("output1: float")
    .Output("output2: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      c->set_output(1, c->input(0));
      return OkStatus();
    });

class TestStaticMultipleOutputTfOp : public OpKernel {
 public:
  explicit TestStaticMultipleOutputTfOp(OpKernelConstruction* ctx)
      : OpKernel(ctx) {}
  void Compute(OpKernelContext* ctx) override {
    Tensor* out_tensor1 = nullptr;
    Tensor* out_tensor2 = nullptr;
    const Tensor& input = ctx->input(0);
    OP_REQUIRES_OK(ctx, ctx->allocate_output("output1", ctx->input(0).shape(),
                                             &out_tensor1));
    OP_REQUIRES_OK(ctx, ctx->allocate_output("output2", ctx->input(0).shape(),
                                             &out_tensor2));

    // Just pass the value through.
    uint64_t size = input.AllocatedBytes();
    se::DeviceMemoryBase gpu_dst1{out_tensor1->data(), size};
    se::DeviceMemoryBase gpu_dst2{out_tensor2->data(), size};
    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;

    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst1,
        /*gpu_src=*/se::DeviceMemoryBase{input.data(), size},
        /*size=*/input.AllocatedBytes());
    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst2,
        /*gpu_src=*/se::DeviceMemoryBase{input.data(), size},
        /*size=*/input.AllocatedBytes());
  }
};

REGISTER_KERNEL_BUILDER(Name("TestStaticMultipleOutputTf").Device(DEVICE_GPU),
                        TestStaticMultipleOutputTfOp);
REGISTER_XLA_OP(Name("TestStaticMultipleOutputTf").Device(DEVICE_GPU_XLA_JIT),
                LightOutsideCompilationOp);

// Copy the input up to `max_size`.
REGISTER_OP("TestDynamicTf")
    .Input("input: float")
    .Attr("max_size: int")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(0))));
      return OkStatus();
    });

// Same as TestStaticTfOp, but only copies up to `max_size` attribute.
class TestDynamicTfOp : public OpKernel {
 public:
  explicit TestDynamicTfOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_size", &max_size_));
  }
  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);

    // Pass through the part of the value specified by the `max_size`
    // attribute.
    int64_t size = input.AllocatedBytes();
    CHECK_LE(max_size_, size);
    uint64_t size_to_cpy = std::min(size, max_size_) / 2;

    TensorShape allocated_shape;
    OP_REQUIRES_OK(ctx,
                   TensorShapeUtils::MakeShape(
                       absl::Span<const int>{static_cast<int>(size_to_cpy)},
                       &allocated_shape));

    Tensor* out_tensor = nullptr;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output("output", allocated_shape, &out_tensor));

    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;

    se::DeviceMemoryBase gpu_dst{out_tensor->data(), size_to_cpy};
    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst,
        /*gpu_src=*/
        se::DeviceMemoryBase{input.data(), static_cast<uint64_t>(size)},
        /*size=*/size_to_cpy);
  }

 private:
  int64_t max_size_;
};
REGISTER_KERNEL_BUILDER(Name("TestDynamicTf").Device(DEVICE_GPU),
                        TestDynamicTfOp);
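
// XLA wrapper for TestDynamicTf. DynamicOutputDimensions reports an upper
// bound for every output dimension whose real size is only known once the TF
// kernel has run; the map appears to be keyed as
// out[output_index][dimension_index] = bound. Here the single 1-D output is
// bounded by the `max_size` attribute.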
class TestDynamicTfXlaOp : public LightOutsideCompilationOp {
 public:
  explicit TestDynamicTfXlaOp(OpKernelConstruction* context)
      : LightOutsideCompilationOp(context) {}
  StatusOr<OutputDimensionBoundsMap> DynamicOutputDimensions(
      const NodeDef& ndef, XlaOpKernelContext* ctx) const override {
    OutputDimensionBoundsMap out;
    TF_ASSIGN_OR_RETURN(auto max_bound,
                        GetNodeAttr<int64_t>(ndef, "max_size"));
    out[0][0] = max_bound;
    return out;
  }
};

REGISTER_XLA_OP(Name("TestDynamicTf").Device(DEVICE_GPU_XLA_JIT),
                TestDynamicTfXlaOp);

REGISTER_OP("DynamicMultidim")
    .Input("output_shape: int32")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->UnknownShapeOfRank(5));
      return OkStatus();
    });

// Just fill in the data with ones for a given shape.
class DynamicMultidimOp : public OpKernel {
 public:
  explicit DynamicMultidimOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    TensorShape output_shape;
    auto vec = ctx->input(0).flat<int32>();
    for (int i = 0; i < vec.size(); i++) {
      output_shape.AddDim(vec(i));
    }
    Tensor* out_tensor = nullptr;
    OP_REQUIRES_OK(ctx,
                   ctx->allocate_output("output", output_shape, &out_tensor));

    // Fill in the value with ones.
    int32_t num_elements = output_shape.num_elements();
    std::vector<float> host_data(num_elements);
    for (int i = 0; i < output_shape.num_elements(); i++) {
      host_data[i] = 1.0;
    }
    // The destination buffer size is in bytes, matching the copy below.
    se::DeviceMemoryBase gpu_dst{
        out_tensor->data(),
        static_cast<uint64_t>(num_elements) * sizeof(float)};

    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;
    stream->ThenMemcpy(
        /*gpu_dst=*/&gpu_dst, /*host_src=*/host_data.data(),
        /*size=*/num_elements * sizeof(float));
  }
};

REGISTER_KERNEL_BUILDER(
    Name("DynamicMultidim").Device(DEVICE_GPU).HostMemory("output_shape"),
    DynamicMultidimOp);

class DynamicMultidimXlaOp : public LightOutsideCompilationOp {
 public:
  explicit DynamicMultidimXlaOp(OpKernelConstruction* context)
      : LightOutsideCompilationOp(context) {}
  StatusOr<OutputDimensionBoundsMap> DynamicOutputDimensions(
      const NodeDef& ndef, XlaOpKernelContext* ctx) const override {
    OutputDimensionBoundsMap out;
    for (int i = 0; i < 5; i++) {
      out[0][i] = 20;
    }
    return out;
  }
};

REGISTER_XLA_OP(Name("DynamicMultidim")
                    .Device(DEVICE_GPU_XLA_JIT)
                    .CompileTimeConstantInput("output_shape"),
                DynamicMultidimXlaOp);

REGISTER_OP("DynamicUnranked")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->UnknownShape());
      return OkStatus();
    });

REGISTER_XLA_OP(Name("DynamicUnranked").Device(DEVICE_GPU_XLA_JIT),
                LightOutsideCompilationOp);
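
// The next op exercises compile-time constant inputs. Its kernel reads
// `constant_to_add` directly as a scalar value, which relies on the XLA
// registration below declaring that input as CompileTimeConstantInput so
// that its concrete value is available when the op is compiled (this is the
// "constant storage" being tested).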

// Adds the compile-time constant `constant_to_add` to every element of the
// float input.
REGISTER_OP("TestTfMustBeConstant")
    .Input("input: float")
    .Input("constant_to_add: int32")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      return OkStatus();
    });

class TestTfMustBeConstantOp : public OpKernel {
 public:
  explicit TestTfMustBeConstantOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);

    int constant_to_add = ctx->input(1).scalar<int>()();
    size_t allocated_size = input.AllocatedBytes();

    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;

    Tensor tmp;
    AllocatorAttributes pinned_alloc_attrs;
    pinned_alloc_attrs.set_on_host(true);
    pinned_alloc_attrs.set_gpu_compatible(true);
    TF_CHECK_OK(ctx->allocate_temp(input.dtype(), input.shape(), &tmp,
                                   pinned_alloc_attrs));

    stream->ThenMemcpy(tmp.data(),
                       se::DeviceMemoryBase{input.data(), allocated_size},
                       allocated_size);

    OP_REQUIRES_OK(ctx, stream->BlockHostUntilDone());

    for (int i = 0; i < input.NumElements(); i++) {
      tmp.flat<float>().data()[i] += constant_to_add;
    }

    Tensor* out_tensor = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output("output", ctx->input(0).shape(),
                                             &out_tensor));
    se::DeviceMemoryBase gpu_dst{out_tensor->data(),
                                 static_cast<uint64_t>(allocated_size)};
    stream->ThenMemcpy(&gpu_dst, tmp.data(), allocated_size);
  }
};

REGISTER_KERNEL_BUILDER(Name("TestTfMustBeConstant").Device(DEVICE_GPU),
                        TestTfMustBeConstantOp);

REGISTER_XLA_OP(Name("TestTfMustBeConstant")
                    .Device(DEVICE_GPU_XLA_JIT)
                    .CompileTimeConstantInput("constant_to_add"),
                LightOutsideCompilationOp);

REGISTER_OP("TestDynamicTfWithBound")
    .Input("input: float")
    .Attr("max_size: int")
    .Output("output: float")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      return OkStatus();
    });

class TestDynamicTfWithBoundOp : public OpKernel {
 public:
  explicit TestDynamicTfWithBoundOp(OpKernelConstruction* ctx)
      : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_size", &max_size_));
  }
  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);
    uint64_t size_to_cpy =
        std::min(input.AllocatedBytes(), static_cast<size_t>(max_size_));

    TensorShape allocated_shape;
    OP_REQUIRES_OK(ctx,
                   TensorShapeUtils::MakeShape(
                       absl::Span<const int>{static_cast<int>(size_to_cpy)},
                       &allocated_shape));

    Tensor* out_tensor = nullptr;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output("output", allocated_shape, &out_tensor));

    se::Stream* stream =
        ctx->device()->tensorflow_accelerator_device_info()->stream;
    se::DeviceMemoryBase gpu_dst{out_tensor->data(), size_to_cpy};
    stream->ThenMemcpyD2D(
        /*gpu_dst=*/&gpu_dst,
        /*gpu_src=*/se::DeviceMemoryBase{input.data(), size_to_cpy},
        /*size=*/size_to_cpy);
  }

 private:
  int64_t max_size_;
};

REGISTER_KERNEL_BUILDER(Name("TestDynamicTfWithBound").Device(DEVICE_GPU),
                        TestDynamicTfWithBoundOp);
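
// XLA wrapper for TestDynamicTfWithBound: as in TestDynamicTfXlaOp, the
// `max_size` attribute is reported as the upper bound on the single dynamic
// output dimension, even though the kernel may produce fewer elements.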
class TestDynamicTfWithBoundXlaOp : public LightOutsideCompilationOp {
 public:
  explicit TestDynamicTfWithBoundXlaOp(OpKernelConstruction* context)
      : LightOutsideCompilationOp(context) {}

  StatusOr<OutputDimensionBoundsMap> DynamicOutputDimensions(
      const NodeDef& ndef, XlaOpKernelContext* ctx) const override {
    OutputDimensionBoundsMap out;
    TF_ASSIGN_OR_RETURN(auto max_bound,
                        GetNodeAttr<int64_t>(ndef, "max_size"));
    out[0][0] = max_bound;
    return out;
  }
};

REGISTER_XLA_OP(Name("TestDynamicTfWithBound").Device(DEVICE_GPU_XLA_JIT),
                TestDynamicTfWithBoundXlaOp);

}  // namespace
}  // namespace tensorflow