/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
#define TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_

#include <numeric>

#include "tensorflow/core/platform/bfloat16.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/util/determinism.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/core/platform/cuda.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm.h"
#endif

#include "tensorflow/core/debug/debug_io_utils.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/util/debug_events_writer.h"

namespace tensorflow {

// Copy op for debugging.
// Performs CPU-to-CPU or GPU-to-GPU deep-copying of a tensor, depending on
// the device on which the tensor is allocated.
class CopyOp : public OpKernel {
 public:
  explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));

    std::vector<string> debug_ops_spec;
    OP_REQUIRES_OK(context,
                   context->GetAttr("debug_ops_spec", &debug_ops_spec));
    for (const string& debug_op_spec : debug_ops_spec) {
      // Assume debug_op_spec has the format
      // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
      // DebugIdentity;grpc://localhost:3333;1
      const std::vector<string> items = str_util::Split(debug_op_spec, ";");
      OP_REQUIRES(
          context, items.size() == 3,
          errors::Internal(
              "Unexpected number of semicolons in debug_ops_spec element: ",
              debug_op_spec));
      debug_op_and_url_specs_.push_back(
          DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
                               items[1], items[2] == "1"));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& src_tensor = context->input(0);

    if (src_tensor.IsInitialized() &&
        DataTypeCanUseMemcpy(src_tensor.dtype()) &&
        DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
      // Source tensor is initialized and is mem-copyable. Make a copy.
      Tensor* copied_tensor;
      OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
                                                       &copied_tensor));

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on GPU).
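      // Note: on_host() can be true even on a GPU device, e.g. for inputs
      // pinned to host memory via HostMemory registration; such tensors take
      // the CPU deep-copy path below.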
      bool off_host_input = device->device_type() == DEVICE_GPU &&
                            !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        DeviceContext* device_ctxt = context->op_device_context();
        // Input is not on host: deep-copy it from GPU to the same GPU.
        Notification done_copy;
        GPUUtil::CopyGPUTensorToSameGPU(
            device, device_ctxt, &src_tensor, copied_tensor,
            [&done_copy](const Status& s) { done_copy.Notify(); });
        done_copy.WaitForNotification();
      } else {
        // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#else
      *copied_tensor = tensor::DeepCopy(src_tensor);
#endif
    } else {
      // Source tensor is NOT initialized and/or is not mem-copyable: Forward
      // the Tensor object.
      context->set_output(0, src_tensor);
    }
  }

  bool IsExpensive() override { return false; }

 private:
  string tensor_name_;
  std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
};

// Base class of all debug ops.
class BaseDebugOp : public OpKernel {
 public:
  explicit BaseDebugOp(const string& debug_op_name,
                       OpKernelConstruction* context)
      : OpKernel(context), debug_op_name_(debug_op_name) {
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
    OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));

    string device_name;
    string tensor_name;
    OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name));

    std::vector<string> name_items = str_util::Split(tensor_name, ':');
    string node_name;
    int32_t output_slot = 0;
    OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2,
                errors::InvalidArgument("Failed to parse tensor name: \"",
                                        tensor_name, "\""));
    if (name_items.size() == 2) {
      node_name = name_items[0];
      OP_REQUIRES(
          context, strings::safe_strto32(name_items[1], &output_slot),
          errors::InvalidArgument("Invalid string value for output_slot: \"",
                                  name_items[1], "\""));
    } else if (name_items.size() == 1) {
      node_name = name_items[0];
    }

    debug_watch_key_.reset(
        new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_));
  }

  bool IsExpensive() override { return false; }

 protected:
  // Apply gRPC gating (if the gated_grpc_ attribute is true).
  //
  // Returns false if and only if all grpc:// debug URLs of the debug op are
  // currently disabled (i.e., gated off), in which case the debug op emits an
  // empty (size {0}) tensor of undefined data type.
  bool ApplyGrpcGating(OpKernelContext* context) {
    if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(
                           debug_watch_key_->debug_node_name, debug_urls_)) {
      // The entire node is gated off: Output an empty tensor and avoid
      // expensive computation.
      Tensor* output_tensor;
      TensorShape shape({0});
      if (!context->allocate_output(0, shape, &output_tensor).ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to allocate empty tensor under gated-off state.";
      }
      return false;
    } else {
      return true;
    }
  }

  // Publish a tensor to all debug URLs of the debug op.
  // Log an error if the publishing fails.
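  // Debug URLs typically use either the file:// scheme (dump to a local
  // directory) or the grpc:// scheme (stream to a debug server), e.g.,
  // "file:///tmp/tfdbg_dump" or "grpc://localhost:3333".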
  Status PublishTensor(const Tensor& tensor) {
    if (debug_urls_.empty()) {
      return OkStatus();
    } else {
      Status status = DebugIO::PublishDebugTensor(*debug_watch_key_, tensor,
                                                  Env::Default()->NowMicros(),
                                                  debug_urls_, gated_grpc_);
      if (!status.ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to publish debug tensor data to all URLs "
                   << str_util::Join(debug_urls_, ", ")
                   << ", due to: " << status.error_message();
      }
      return status;
    }
  }

 private:
  const string debug_op_name_;
  std::unique_ptr<DebugNodeKey> debug_watch_key_;
  std::vector<string> debug_urls_;
  bool gated_grpc_;
};

// Identity op for debugging.
// Output slot 0 carries the debug signal and is always allocated on the
// host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
// the debug signal is equal to the input tensor.
class DebugIdentityOp : public BaseDebugOp {
 public:
  explicit DebugIdentityOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugIdentity", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    OP_REQUIRES_OK(context, PublishTensor(context->input(0)));
    context->set_output(0, context->input(0));
  }
};

// NaN-counter op for debugging.
template <typename T>
class DebugNanCountOp : public BaseDebugOp {
 public:
  explicit DebugNanCountOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNanCount", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
    int64_t nan_count = 0;

    // If the input is an uninitialized tensor, let nan_count be 0.
    if (input.IsInitialized()) {
      // Count NaNs.
      const TensorShape& input_shape = input.shape();
      const T* input_flat = input.template flat<T>().data();

      for (int64_t i = 0; i < input_shape.num_elements(); ++i) {
        if (Eigen::numext::isnan(static_cast<double>(input_flat[i]))) {
          nan_count++;
        }
      }
    }

    TensorShape shape({1});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<int64_t>()(0) = nan_count;
    OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
  }
};

// Numeric summary op for debugging.
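// Emits a double vector of length 14 + ndims summarizing the input:
//   [0] is_initialized, [1] element count, [2] NaN count, [3] -inf count,
//   [4] negative count, [5] zero count, [6] positive count, [7] +inf count,
//   [8] min, [9] max, [10] mean, [11] variance, [12] dtype enum value,
//   [13] ndims, [14:] the dimension sizes.
// Elements at or below lower_bound / at or above upper_bound are counted as
// -inf / +inf, respectively, when those bounds are finite.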
template <typename T>
class DebugNumericSummaryOp : public BaseDebugOp {
 public:
  explicit DebugNumericSummaryOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNumericSummary", context) {
    OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
    OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("mute_if_healthy", &mute_if_healthy_));
  }

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    int64_t is_initialized = 0;
    int64_t element_count = 0;
    int64_t negative_inf_count = 0;
    int64_t negative_count = 0;
    int64_t zero_count = 0;
    int64_t positive_count = 0;
    int64_t positive_inf_count = 0;
    int64_t nan_count = 0;
    double min = std::numeric_limits<double>::infinity();
    double max = -std::numeric_limits<double>::infinity();
    double sum = 0.0;
    double mean = std::numeric_limits<double>::quiet_NaN();
    double variance = std::numeric_limits<double>::quiet_NaN();

    // Equal to negative_count + zero_count + positive_count.
    int64_t non_inf_nan_count = 0;

    const TensorShape& input_shape = input.shape();
    if (input.IsInitialized()) {
      is_initialized = 1;
      const T* input_flat = input.template flat<T>().data();

      element_count = input_shape.num_elements();
      const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
      const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);

      for (int64_t i = 0; i < element_count; ++i) {
        const double x = static_cast<double>(input_flat[i]);
        if (Eigen::numext::isnan(x)) {
          nan_count++;
        } else if (Eigen::numext::isinf(x)) {
          if (x < 0.0) {
            negative_inf_count++;
          } else {
            positive_inf_count++;
          }
        } else {
          if (is_lower_bound_custom && x <= lower_bound_) {
            negative_inf_count++;
          } else if (is_upper_bound_custom && x >= upper_bound_) {
            positive_inf_count++;
          } else if (x < 0.0) {
            negative_count++;
          } else if (x > 0.0) {
            positive_count++;
          } else {
            zero_count++;
          }

          if (x < min) {
            min = x;
          }
          if (x > max) {
            max = x;
          }

          non_inf_nan_count++;
          sum += x;
        }
      }

      if (non_inf_nan_count > 0) {
        mean = sum / non_inf_nan_count;

        // Do a second pass to compute variance.
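        // This is the population variance over the finite (non-inf, non-NaN)
        // elements: sum((x - mean)^2) / non_inf_nan_count.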
        variance = 0.0;
        for (int64_t i = 0; i < element_count; ++i) {
          const double x = static_cast<double>(input_flat[i]);
          if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) {
            variance += (x - mean) * (x - mean);
          }
        }
        variance /= non_inf_nan_count;
      }
    }

    TensorShape shape({14 + input_shape.dims()});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
    output_tensor->vec<double>()(1) = static_cast<double>(element_count);
    output_tensor->vec<double>()(2) = static_cast<double>(nan_count);
    output_tensor->vec<double>()(3) = static_cast<double>(negative_inf_count);
    output_tensor->vec<double>()(4) = static_cast<double>(negative_count);
    output_tensor->vec<double>()(5) = static_cast<double>(zero_count);
    output_tensor->vec<double>()(6) = static_cast<double>(positive_count);
    output_tensor->vec<double>()(7) = static_cast<double>(positive_inf_count);
    output_tensor->vec<double>()(8) = min;
    output_tensor->vec<double>()(9) = max;
    output_tensor->vec<double>()(10) = mean;
    output_tensor->vec<double>()(11) = variance;

    output_tensor->vec<double>()(12) = static_cast<double>(input.dtype());
    output_tensor->vec<double>()(13) = static_cast<double>(input_shape.dims());
    for (size_t d = 0; d < input_shape.dims(); ++d) {
      output_tensor->vec<double>()(14 + d) =
          static_cast<double>(input_shape.dim_sizes()[d]);
    }

    bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
                positive_inf_count == 0;
    if (!mute) {
      OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
    }
  }

 private:
  float lower_bound_;
  float upper_bound_;
  bool mute_if_healthy_;
};

// Identity op for tfdbg v2: Writes debug data using DebugEventsWriter.
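// Forwards its input unchanged and, as a side effect, writes a
// GraphExecutionTrace debug event to every file:// dump root listed in the
// debug_urls attribute.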
class DebugIdentityV2Op : public OpKernel {
 public:
  explicit DebugIdentityV2Op(OpKernelConstruction* context)
      : OpKernel(context),
        device_name_(context->device()->name()),
        output_slot_(-1),
        tensor_debug_mode_(0),
        tfdbg_run_id_() {
    std::vector<string> debug_urls;
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls));
    for (const string& debug_url : debug_urls) {
      if (absl::StartsWith(debug_url, DebugIO::kFileURLScheme)) {
        dump_roots_.emplace_back(
            debug_url.substr(strlen(DebugIO::kFileURLScheme)));
      } else {
        context->SetStatus(
            errors::Internal("Unsupported debug URL scheme in: ", debug_url));
      }
    }
    OP_REQUIRES_OK(context,
                   context->GetAttr("tfdbg_context_id", &tfdbg_context_id_));
    OP_REQUIRES_OK(context, context->GetAttr("op_name", &op_name_));
    OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    if (context->HasAttr("circular_buffer_size")) {
      OP_REQUIRES_OK(context, context->GetAttr("circular_buffer_size",
                                               &circular_buffer_size_));
    } else {
      circular_buffer_size_ =
          tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize;
    }
    if (context->HasAttr("tfdbg_run_id")) {
      OP_REQUIRES_OK(context, context->GetAttr("tfdbg_run_id", &tfdbg_run_id_));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    for (const string& dump_root : dump_roots_) {
      tfdbg::DebugEventsWriter* debug_events_writer =
          tfdbg::DebugEventsWriter::GetDebugEventsWriter(
              dump_root, tfdbg_run_id_, circular_buffer_size_);
      OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
                                  tfdbg_context_id_, device_name_, op_name_,
                                  output_slot_, tensor_debug_mode_, tensor));
    }
    context->set_output(0, tensor);
  }

 private:
  std::vector<string> dump_roots_;
  string tfdbg_context_id_;
  string device_name_;
  string op_name_;
  int32 output_slot_;
  int32 tensor_debug_mode_;
  int64_t circular_buffer_size_;
  string tfdbg_run_id_;
};

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename Tin, typename Tout>
struct CurtHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[1]);
};

extern template struct CurtHealthLaunch<Eigen::half, float>;
extern template struct CurtHealthLaunch<float, float>;
extern template struct CurtHealthLaunch<double, float>;
extern template struct CurtHealthLaunch<Eigen::half, double>;
extern template struct CurtHealthLaunch<float, double>;
extern template struct CurtHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct ConciseHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ConciseHealthLaunch<Eigen::half, float>;
extern template struct ConciseHealthLaunch<float, float>;
extern template struct ConciseHealthLaunch<double, float>;
extern template struct ConciseHealthLaunch<Eigen::half, double>;
extern template struct ConciseHealthLaunch<float, double>;
extern template struct ConciseHealthLaunch<double, double>;

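// Wrapper around the GPU kernel that fills the six
// [-inf count, +inf count, NaN count, negative count, zero count,
// positive count] slots used by the FULL_HEALTH mode of
// DebugNumericSummaryV2Op below.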
template <typename Tin, typename Tout>
struct FullHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[6]);
};

extern template struct FullHealthLaunch<Eigen::half, float>;
extern template struct FullHealthLaunch<float, float>;
extern template struct FullHealthLaunch<double, float>;
extern template struct FullHealthLaunch<Eigen::half, double>;
extern template struct FullHealthLaunch<float, double>;
extern template struct FullHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct ReduceInfNanThreeSlotsLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, double>;

#endif

template <typename Device, typename Tin, typename Tout>
class DebugNumericSummaryV2Op;

// Numeric summary op for tfdbg v2: CPU Kernel.
template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<CPUDevice, Tin, Tout> : public OpKernel {
 public:
  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    auto in = tensor.flat<Tin>();
    const Tin* data = in.data();
    const int64_t size = in.size();
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tout num_elem = static_cast<Tout>(context->input(0).NumElements());
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_));
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;  // Slot tensor id
      output_tensor->flat<Tout>()(1) = 0.0;        // Has inf or nan
      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            return Eigen::numext::isfinite(y) ? x : 1;
          });
      if (fp_props) {
        output_tensor->flat<Tout>()(1) = 1.0;
      }
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = num_elem;
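      // Slots 2, 3, and 4 receive the -inf, +inf, and NaN counts accumulated
      // in a single pass below.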
      // Accumulator value [neg_inf_count, pos_inf_count, nan_count]
      Tout fp_props[3] = {0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          // Do nothing: common case.
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(2) = fp_props[0];  // Slot for -inf count
      output_tensor->flat<Tout>()(3) = fp_props[1];  // Slot for inf count
      output_tensor->flat<Tout>()(4) = fp_props[2];  // Slot for nan count
    } else if (tensor_debug_mode_ == 4) {  // FULL HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = -1.0;  // TODO(144919262): Device ID
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(3) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(4) = num_elem;

      // Accumulator value [neg_inf_count, pos_inf_count, nan_count, neg_count,
      // zero_count, pos_count]
      Tout fp_props[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[3];
          } else if (y == static_cast<Tin>(0.f)) {
            ++fp_props[4];
          } else {
            ++fp_props[5];
          }
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(5) = fp_props[0];   // Slot for -inf count.
      output_tensor->flat<Tout>()(6) = fp_props[1];   // Slot for inf count.
      output_tensor->flat<Tout>()(7) = fp_props[2];   // Slot for nan count.
      output_tensor->flat<Tout>()(8) = fp_props[3];   // Slot for neg count.
      output_tensor->flat<Tout>()(9) = fp_props[4];   // Slot for zero count.
      output_tensor->flat<Tout>()(10) = fp_props[5];  // Slot for pos count.
    } else if (tensor_debug_mode_ == 5) {  // SHAPE
      TensorShape shape({10});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(3) = num_elem;

      // The tensor shape is stored in the remaining 6 slots:
      // if num_dims is less than 6, the shape is right-padded with zeros;
      // if num_dims is greater than 6, the head (left-most) dimensions are
      // truncated, since they tend to be more predictable than the last few
      // (e.g., batch size as the first dimension).
      int dim_idx = 4;
      for (int i = std::max(0, num_dims - kShapeDims);
           i < std::max(6, num_dims); ++i) {
        if (i < num_dims) {
          output_tensor->flat<Tout>()(dim_idx++) =
              static_cast<Tout>(tensor.dim_size(i));
        } else {
          output_tensor->flat<Tout>()(dim_idx++) = 0.0;
        }
      }
    } else if (tensor_debug_mode_ == 8) {  // REDUCE_INF_NAN_THREE_SLOTS.
      TensorShape shape({3});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = 0.0;  // Slot for -inf.
      output_tensor->flat<Tout>()(1) = 0.0;  // Slot for inf.
      output_tensor->flat<Tout>()(2) = 0.0;  // Slot for nan.
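      // Reduce the whole tensor to a 3-bit mask (kNegInfBit, kPosInfBit,
      // kNaNBit) in a single pass, then expand the mask into the three output
      // slots below.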
      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            int result = x;
            if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
              // Do nothing: common case.
            } else if (Eigen::numext::isinf(y)) {
              result |= y < static_cast<Tin>(0.f) ? kNegInfBit : kPosInfBit;
            } else if (Eigen::numext::isnan(y)) {
              result |= kNaNBit;
            }
            return result;
          });

      if (fp_props & kNegInfBit) {
        output_tensor->flat<Tout>()(0) = -std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kPosInfBit) {
        output_tensor->flat<Tout>()(1) = std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kNaNBit) {
        output_tensor->flat<Tout>()(2) = std::numeric_limits<Tout>::quiet_NaN();
      }
    } else {
      // TODO(cais): Implement other tensor debug modes in debug_event.proto.
      context->SetStatus(errors::Unimplemented(
          "Unimplemented tensor debug mode: ", tensor_debug_mode_));
    }
  }

 private:
  int tensor_debug_mode_;
  int64_t tensor_id_;
  static constexpr int kShapeDims = 6;
  static constexpr int kNegInfBit = 0x01;
  static constexpr int kPosInfBit = 0x02;
  static constexpr int kNaNBit = 0x04;
  static constexpr int64_t kMaxTensorId = 1LL
                                          << std::numeric_limits<Tout>::digits;
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<GPUDevice, Tin, Tout> : public AsyncOpKernel {
 public:
  typedef GPUDevice Device;

  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : AsyncOpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tensor& tensor = context->input(0);
    const Tout num_elem = static_cast<Tout>(tensor.NumElements());
    const Device& d = context->eigen_device<Device>();
    auto input = tensor.flat<Tin>();
    auto check_cb = [this, done]() { done(); };
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES_ASYNC(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_),
          done);
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH.
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemZero(&output_tensor_ptr, 2 * sizeof(Tout));
      // Copy tensor_id to slot zero
      stream->ThenMemcpy(&output_tensor_ptr, &tensor_id, sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
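      // Mirroring the CPU kernel above, CurtHealthLaunch sets slot 1 to 1.0
      // if any element is inf or NaN; slot 0 already holds the tensor id
      // copied above.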
      auto input = context->input(0).flat<Tin>();
      CurtHealthLaunch<Tin, Tout>().Run(d, input.data(), input.size(),
                                        output_tensor->flat<Tout>().data() + 1);

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH.
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      OP_REQUIRES_ASYNC(context, !tensorflow::OpDeterminismRequired(),
                        errors::Unimplemented(
                            "Determinism is not yet supported for "
                            "DebugNumericSummaryV2 when tensor_debug_mode is "
                            "CONCISE_HEALTH."),
                        done);

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 5 * sizeof(Tout));
      const Tout static_output[] = {tensor_id, num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 2 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      ConciseHealthLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(),
          output_tensor->flat<Tout>().data() + 2);

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 4) {  // FULL HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);
      OP_REQUIRES_ASYNC(context, !tensorflow::OpDeterminismRequired(),
                        errors::Unimplemented(
                            "Determinism is not yet supported for "
                            "DebugNumericSummaryV2 when tensor_debug_mode is "
                            "FULL_HEALTH."),
                        done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 11 * sizeof(Tout));

      int num_dims = tensor.dims();
      const Tout static_output[] = {tensor_id,
                                    -1.0,  // TODO(144919262): Device ID
                                    static_cast<Tout>(tensor.dtype()),
                                    static_cast<Tout>(num_dims), num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 5 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks and
      // pos/neg/zero counts.
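      // As in the CPU kernel, slots 5-10 receive the [-inf, +inf, NaN,
      // negative, zero, positive] counts; slots 0-4 hold the static header
      // copied above.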
      FullHealthLaunch<Tin, Tout>().Run(d, input.data(), input.size(),
                                        output_tensor->flat<Tout>().data() + 5);

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 5) {  // SHAPE
      TensorShape shape({10});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());

      int num_dims = tensor.dims();
      Tout static_output[10] = {tensor_id,
                                static_cast<Tout>(tensor.dtype()),
                                static_cast<Tout>(num_dims),
                                num_elem,
                                0.0,
                                0.0,
                                0.0,
                                0.0,
                                0.0,
                                0.0};
      // Tensor shape: right pad zeros, truncate head
      int dim_idx = 4;
      for (int i = std::max(0, num_dims - 6); i < num_dims; ++i) {
        static_output[dim_idx++] = static_cast<Tout>(tensor.dim_size(i));
      }
      // Write to device stream
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, sizeof(Tout) * 10);
      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 8) {  // REDUCE_INF_NAN_THREE_SLOTS.
      TensorShape shape({3});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0,
                           output_tensor->flat<Tout>().size() * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      auto input = context->input(0).flat<Tin>();
      ReduceInfNanThreeSlotsLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(), output_tensor->flat<Tout>().data());

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else {
      // TODO(cais): Implement other tensor debug modes in debug_event.proto.
      context->SetStatus(errors::Unimplemented(
          "Unimplemented tensor debug mode: ", tensor_debug_mode_));
      done();
    }
  }

 private:
  int tensor_debug_mode_;
  int64_t tensor_id_;
  static constexpr int64_t kMaxTensorId = 1L
                                          << std::numeric_limits<Tout>::digits;
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_