/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
#define TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_

#include <numeric>

#include "tensorflow/core/platform/bfloat16.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/util/determinism.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/core/platform/cuda.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm.h"
#endif

#include "tensorflow/core/debug/debug_io_utils.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/util/debug_events_writer.h"

namespace tensorflow {

// Copy op for debugging.
// Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the
// device on which the tensor is allocated.
class CopyOp : public OpKernel {
 public:
  explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));

    std::vector<string> debug_ops_spec;
    OP_REQUIRES_OK(context,
                   context->GetAttr("debug_ops_spec", &debug_ops_spec));
    for (const string& debug_op_spec : debug_ops_spec) {
      // Assume debug_op_spec has the format
      // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
      // DebugIdentity;grpc://localhost:3333;1
      const std::vector<string> items = str_util::Split(debug_op_spec, ";");
      OP_REQUIRES(
          context, items.size() == 3,
          errors::Internal(
              "Unexpected number of semicolons in debug_ops_spec element: ",
              debug_op_spec));
      debug_op_and_url_specs_.push_back(
          DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
                               items[1], items[2] == "1"));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& src_tensor = context->input(0);

    if (src_tensor.IsInitialized() &&
        DataTypeCanUseMemcpy(src_tensor.dtype()) &&
        DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
      // Source tensor is initialized and is mem-copyable. Make a copy.
      Tensor* copied_tensor;
      OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
                                                       &copied_tensor));

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on GPU).
      bool off_host_input = device->device_type() == DEVICE_GPU &&
                            !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        DeviceContext* device_ctxt = context->op_device_context();
        // Input is not on host: deep-copy it from GPU to the same GPU.
        Notification done_copy;
        GPUUtil::CopyGPUTensorToSameGPU(
            device, device_ctxt, &src_tensor, copied_tensor,
            [&done_copy](const Status& s) { done_copy.Notify(); });
        done_copy.WaitForNotification();
      } else {
        // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#else
      *copied_tensor = tensor::DeepCopy(src_tensor);
#endif
    } else {
      // Source tensor is NOT initialized and/or is not mem-copyable: Forward
      // the Tensor object.
      context->set_output(0, src_tensor);
    }
  }

  bool IsExpensive() override { return false; }

 private:
  string tensor_name_;
  std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
};
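
// Illustrative example (the tensor name and URL are made-up values) of how a
// single "debug_ops_spec" element is decomposed by the CopyOp constructor
// above, assuming
//   tensor_name_  = "dense/BiasAdd:0"
//   debug_op_spec = "DebugIdentity;grpc://localhost:3333;1"
// The resulting DebugWatchAndURLSpec would carry
//   watch_key  = "dense/BiasAdd:0:DebugIdentity"
//   url        = "grpc://localhost:3333"
//   gated_grpc = true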

// Base class of all debug ops.
class BaseDebugOp : public OpKernel {
 public:
  explicit BaseDebugOp(const string& debug_op_name,
                       OpKernelConstruction* context)
      : OpKernel(context), debug_op_name_(debug_op_name) {
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
    OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));

    string device_name;
    string tensor_name;
    OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name));

    std::vector<string> name_items = str_util::Split(tensor_name, ':');
    string node_name;
    int32_t output_slot = 0;
    OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2,
                errors::InvalidArgument("Failed to parse tensor name: \"",
                                        tensor_name, "\""));
    if (name_items.size() == 2) {
      node_name = name_items[0];
      OP_REQUIRES(
          context, strings::safe_strto32(name_items[1], &output_slot),
          errors::InvalidArgument("Invalid string value for output_slot: \"",
                                  name_items[1], "\""));
    } else if (name_items.size() == 1) {
      node_name = name_items[0];
    }

    debug_watch_key_.reset(
        new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_));
  }

  bool IsExpensive() override { return false; }

 protected:
  // Apply gRPC gating (if gated_grpc_ attribute is true).
  //
  // Returns false if and only if all grpc:// debug URLs of the debug op are
  // currently disabled (i.e., gated off), in which case the debug op will emit
  // an empty (size {0}) tensor of undefined data type.
  bool ApplyGrpcGating(OpKernelContext* context) {
    if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(
                           debug_watch_key_->debug_node_name, debug_urls_)) {
      // The entire node is gated off: Output an empty tensor and avoid
      // expensive computation.
      Tensor* output_tensor;
      TensorShape shape({0});
      if (!context->allocate_output(0, shape, &output_tensor).ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to allocate empty tensor under gated-off state.";
      }
      return false;
    } else {
      return true;
    }
  }

  // Publish a tensor to all debug URLs of the debug op.
  // Log an error if the publishing failed.
  Status PublishTensor(const Tensor& tensor) {
    if (debug_urls_.empty()) {
      return OkStatus();
    } else {
      Status status = DebugIO::PublishDebugTensor(*debug_watch_key_, tensor,
                                                  Env::Default()->NowMicros(),
                                                  debug_urls_, gated_grpc_);
      if (!status.ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to publish debug tensor data to all URLs "
                   << str_util::Join(debug_urls_, ", ")
                   << ", due to: " << status.error_message();
      }
      return status;
    }
  }

 private:
  const string debug_op_name_;
  std::unique_ptr<DebugNodeKey> debug_watch_key_;
  std::vector<string> debug_urls_;
  bool gated_grpc_;
};
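
// Example (illustrative; the node name is made up) of the tensor-name parsing
// done in the BaseDebugOp constructor above:
//   tensor_name = "dense/BiasAdd:1" -> node_name = "dense/BiasAdd",
//                                      output_slot = 1
//   tensor_name = "dense/BiasAdd"   -> node_name = "dense/BiasAdd",
//                                      output_slot = 0 (default)
// The parsed pieces, together with device_name and the debug op name, form
// the DebugNodeKey that serves as the watch key for gating and publishing.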

// Identity op for debugging.
//   Output slot 0 carries the debug signal and is always allocated on the
//   host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
//   the debug signal is equal to the input tensor.
class DebugIdentityOp : public BaseDebugOp {
 public:
  explicit DebugIdentityOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugIdentity", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    OP_REQUIRES_OK(context, PublishTensor(context->input(0)));
    context->set_output(0, context->input(0));
  }
};
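
// Minimal registration sketch, for orientation only. The actual kernel
// registrations live outside this header (e.g., in debug_ops.cc) and may
// carry additional device and HostMemory constraints; the line below is an
// assumption about their general shape, not a copy of them.
//
//   REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU),
//                           DebugIdentityOp);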

// NaN-counter op for debugging.
template <typename T>
class DebugNanCountOp : public BaseDebugOp {
 public:
  explicit DebugNanCountOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNanCount", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
    int64_t nan_count = 0;

    // If the input is an uninitialized tensor, let nan_count be 0.
    if (input.IsInitialized()) {
      // Count NaNs.
      const TensorShape& input_shape = input.shape();
      const T* input_flat = input.template flat<T>().data();

      for (int64_t i = 0; i < input_shape.num_elements(); ++i) {
        if (Eigen::numext::isnan(static_cast<double>(input_flat[i]))) {
          nan_count++;
        }
      }
    }

    TensorShape shape({1});
    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<int64_t>()(0) = nan_count;
    OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
  }
};

// Numeric summary op for debugging.
template <typename T>
class DebugNumericSummaryOp : public BaseDebugOp {
 public:
  explicit DebugNumericSummaryOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNumericSummary", context) {
    OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
    OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("mute_if_healthy", &mute_if_healthy_));
  }

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    int64_t is_initialized = 0;
    int64_t element_count = 0;
    int64_t negative_inf_count = 0;
    int64_t negative_count = 0;
    int64_t zero_count = 0;
    int64_t positive_count = 0;
    int64_t positive_inf_count = 0;
    int64_t nan_count = 0;
    double min = std::numeric_limits<double>::infinity();
    double max = -std::numeric_limits<double>::infinity();
    double sum = 0.0;
    double mean = std::numeric_limits<double>::quiet_NaN();
    double variance = std::numeric_limits<double>::quiet_NaN();

    // Equal to negative_count + zero_count + positive_count.
    int64_t non_inf_nan_count = 0;

    const TensorShape& input_shape = input.shape();
    if (input.IsInitialized()) {
      is_initialized = 1;
      const T* input_flat = input.template flat<T>().data();

      element_count = input_shape.num_elements();
      const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
      const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);

      for (int64_t i = 0; i < element_count; ++i) {
        const double x = static_cast<double>(input_flat[i]);
        if (Eigen::numext::isnan(x)) {
          nan_count++;
        } else if (Eigen::numext::isinf(x)) {
          if (x < 0.0) {
            negative_inf_count++;
          } else {
            positive_inf_count++;
          }
        } else {
          if (is_lower_bound_custom && x <= lower_bound_) {
            negative_inf_count++;
          } else if (is_upper_bound_custom && x >= upper_bound_) {
            positive_inf_count++;
          } else if (x < 0.0) {
            negative_count++;
          } else if (x > 0.0) {
            positive_count++;
          } else {
            zero_count++;
          }

          if (x < min) {
            min = x;
          }
          if (x > max) {
            max = x;
          }

          non_inf_nan_count++;
          sum += x;
        }
      }

      if (non_inf_nan_count > 0) {
        mean = sum / non_inf_nan_count;

        // Do a second pass to compute variance.
        variance = 0.0;
        for (int64_t i = 0; i < element_count; ++i) {
          const double x = static_cast<double>(input_flat[i]);
          if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) {
            variance += (x - mean) * (x - mean);
          }
        }
        variance /= non_inf_nan_count;
      }
    }

    TensorShape shape({14 + input_shape.dims()});
    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
    output_tensor->vec<double>()(1) = static_cast<double>(element_count);
    output_tensor->vec<double>()(2) = static_cast<double>(nan_count);
    output_tensor->vec<double>()(3) = static_cast<double>(negative_inf_count);
    output_tensor->vec<double>()(4) = static_cast<double>(negative_count);
    output_tensor->vec<double>()(5) = static_cast<double>(zero_count);
    output_tensor->vec<double>()(6) = static_cast<double>(positive_count);
    output_tensor->vec<double>()(7) = static_cast<double>(positive_inf_count);
    output_tensor->vec<double>()(8) = min;
    output_tensor->vec<double>()(9) = max;
    output_tensor->vec<double>()(10) = mean;
    output_tensor->vec<double>()(11) = variance;

    output_tensor->vec<double>()(12) = static_cast<double>(input.dtype());
    output_tensor->vec<double>()(13) = static_cast<double>(input_shape.dims());
    for (size_t d = 0; d < input_shape.dims(); ++d) {
      output_tensor->vec<double>()(14 + d) =
          static_cast<double>(input_shape.dim_sizes()[d]);
    }

    bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
                positive_inf_count == 0;
    if (!mute) {
      OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
    }
  }

 private:
  float lower_bound_;
  float upper_bound_;
  bool mute_if_healthy_;
};
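
// Layout of the summary vector assembled by DebugNumericSummaryOp::Compute
// above (length 14 + ndims):
//   [0] is_initialized   [1] element_count   [2] nan_count
//   [3] negative_inf_count (or count of elements <= lower_bound)
//   [4] negative_count    [5] zero_count     [6] positive_count
//   [7] positive_inf_count (or count of elements >= upper_bound)
//   [8] min   [9] max   [10] mean   [11] variance
//   [12] dtype (as DataType enum value)   [13] ndims
//   [14...] the dimension sizes of the input shape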

// Identity op for tfdbg v2: Writes debug data using DebugEventsWriter.
class DebugIdentityV2Op : public OpKernel {
 public:
  explicit DebugIdentityV2Op(OpKernelConstruction* context)
      : OpKernel(context),
        device_name_(context->device()->name()),
        output_slot_(-1),
        tensor_debug_mode_(0),
        tfdbg_run_id_() {
    std::vector<string> debug_urls;
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls));
    for (const string& debug_url : debug_urls) {
      if (absl::StartsWith(debug_url, DebugIO::kFileURLScheme)) {
        dump_roots_.emplace_back(
            debug_url.substr(strlen(DebugIO::kFileURLScheme)));
      } else {
        context->SetStatus(
            errors::Internal("Unsupported debug URL schema in: ", debug_url));
      }
    }
    OP_REQUIRES_OK(context,
                   context->GetAttr("tfdbg_context_id", &tfdbg_context_id_));
    OP_REQUIRES_OK(context, context->GetAttr("op_name", &op_name_));
    OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    if (context->HasAttr("circular_buffer_size")) {
      OP_REQUIRES_OK(context, context->GetAttr("circular_buffer_size",
                                               &circular_buffer_size_));
    } else {
      circular_buffer_size_ =
          tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize;
    }
    if (context->HasAttr("tfdbg_run_id")) {
      OP_REQUIRES_OK(context, context->GetAttr("tfdbg_run_id", &tfdbg_run_id_));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    for (const string& dump_root : dump_roots_) {
      tfdbg::DebugEventsWriter* debug_events_writer =
          tfdbg::DebugEventsWriter::GetDebugEventsWriter(
              dump_root, tfdbg_run_id_, circular_buffer_size_);
      OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
                                  tfdbg_context_id_, device_name_, op_name_,
                                  output_slot_, tensor_debug_mode_, tensor));
    }
    context->set_output(0, tensor);
  }

 private:
  std::vector<string> dump_roots_;
  string tfdbg_context_id_;
  string device_name_;
  string op_name_;
  int32 output_slot_;
  int32 tensor_debug_mode_;
  int64_t circular_buffer_size_;
  string tfdbg_run_id_;
};
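
// Sketch of how dump roots are derived in the constructor above, assuming
// DebugIO::kFileURLScheme is the "file://" prefix (an assumption here; see
// debug_io_utils.h for the authoritative value). With
//   debug_urls = {"file:///tmp/tfdbg2"}
// the op would record dump_roots_ = {"/tmp/tfdbg2"} and write DebugEvent
// files under that directory via tfdbg::DebugEventsWriter.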

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename Tin, typename Tout>
struct CurtHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[1]);
};

extern template struct CurtHealthLaunch<Eigen::half, float>;
extern template struct CurtHealthLaunch<float, float>;
extern template struct CurtHealthLaunch<double, float>;
extern template struct CurtHealthLaunch<Eigen::half, double>;
extern template struct CurtHealthLaunch<float, double>;
extern template struct CurtHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct ConciseHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ConciseHealthLaunch<Eigen::half, float>;
extern template struct ConciseHealthLaunch<float, float>;
extern template struct ConciseHealthLaunch<double, float>;
extern template struct ConciseHealthLaunch<Eigen::half, double>;
extern template struct ConciseHealthLaunch<float, double>;
extern template struct ConciseHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct FullHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[6]);
};

extern template struct FullHealthLaunch<Eigen::half, float>;
extern template struct FullHealthLaunch<float, float>;
extern template struct FullHealthLaunch<double, float>;
extern template struct FullHealthLaunch<Eigen::half, double>;
extern template struct FullHealthLaunch<float, double>;
extern template struct FullHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct ReduceInfNanThreeSlotsLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, double>;

#endif

template <typename Device, typename Tin, typename Tout>
class DebugNumericSummaryV2Op;

// Numeric summary op for tfdbg v2: CPU Kernel.
template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<CPUDevice, Tin, Tout> : public OpKernel {
 public:
  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    auto in = tensor.flat<Tin>();
    const Tin* data = in.data();
    const int64_t size = in.size();
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tout num_elem = static_cast<Tout>(context->input(0).NumElements());
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_));
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;  // Slot tensor id
      output_tensor->flat<Tout>()(1) = 0.0;        // Has inf or nan
      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            return Eigen::numext::isfinite(y) ? x : 1;
          });
      if (fp_props) {
        output_tensor->flat<Tout>()(1) = 1.0;
      }
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = num_elem;

      // Accumulator value [neg_inf_count, pos_inf_count, nan_count]
      Tout fp_props[3] = {0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          // Do nothing: common case.
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(2) = fp_props[0];  // Slot for -inf count
      output_tensor->flat<Tout>()(3) = fp_props[1];  // Slot for inf count
      output_tensor->flat<Tout>()(4) = fp_props[2];  // Slot for nan count
    } else if (tensor_debug_mode_ == 4) {            // FULL HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = -1.0;  // TODO(144919262): Device ID
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(3) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(4) = num_elem;

      // Accumulator value [neg_inf_count, pos_inf_count, nan_count, neg_count,
      //                   zero_count, pos_count]
      Tout fp_props[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[3];
          } else if (y == static_cast<Tin>(0.f)) {
            ++fp_props[4];
          } else {
            ++fp_props[5];
          }
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(5) = fp_props[0];   // Slot for -inf count
      output_tensor->flat<Tout>()(6) = fp_props[1];   // Slot for inf count
      output_tensor->flat<Tout>()(7) = fp_props[2];   // Slot for nan count.
      output_tensor->flat<Tout>()(8) = fp_props[3];   // Slot for neg count.
      output_tensor->flat<Tout>()(9) = fp_props[4];   // Slot for zero count.
      output_tensor->flat<Tout>()(10) = fp_props[5];  // Slot for pos count.
    } else if (tensor_debug_mode_ == 5) {             // SHAPE
      TensorShape shape({10});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(3) = num_elem;

      // Tensor shape - stored as (6 columns)
      // if num_dim is less than 6, we right pad the shape with zeros
      // if num_dim is greater than 6, we truncate the head (left most) of the
      // dimensions as they are more predictable than the last few (e.g. batch
      // size as first dimension)
      int dim_idx = 4;
      for (int i = std::max(0, num_dims - kShapeDims);
           i < std::max(6, num_dims); ++i) {
        if (i < num_dims) {
          output_tensor->flat<Tout>()(dim_idx++) =
              static_cast<Tout>(tensor.dim_size(i));
        } else {
          output_tensor->flat<Tout>()(dim_idx++) = 0.0;
        }
      }
    } else if (tensor_debug_mode_ == 8) {  // REDUCE_INF_NAN_THREE_SLOTS.
      TensorShape shape({3});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = 0.0;  // Slot for -inf.
      output_tensor->flat<Tout>()(1) = 0.0;  // Slot for inf.
      output_tensor->flat<Tout>()(2) = 0.0;  // Slot for nan.

      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            int result = x;
            if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
              // Do nothing: common case.
            } else if (Eigen::numext::isinf(y)) {
              result |= y < static_cast<Tin>(0.f) ? kNegInfBit : kPosInfBit;
            } else if (Eigen::numext::isnan(y)) {
              result |= kNaNBit;
            }
            return result;
          });

      if (fp_props & kNegInfBit) {
        output_tensor->flat<Tout>()(0) = -std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kPosInfBit) {
        output_tensor->flat<Tout>()(1) = std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kNaNBit) {
        output_tensor->flat<Tout>()(2) = std::numeric_limits<Tout>::quiet_NaN();
      }
    } else {
      // TODO(cais): Implement other tensor debug modes in debug_event.proto.
      context->SetStatus(errors::Unimplemented(
          "Unimplemented tensor debug mode: ", tensor_debug_mode_));
    }
  }

 private:
  int tensor_debug_mode_;
  int64_t tensor_id_;
  static constexpr int kShapeDims = 6;
  static constexpr int kNegInfBit = 0x01;
  static constexpr int kPosInfBit = 0x02;
  static constexpr int kNaNBit = 0x04;
  static constexpr int64_t kMaxTensorId = 1LL
                                          << std::numeric_limits<Tout>::digits;
};
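
// tensor_debug_mode values handled above and the corresponding output shapes
// (the GPU kernel below emits the same shapes):
//   2 (CURT_HEALTH)    -> shape [2]:  {tensor_id, any_inf_or_nan}
//   3 (CONCISE_HEALTH) -> shape [5]:  {tensor_id, num_elem,
//                                      -inf/+inf/nan counts}
//   4 (FULL_HEALTH)    -> shape [11]: adds a device-id slot, dtype, ndims,
//                                     and neg/zero/pos counts
//   5 (SHAPE)          -> shape [10]: {tensor_id, dtype, ndims, num_elem,
//                                      up to 6 trailing dim sizes}
//   8 (REDUCE_INF_NAN_THREE_SLOTS) -> shape [3]: {-inf, +inf, nan} slots
// kMaxTensorId evaluates to 2^53 when Tout is double and 2^24 when Tout is
// float (std::numeric_limits<Tout>::digits).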

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<GPUDevice, Tin, Tout> : public AsyncOpKernel {
 public:
  typedef GPUDevice Device;

  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : AsyncOpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tensor& tensor = context->input(0);
    const Tout num_elem = static_cast<Tout>(tensor.NumElements());
    const Device& d = context->eigen_device<Device>();
    auto input = tensor.flat<Tin>();
    auto check_cb = [this, done]() { done(); };
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES_ASYNC(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_),
          done);
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH.
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemZero(&output_tensor_ptr, 2 * sizeof(Tout));
      // Copy tensor_id to slot zero
      stream->ThenMemcpy(&output_tensor_ptr, &tensor_id, sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      auto input = context->input(0).flat<Tin>();
      CurtHealthLaunch<Tin, Tout>().Run(d, input.data(), input.size(),
                                        output_tensor->flat<Tout>().data() + 1);

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH.
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      OP_REQUIRES_ASYNC(context, !tensorflow::OpDeterminismRequired(),
                        errors::Unimplemented(
                            "Determinism is not yet supported for "
                            "DebugNumericSummaryV2 when tensor_debug_mode is "
                            "CONCISE_HEALTH."),
                        done);

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 5 * sizeof(Tout));
      const Tout static_output[] = {tensor_id, num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 2 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      ConciseHealthLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(),
          output_tensor->flat<Tout>().data() + 2);

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 4) {  // FULL HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);
      OP_REQUIRES_ASYNC(context, !tensorflow::OpDeterminismRequired(),
                        errors::Unimplemented(
                            "Determinism is not yet supported for "
                            "DebugNumericSummaryV2 when tensor_debug_mode is "
                            "FULL_HEALTH."),
                        done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 11 * sizeof(Tout));

      int num_dims = tensor.dims();
      const Tout static_output[] = {tensor_id,
                                    -1.0,  // TODO(144919262): Device ID
                                    static_cast<Tout>(tensor.dtype()),
                                    static_cast<Tout>(num_dims), num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 5 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks and
      // pos/neg/zero counts.
      FullHealthLaunch<Tin, Tout>().Run(d, input.data(), input.size(),
                                        output_tensor->flat<Tout>().data() + 5);

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 5) {  // SHAPE
      TensorShape shape({10});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());

      int num_dims = tensor.dims();
      Tout static_output[10] = {tensor_id,
                                static_cast<Tout>(tensor.dtype()),
                                static_cast<Tout>(num_dims),
                                num_elem,
                                0.0,
                                0.0,
                                0.0,
                                0.0,
                                0.0,
                                0.0};
      // Tensor shape: right pad zeros, truncate head
      int dim_idx = 4;
      for (int i = std::max(0, num_dims - 6); i < num_dims; ++i) {
        static_output[dim_idx++] = static_cast<Tout>(tensor.dim_size(i));
      }
      // Write to device stream
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, sizeof(Tout) * 10);
      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 8) {  // REDUCE_INF_NAN_THREE_SLOTS.
      TensorShape shape({3});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0,
                           output_tensor->flat<Tout>().size() * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      auto input = context->input(0).flat<Tin>();
      ReduceInfNanThreeSlotsLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(), output_tensor->flat<Tout>().data());

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else {
      // TODO(cais): Implement other tensor debug modes in debug_event.proto.
      context->SetStatus(errors::Unimplemented(
          "Unimplemented tensor debug mode: ", tensor_debug_mode_));
      done();
    }
  }

 private:
  int tensor_debug_mode_;
  int64_t tensor_id_;
  static constexpr int64_t kMaxTensorId = 1LL
                                          << std::numeric_limits<Tout>::digits;
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_