xref: /aosp_15_r20/external/tensorflow/tensorflow/python/debug/lib/session_debug_testlib.py (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Tests for debugger functionalities in tf.Session."""
16import collections
17import functools
18import glob
19import os
20import tempfile
21import threading
22
23import numpy as np
24
25from tensorflow.core.protobuf import config_pb2
26from tensorflow.core.protobuf import rewriter_config_pb2
27from tensorflow.core.util import event_pb2
28from tensorflow.python.client import session
29from tensorflow.python.debug.lib import debug_data
30from tensorflow.python.debug.lib import debug_graphs
31from tensorflow.python.debug.lib import debug_utils
32from tensorflow.python.framework import constant_op
33from tensorflow.python.framework import dtypes
34from tensorflow.python.framework import errors
35from tensorflow.python.framework import ops
36from tensorflow.python.framework import test_util
37from tensorflow.python.lib.io import file_io
38from tensorflow.python.ops import array_ops
39from tensorflow.python.ops import control_flow_ops
40from tensorflow.python.ops import data_flow_ops
41from tensorflow.python.ops import math_ops
42from tensorflow.python.ops import parsing_ops
43from tensorflow.python.ops import rnn
44from tensorflow.python.ops import rnn_cell_impl
45from tensorflow.python.ops import state_ops
46from tensorflow.python.ops import variables
47import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
48from tensorflow.python.platform import googletest
49from tensorflow.python.platform import test
50from tensorflow.python.training import gradient_descent
51
52
def no_rewrite_session_config():
  """Builds a session config with selected graph rewrites turned off.

  Returns:
    A `ConfigProto` whose rewrite options disable model pruning and switch
    the arithmetic and dependency optimizations to `OFF`.
  """
  optimization_off = rewriter_config_pb2.RewriterConfig.OFF
  rewrite_options = rewriter_config_pb2.RewriterConfig(
      disable_model_pruning=True,
      arithmetic_optimization=optimization_off,
      dependency_optimization=optimization_off)
  return config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(rewrite_options=rewrite_options))
60
61
class _RNNCellForTest(rnn_cell_impl.RNNCell):
  """Minimal RNN cell for tests.

  The cell multiplies its input by a single scalar variable named "w" and
  returns the incoming state unchanged.
  """

  def __init__(self, input_output_size, state_size):
    """Stores the reported sizes and creates the weight variable.

    Args:
      input_output_size: value returned by the `output_size` property.
      state_size: value returned by the `state_size` property.
    """
    self._input_output_size = input_output_size
    self._state_size = state_size
    self._w = variables.VariableV1(1.0, dtype=dtypes.float32, name="w")

  @property
  def state_size(self):
    return self._state_size

  @property
  def output_size(self):
    return self._input_output_size

  def __call__(self, input_, state, scope=None):
    """Returns `(w * input_, state)`; `scope` is accepted but not used."""
    scaled_input = math_ops.multiply(self._w, input_)
    return (scaled_input, state)
80
81
82@test_util.run_v1_only("b/120545219")
83class SessionDebugTestBase(test_util.TensorFlowTestCase):
84  """Base class for unit tests of tfdbg running with tf.Session."""
85
86  @classmethod
87  def setUpClass(cls):
88    if test.is_gpu_available():
89      cls._expected_partition_graph_count = 2
90      cls._expected_num_devices = 2
91      gpu_name = test_util.gpu_device_name()
92      cls._main_device = "/job:localhost/replica:0/task:0" + gpu_name
93    else:
94      cls._expected_partition_graph_count = 1
95      cls._expected_num_devices = 1
96      cls._main_device = "/job:localhost/replica:0/task:0/device:CPU:0"
97
98  @classmethod
99  def tearDownClass(cls):
100    pass
101
102  def setUp(self):
103    self._dump_root = tempfile.mkdtemp()
104
105  def tearDown(self):
106    ops.reset_default_graph()
107
108    # Tear down temporary dump directory.
109    if os.path.isdir(self._dump_root):
110      file_io.delete_recursively(self._dump_root)
111
112  def _debug_urls(self, run_number=None):
113    raise NotImplementedError(
114        "_debug_urls() method is not implemented in the base test class.")
115
116  def _debug_dump_dir(self, run_number=None):
117    raise NotImplementedError(
118        "_debug_dump_dir() method is not implemented in the base test class.")
119
120  def _debug_run_and_get_dump(self,
121                              sess,
122                              fetches,
123                              feed_dict=None,
124                              debug_ops="DebugIdentity",
125                              tolerate_debug_op_creation_failures=False,
126                              global_step=-1,
127                              validate=True,
128                              expected_partition_graph_count=None):
129    """Run fetches with debugging and obtain DebugDumpDir.
130
131    Args:
132      sess: the tf.compat.v1.Session to be used.
133      fetches: fetches of the Session.run().
134      feed_dict: feed dict for the Session.run().
135      debug_ops: name(s) of the debug ops to be used.
136      tolerate_debug_op_creation_failures: whether to tolerate debug op
137        creation failures.
138      global_step: Optional global step.
139      validate: whether to validate dumped tensors against graph.
140      expected_partition_graph_count: optional count of partition graphs to
141        assert on.
142
143    Returns:
144      1. Return values of the Session.run().
145      2. The DebugDumpDir object from the debugged run().
146    """
147
148    run_options = config_pb2.RunOptions(output_partition_graphs=True)
149    debug_utils.watch_graph(
150        run_options,
151        sess.graph,
152        debug_ops=debug_ops,
153        debug_urls=self._debug_urls(),
154        tolerate_debug_op_creation_failures=tolerate_debug_op_creation_failures,
155        global_step=global_step)
156    run_metadata = config_pb2.RunMetadata()
157    run_output = sess.run(fetches,
158                          feed_dict=feed_dict,
159                          options=run_options,
160                          run_metadata=run_metadata)
161
162    if expected_partition_graph_count is not None:
163      self.assertEqual(expected_partition_graph_count,
164                       len(run_metadata.partition_graphs))
165    return run_output, debug_data.DebugDumpDir(
166        self._dump_root, partition_graphs=run_metadata.partition_graphs,
167        validate=validate)
168
  def _generate_dump_from_simple_addition_graph(self):
    """Runs a small u x v matmul graph under the debugger and loads its dump.

    Two variables, "u" (2x2) and "v" (2x1), are created and multiplied into
    "w". Only the "u/read" and "v/read" tensors are watched, using a file://
    URL built directly from `self._dump_root`.

    Returns:
      A `SimpleAddResults` namedtuple with fields `u_init_val`, `v_init_val`,
      `u`, `v`, `w`, `u_name`, `v_name`, `w_name` and `dump` (the loaded
      DebugDumpDir).
    """
    with session.Session(config=no_rewrite_session_config()) as sess:
      u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
      v_init_val = np.array([[2.0], [-1.0]])

      # Use node names with overlapping namespace (i.e., parent directory) to
      # test concurrent, non-racing directory creation.
      # NOTE(review): the names below share no parent name scope, so the
      # statement above may be stale -- confirm.
      u_name = "u"
      v_name = "v"
      w_name = "w"

      u_init = constant_op.constant(u_init_val, shape=[2, 2])
      u = variables.VariableV1(u_init, name=u_name)
      v_init = constant_op.constant(v_init_val, shape=[2, 1])
      v = variables.VariableV1(v_init, name=v_name)

      w = math_ops.matmul(u, v, name=w_name)

      u.initializer.run()
      v.initializer.run()

      run_options = config_pb2.RunOptions(output_partition_graphs=True)
      # Note: this helper builds its own file:// URL instead of calling
      # self._debug_urls().
      debug_urls = "file://%s" % self._dump_root

      # Add debug tensor watch for u.
      debug_utils.add_debug_tensor_watch(
          run_options, "%s/read" % u_name, 0, debug_urls=debug_urls)
      # Add debug tensor watch for v.
      debug_utils.add_debug_tensor_watch(
          run_options, "%s/read" % v_name, 0, debug_urls=debug_urls)

      run_metadata = config_pb2.RunMetadata()

      # Invoke Session.run().
      sess.run(w, options=run_options, run_metadata=run_metadata)

      self.assertEqual(self._expected_partition_graph_count,
                       len(run_metadata.partition_graphs))

      dump = debug_data.DebugDumpDir(
          self._dump_root, partition_graphs=run_metadata.partition_graphs)

    # Bundle everything the callers inspect into a single named record.
    simple_add_results = collections.namedtuple("SimpleAddResults", [
        "u_init_val", "v_init_val", "u", "v", "w", "u_name", "v_name", "w_name",
        "dump"
    ])
    return simple_add_results(u_init_val, v_init_val, u, v, w, u_name, v_name,
                              w_name, dump)
217
  def testCopyNodesHaveCorrectDebugOpsAndURLsAttributeValues(self):
    """Checks the `debug_ops_spec` attribute of the copy nodes for u and v.

    "u" is watched with two debug ops (one of them with gated_grpc=True) and
    "v" with one; the copy nodes found in the partition graphs must list
    exactly those ops together with the first debug URL.
    """
    with session.Session() as sess:
      u = variables.VariableV1(2.1, name="u")
      v = variables.VariableV1(20.0, name="v")
      w = math_ops.multiply(u, v, name="w")

      sess.run(variables.global_variables_initializer())

      run_options = config_pb2.RunOptions(output_partition_graphs=True)
      debug_urls = self._debug_urls()
      debug_utils.add_debug_tensor_watch(
          run_options,
          "u",
          0, ["DebugNumericSummary(gated_grpc=True)", "DebugIdentity"],
          debug_urls=debug_urls)
      debug_utils.add_debug_tensor_watch(
          run_options, "v", 0, ["DebugNumericSummary"], debug_urls=debug_urls)

      run_metadata = config_pb2.RunMetadata()
      r = sess.run(w, options=run_options, run_metadata=run_metadata)
      self.assertAllClose(42.0, r)

      # Locate the copy nodes for u:0 and v:0 across all partition graphs.
      u_copy_node_def = None
      v_copy_node_def = None
      for partition_graph in run_metadata.partition_graphs:
        for node_def in partition_graph.node:
          if debug_graphs.is_copy_node(node_def.name):
            if node_def.name == "__copy_u_0":
              u_copy_node_def = node_def
            elif node_def.name == "__copy_v_0":
              v_copy_node_def = node_def

      # Each spec entry is compared against "<debug op>;<url>;<0 or 1>".
      # NOTE(review): the trailing field presumably mirrors gated_grpc (it is 1
      # only for the op watched with gated_grpc=True) -- confirm.
      self.assertIsNotNone(u_copy_node_def)
      debug_ops_spec = u_copy_node_def.attr["debug_ops_spec"].list.s
      self.assertEqual(2, len(debug_ops_spec))
      self.assertEqual("DebugNumericSummary;%s;1" % debug_urls[0],
                       debug_ops_spec[0].decode("utf-8"))
      self.assertEqual("DebugIdentity;%s;0" % debug_urls[0],
                       debug_ops_spec[1].decode("utf-8"))

      self.assertIsNotNone(v_copy_node_def)
      debug_ops_spec = v_copy_node_def.attr["debug_ops_spec"].list.s
      self.assertEqual(1, len(debug_ops_spec))
      self.assertEqual("DebugNumericSummary;%s;0" % debug_urls[0],
                       debug_ops_spec[0].decode("utf-8"))
263
264  def testConcurrentDumpingToPathsWithOverlappingParentDirsWorks(self):
265    results = self._generate_dump_from_simple_addition_graph()
266    self.assertTrue(results.dump.loaded_partition_graphs())
267
268    # Since global_step is not explicitly specified, it should take its default
269    # value: -1.
270    self.assertEqual(-1, results.dump.core_metadata.global_step)
271    self.assertGreaterEqual(results.dump.core_metadata.session_run_index, 0)
272    self.assertGreaterEqual(results.dump.core_metadata.executor_step_index, 0)
273    self.assertEqual([], results.dump.core_metadata.input_names)
274    self.assertEqual([results.w.name], results.dump.core_metadata.output_names)
275    self.assertEqual([], results.dump.core_metadata.target_nodes)
276
277    # Verify the dumped tensor values for u and v.
278    self.assertEqual(2, results.dump.size)
279
280    self.assertAllClose([results.u_init_val],
281                        results.dump.get_tensors("%s/read" % results.u_name, 0,
282                                                 "DebugIdentity"))
283    self.assertAllClose([results.v_init_val],
284                        results.dump.get_tensors("%s/read" % results.v_name, 0,
285                                                 "DebugIdentity"))
286
287    self.assertGreaterEqual(
288        results.dump.get_rel_timestamps("%s/read" % results.u_name, 0,
289                                        "DebugIdentity")[0], 0)
290    self.assertGreaterEqual(
291        results.dump.get_rel_timestamps("%s/read" % results.v_name, 0,
292                                        "DebugIdentity")[0], 0)
293
294    self.assertGreater(
295        results.dump.get_dump_sizes_bytes("%s/read" % results.u_name, 0,
296                                          "DebugIdentity")[0], 0)
297    self.assertGreater(
298        results.dump.get_dump_sizes_bytes("%s/read" % results.v_name, 0,
299                                          "DebugIdentity")[0], 0)
300
301  def testGetOpTypeWorks(self):
302    results = self._generate_dump_from_simple_addition_graph()
303
304    self.assertEqual(results.u.op.type,
305                     results.dump.node_op_type(results.u_name))
306    self.assertIn(results.v.op.type, results.dump.node_op_type(results.v_name))
307    self.assertIn(results.w.op.type, results.dump.node_op_type(results.w_name))
308
309    with self.assertRaisesRegexp(
310        ValueError, r"None of the .* device\(s\) has a node named "):
311      results.dump.node_op_type("foo_bar")
312
  def testDumpStringTensorsWorks(self):
    """Checks that string-valued tensors can be dumped and read back.

    Two string variables are added together; the "read" tensors of both are
    watched and their dumped values, timestamps and dump sizes are verified.
    """
    with session.Session(config=no_rewrite_session_config()) as sess:
      str1_init_val = np.array(b"abc")
      str2_init_val = np.array(b"def")

      str1_init = constant_op.constant(str1_init_val)
      str2_init = constant_op.constant(str2_init_val)

      str1_name = "str1"
      str2_name = "str2"
      str1 = variables.VariableV1(str1_init, name=str1_name)
      str2 = variables.VariableV1(str2_init, name=str2_name)
      # Concatenate str1 and str2
      str_concat = math_ops.add(str1, str2, name="str_concat")

      str1.initializer.run()
      str2.initializer.run()

      run_options = config_pb2.RunOptions(output_partition_graphs=True)
      debug_urls = self._debug_urls()

      # Add debug tensor watch for str1/read.
      debug_utils.add_debug_tensor_watch(
          run_options, "%s/read" % str1_name, 0, debug_urls=debug_urls)
      # Add debug tensor watch for str2/read.
      debug_utils.add_debug_tensor_watch(
          run_options, "%s/read" % str2_name, 0, debug_urls=debug_urls)

      run_metadata = config_pb2.RunMetadata()
      sess.run(str_concat, options=run_options, run_metadata=run_metadata)

      # String ops are located on CPU.
      self.assertEqual(1, len(run_metadata.partition_graphs))

      dump = debug_data.DebugDumpDir(
          self._dump_root, partition_graphs=run_metadata.partition_graphs)

      self.assertIn(str1_name, dump.nodes())
      self.assertIn(str2_name, dump.nodes())

      # Only the two watched "read" tensors are expected in the dump.
      self.assertEqual(2, dump.size)

      self.assertEqual([str1_init_val],
                       dump.get_tensors("%s/read" % str1_name, 0,
                                        "DebugIdentity"))
      self.assertEqual([str2_init_val],
                       dump.get_tensors("%s/read" % str2_name, 0,
                                        "DebugIdentity"))

      # Relative timestamps must be non-negative.
      self.assertGreaterEqual(
          dump.get_rel_timestamps("%s/read" % str1_name, 0, "DebugIdentity")[0],
          0)
      self.assertGreaterEqual(
          dump.get_rel_timestamps("%s/read" % str2_name, 0, "DebugIdentity")[0],
          0)

      # Dump files must be non-empty.
      self.assertGreater(
          dump.get_dump_sizes_bytes("%s/read" % str1_name, 0,
                                    "DebugIdentity")[0], 0)
      self.assertGreater(
          dump.get_dump_sizes_bytes("%s/read" % str2_name, 0,
                                    "DebugIdentity")[0], 0)
375
  def testDumpUninitializedVariable(self):
    """Checks the dumps of variables watched while they are being initialized.

    A float variable and a string variable are watched during the run of the
    global initializer; the dumped values must be flagged as uninitialized,
    while the variables themselves hold their initial values afterwards.
    """
    op_namespace = "testDumpUninitializedVariable"
    with session.Session() as sess:
      u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
      s_init_val = b"str1"

      u_name = "%s/u" % op_namespace
      s_name = "%s/s" % op_namespace

      u_init = constant_op.constant(u_init_val, shape=[2, 2])
      u = variables.VariableV1(u_init, name=u_name)
      s_init = constant_op.constant(s_init_val)
      s = variables.VariableV1(s_init, name=s_name)

      run_options = config_pb2.RunOptions(output_partition_graphs=True)
      debug_urls = self._debug_urls()

      # Add debug tensor watches for u and s.
      debug_utils.add_debug_tensor_watch(
          run_options, u_name, 0, debug_urls=debug_urls)
      debug_utils.add_debug_tensor_watch(
          run_options, s_name, 0, debug_urls=debug_urls)

      run_metadata = config_pb2.RunMetadata()

      # Initialize u and s.
      sess.run(variables.global_variables_initializer(),
               options=run_options,
               run_metadata=run_metadata)

      # Load the dump produced while the initializer ran.
      dump = debug_data.DebugDumpDir(
          self._dump_root, partition_graphs=run_metadata.partition_graphs)

      self.assertEqual(2, dump.size)
      self.assertEqual(self._expected_partition_graph_count,
                       len(run_metadata.partition_graphs))

      # The values dumped for u and s must be InconvertibleTensorProto objects
      # that are flagged as not initialized.
      u_vals = dump.get_tensors(u_name, 0, "DebugIdentity")
      s_vals = dump.get_tensors(s_name, 0, "DebugIdentity")
      self.assertEqual(1, len(u_vals))
      self.assertIsInstance(u_vals[0], debug_data.InconvertibleTensorProto)
      self.assertFalse(u_vals[0].initialized)
      self.assertEqual(1, len(s_vals))
      self.assertIsInstance(s_vals[0], debug_data.InconvertibleTensorProto)
      self.assertFalse(s_vals[0].initialized)

      # Call run() again, to check that u and s were initialized properly.
      self.assertAllClose(u_init_val, sess.run(u))
      self.assertEqual(s_init_val, sess.run(s))
427
428  def testDebugWhileLoopGeneratesMultipleDumps(self):
429    with session.Session(config=no_rewrite_session_config()) as sess:
430      num_iter = 10
431
432      # "u" is the Variable being updated in the loop.
433      u_name = "testDumpToFileWhileLoop/u"
434      u_namespace = u_name.split("/")[0]
435
436      u_init_val = np.array(11.0)
437      u_init = constant_op.constant(u_init_val)
438      u = variables.VariableV1(u_init, name=u_name)
439
440      # "v" is the increment.
441      v_name = "testDumpToFileWhileLoop/v"
442      v_namespace = v_name.split("/")[0]
443
444      v_init_val = np.array(2.0)
445      v_init = constant_op.constant(v_init_val)
446      v = variables.VariableV1(v_init, name=v_name)
447
448      u.initializer.run()
449      v.initializer.run()
450
451      i = constant_op.constant(0, name="testDumpToFileWhileLoop/i")
452
453      def cond(i):
454        return math_ops.less(i, num_iter)
455
456      def body(i):
457        new_u = state_ops.assign_add(u, v)
458        new_i = math_ops.add(i, 1)
459        op = control_flow_ops.group(new_u)
460        new_i = control_flow_ops.with_dependencies([op], new_i)
461        return [new_i]
462
463      loop = control_flow_ops.while_loop(
464          cond, body, [i], parallel_iterations=10)
465
466      # Create RunOptions for debug-watching tensors
467      run_options = config_pb2.RunOptions(output_partition_graphs=True)
468      debug_urls = self._debug_urls()
469
470      # Add debug tensor watch for u.
471      debug_utils.add_debug_tensor_watch(
472          run_options, u_name, 0, debug_urls=debug_urls)
473      # Add debug tensor watch for v.
474      debug_utils.add_debug_tensor_watch(
475          run_options, "%s/read" % v_name, 0, debug_urls=debug_urls)
476      # Add debug tensor watch for while/Identity.
477      debug_utils.add_debug_tensor_watch(
478          run_options, "while/Identity", 0, debug_urls=debug_urls)
479      # Add debug tensor watch for while/Add/y.
480      debug_utils.add_debug_tensor_watch(
481          run_options, "while/Add/y", 0, debug_urls=debug_urls)
482
483      run_metadata = config_pb2.RunMetadata()
484      r = sess.run(loop, options=run_options, run_metadata=run_metadata)
485
486      self.assertEqual(self._expected_partition_graph_count,
487                       len(run_metadata.partition_graphs))
488
489      self.assertEqual(num_iter, r)
490      u_val_final = sess.run(u)
491      self.assertAllClose(u_init_val + num_iter * v_init_val, u_val_final)
492
493      # Verify dump files
494      self.assertTrue(os.path.isdir(self._dump_root))
495
496      u_glob_out = glob.glob(os.path.join(self._dump_root, "*", u_namespace))
497      v_glob_out = glob.glob(os.path.join(
498          self._dump_root, "*", v_namespace, "v"))
499      self.assertTrue(os.path.isdir(u_glob_out[0]))
500      self.assertTrue(os.path.isdir(v_glob_out[0]))
501
502      dump = debug_data.DebugDumpDir(
503          self._dump_root, partition_graphs=run_metadata.partition_graphs)
504
505      # Expected dumped tensors: u, v/read, 10 iterations of while/Identity,
506      # and 10 iterations of while/Add/y.
507      self.assertEqual(1 + 1 + num_iter + num_iter, dump.size)
508
509      # Verify tensor values.
510      self.assertAllClose([u_init_val],
511                          dump.get_tensors(u_name, 0, "DebugIdentity"))
512      self.assertAllClose([v_init_val],
513                          dump.get_tensors("%s/read" % v_name, 0,
514                                           "DebugIdentity"))
515
516      while_id_tensors = dump.get_tensors("while/Identity", 0, "DebugIdentity")
517      self.assertEqual(10, len(while_id_tensors))
518      for k in range(len(while_id_tensors)):
519        self.assertAllClose(np.array(k), while_id_tensors[k])
520
521      # Verify ascending timestamps from the while loops.
522      while_id_rel_timestamps = dump.get_rel_timestamps("while/Identity", 0,
523                                                        "DebugIdentity")
524      while_id_dump_sizes_bytes = dump.get_dump_sizes_bytes("while/Identity", 0,
525                                                            "DebugIdentity")
526      self.assertEqual(10, len(while_id_rel_timestamps))
527      prev_rel_time = 0
528      prev_dump_size_bytes = while_id_dump_sizes_bytes[0]
529      for rel_time, dump_size_bytes in zip(while_id_rel_timestamps,
530                                           while_id_dump_sizes_bytes):
531        self.assertGreaterEqual(rel_time, prev_rel_time)
532        self.assertEqual(dump_size_bytes, prev_dump_size_bytes)
533        prev_rel_time = rel_time
534        prev_dump_size_bytes = dump_size_bytes
535
536      # Test querying debug watch keys from node name.
537      watch_keys = dump.debug_watch_keys("while/Identity")
538      self.assertEqual(["while/Identity:0:DebugIdentity"], watch_keys)
539
540      # Test querying debug datum instances from debug watch key.
541      self.assertEqual(10, len(dump.watch_key_to_data(watch_keys[0])))
542      self.assertEqual([], dump.watch_key_to_data("foo"))
543
544  def testDebugWhileLoopWatchingWholeGraphWorks(self):
545    with session.Session() as sess:
546      loop_body = lambda i: math_ops.add(i, 2)
547      loop_cond = lambda i: math_ops.less(i, 16)
548
549      i = constant_op.constant(10, name="i")
550      loop = control_flow_ops.while_loop(loop_cond, loop_body, [i])
551
552      loop_result, dump = self._debug_run_and_get_dump(sess, loop)
553      self.assertEqual(16, loop_result)
554
555      self.assertEqual(
556          [[10]], dump.get_tensors("while/Enter", 0, "DebugIdentity"))
557      self.assertEqual(
558          [[12], [14], [16]],
559          dump.get_tensors("while/NextIteration", 0, "DebugIdentity"))
560
  def testDebugTrainingDynamicRNNWorks(self):
    """Runs one debugged training step of a dynamic RNN and loads the dump.

    The test makes no assertion on dumped values; it passes as long as the
    debugged run and the construction of the DebugDumpDir do not raise.
    """
    with session.Session() as sess:
      input_size = 3
      state_size = 2
      time_steps = 4
      batch_size = 2

      # Random inputs and per-example sequence lengths (time-major layout, as
      # requested via time_major=True below).
      input_values = np.random.randn(time_steps, batch_size, input_size)
      sequence_length = np.random.randint(0, time_steps, size=batch_size)
      concat_inputs = array_ops.placeholder(
          dtypes.float32, shape=(time_steps, batch_size, input_size))

      outputs_dynamic, _ = rnn.dynamic_rnn(
          _RNNCellForTest(input_size, state_size),
          inputs=concat_inputs,
          sequence_length=sequence_length,
          time_major=True,
          dtype=dtypes.float32)
      toy_loss = math_ops.reduce_sum(outputs_dynamic * outputs_dynamic)
      train_op = gradient_descent.GradientDescentOptimizer(
          learning_rate=0.1).minimize(toy_loss, name="train_op")

      sess.run(variables.global_variables_initializer())

      run_options = config_pb2.RunOptions(output_partition_graphs=True)
      debug_utils.watch_graph_with_denylists(
          run_options,
          sess.graph,
          node_name_regex_denylist="(.*rnn/while/.*|.*TensorArray.*)",
          debug_urls=self._debug_urls())
      # b/36870549: Nodes with these name patterns need to be excluded from
      # tfdbg in order to prevent MSAN warnings of uninitialized Tensors
      # under both file:// and grpc:// debug URL schemes.

      run_metadata = config_pb2.RunMetadata()
      sess.run(train_op, feed_dict={concat_inputs: input_values},
               options=run_options, run_metadata=run_metadata)

      # Loading the dump is the check: it must succeed without raising.
      debug_data.DebugDumpDir(
          self._dump_root, partition_graphs=run_metadata.partition_graphs)
601
602  def testDebugCondWatchingWholeGraphWorks(self):
603    with session.Session() as sess:
604      x = variables.VariableV1(10.0, name="x")
605      y = variables.VariableV1(20.0, name="y")
606      cond = control_flow_ops.cond(
607          x > y, lambda: math_ops.add(x, 1), lambda: math_ops.add(y, 1))
608
609      sess.run(variables.global_variables_initializer())
610
611      cond_result, dump = self._debug_run_and_get_dump(sess, cond)
612      self.assertEqual(21, cond_result)
613
614      self.assertAllClose(
615          [21.0], dump.get_tensors("cond/Merge", 0, "DebugIdentity"))
616
  def testFindNodesWithBadTensorValues(self):
    """Checks that `dump.find()` locates tensors containing inf or nan.

    The graph divides by a tensor that contains a zero, so x, y and z all
    contain a non-finite element; these nodes must be reported by a custom
    predicate passed to `find()`, and `first_n=1` must limit the result to one
    datum.
    """
    with session.Session() as sess:
      u_name = "testFindNodesWithBadTensorValues/u"
      v_name = "testFindNodesWithBadTensorValues/v"
      w_name = "testFindNodesWithBadTensorValues/w"
      x_name = "testFindNodesWithBadTensorValues/x"
      y_name = "testFindNodesWithBadTensorValues/y"
      z_name = "testFindNodesWithBadTensorValues/z"

      u_init = constant_op.constant([2.0, 4.0])
      u = variables.VariableV1(u_init, name=u_name)
      v_init = constant_op.constant([2.0, 1.0])
      v = variables.VariableV1(v_init, name=v_name)

      # Expected output: [0.0, 3.0]
      w = math_ops.subtract(u, v, name=w_name)

      # Expected output: [inf, 1.3333]
      x = math_ops.div(u, w, name=x_name)

      # Expected output: [nan, 4.0]
      y = math_ops.multiply(w, x, name=y_name)

      # Squaring y keeps the nan in its first element.
      z = math_ops.multiply(y, y, name=z_name)

      u.initializer.run()
      v.initializer.run()

      _, dump = self._debug_run_and_get_dump(
          sess, z,
          expected_partition_graph_count=self._expected_partition_graph_count)

      def has_bad_value(_, tensor):
        # Predicate for dump.find(): the first argument is ignored and only
        # the tensor value is inspected.
        return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor))

      # Find all "offending tensors".
      bad_data = dump.find(has_bad_value)

      # Verify that the nodes with bad values are caught through running find
      # on the debug dump. At least x, y and z must be among them.
      self.assertLessEqual(3, len(bad_data))
      node_names = [datum.node_name for datum in bad_data]
      self.assertIn(x_name, node_names)
      self.assertIn(y_name, node_names)
      self.assertIn(z_name, node_names)

      # Test first_n kwarg of find(): Find the first offending tensor.
      first_bad_datum = dump.find(has_bad_value, first_n=1)
      self.assertEqual(1, len(first_bad_datum))
666
  def testFindInfOrNanWithOpNameExclusion(self):
    """Checks the `exclude_node_names` argument of `dump.find()`.

    The graph is the same as in the bad-tensor-values test, but `find()` is
    called with `debug_data.has_inf_or_nan` and the regex ".*/x$" as
    `exclude_node_names`; y and z must still be reported.
    """
    with session.Session() as sess:
      u_name = "testFindInfOrNanWithOpNameExclusion/u"
      v_name = "testFindInfOrNanWithOpNameExclusion/v"
      w_name = "testFindInfOrNanWithOpNameExclusion/w"
      x_name = "testFindInfOrNanWithOpNameExclusion/x"
      y_name = "testFindInfOrNanWithOpNameExclusion/y"
      z_name = "testFindInfOrNanWithOpNameExclusion/z"

      u_init = constant_op.constant([2.0, 4.0])
      u = variables.VariableV1(u_init, name=u_name)
      v_init = constant_op.constant([2.0, 1.0])
      v = variables.VariableV1(v_init, name=v_name)

      # Expected output: [0.0, 3.0]
      w = math_ops.subtract(u, v, name=w_name)

      # Expected output: [inf, 1.3333]
      x = math_ops.div(u, w, name=x_name)

      # Expected output: [nan, 4.0]
      y = math_ops.multiply(w, x, name=y_name)

      # Squaring y keeps the nan in its first element.
      z = math_ops.multiply(y, y, name=z_name)

      u.initializer.run()
      v.initializer.run()

      _, dump = self._debug_run_and_get_dump(
          sess, z,
          expected_partition_graph_count=self._expected_partition_graph_count)

      # Find all "offending tensors".
      bad_data = dump.find(debug_data.has_inf_or_nan,
                           exclude_node_names=".*/x$")

      # Verify that the nodes with bad values are caught through running find
      # on the debug dump.
      self.assertLessEqual(2, len(bad_data))
      # Assert that the node `x` should have been excluded.
      # NOTE(review): only the presence of y and z is asserted below; the
      # absence of x is not checked explicitly.
      node_names = [datum.node_name for datum in bad_data]
      self.assertIn(y_name, node_names)
      self.assertIn(z_name, node_names)

      # first_n=1 combined with the exclusion still yields exactly one datum.
      first_bad_datum = dump.find(
          debug_data.has_inf_or_nan, first_n=1, exclude_node_names=".*/x$")
      self.assertEqual(1, len(first_bad_datum))
714
715  def _session_run_for_graph_structure_lookup(self):
716    with session.Session(config=no_rewrite_session_config()) as sess:
717      u_name = "testDumpGraphStructureLookup/u"
718      v_name = "testDumpGraphStructureLookup/v"
719      w_name = "testDumpGraphStructureLookup/w"
720
721      u_init = constant_op.constant([2.0, 4.0])
722      u = variables.VariableV1(u_init, name=u_name)
723      v = math_ops.add(u, u, name=v_name)
724      w = math_ops.add(v, v, name=w_name)
725
726      u.initializer.run()
727
728      _, dump = self._debug_run_and_get_dump(
729          sess, w,
730          expected_partition_graph_count=self._expected_partition_graph_count)
731
732    return u_name, v_name, w_name, dump
733
734  def testGraphStructureLookupGivesDevicesAndNodesInfo(self):
735    u_name, _, _, dump = self._session_run_for_graph_structure_lookup()
736
737    # Test num_devices().
738    self.assertEqual(self._expected_num_devices, len(dump.devices()))
739
740    # Test node_device().
741    self.assertEqual(self._main_device, dump.node_device(u_name))
742
743    with self.assertRaisesRegexp(ValueError,
744                                 "does not exist in partition graphs"):
745      dump.node_device(u_name + "foo")
746
747    # Test node_exists().
748    self.assertTrue(dump.node_exists(u_name))
749    self.assertTrue(dump.node_exists(u_name + "/read"))
750    self.assertFalse(dump.node_exists(u_name + "/read" + "/foo"))
751
752  def testGraphStructureLookupGivesNodesAndAttributes(self):
753    u_name, _, _, dump = self._session_run_for_graph_structure_lookup()
754
755    u_read_name = u_name + "/read"
756
757    # Test node name list lookup of the DebugDumpDir object.
758    if test_util.gpu_device_name():
759      node_names = dump.nodes(
760          device_name="/job:localhost/replica:0/task:0/device:GPU:0")
761    else:
762      node_names = dump.nodes()
763    self.assertTrue(u_name in node_names)
764    self.assertTrue(u_read_name in node_names)
765
766    # Test querying node attributes.
767    u_attr = dump.node_attributes(u_name)
768    self.assertEqual(dtypes.float32, u_attr["dtype"].type)
769    self.assertEqual(1, len(u_attr["shape"].shape.dim))
770    self.assertEqual(2, u_attr["shape"].shape.dim[0].size)
771
772    with self.assertRaisesRegexp(
773        ValueError, r"None of the .* device\(s\) has a node named "):
774      dump.node_attributes("foo")
775
776  def testGraphStructureLookupGivesDebugWatchKeys(self):
777    u_name, v_name, w_name, dump = (
778        self._session_run_for_graph_structure_lookup())
779
780    # Test querying the debug watch keys with node names.
781    self.assertEqual(["%s:0:DebugIdentity" % u_name],
782                     dump.debug_watch_keys(u_name))
783    self.assertEqual(["%s:0:DebugIdentity" % v_name],
784                     dump.debug_watch_keys(v_name))
785    self.assertEqual(["%s:0:DebugIdentity" % w_name],
786                     dump.debug_watch_keys(w_name))
787    self.assertEqual([], dump.debug_watch_keys("foo"))
788
789    # Test querying debug datum instances from debug watch.
790    u_data = dump.watch_key_to_data(dump.debug_watch_keys(u_name)[0])
791    self.assertEqual(1, len(u_data))
792    self.assertEqual(u_name, u_data[0].node_name)
793    self.assertEqual(0, u_data[0].output_slot)
794    self.assertEqual("DebugIdentity", u_data[0].debug_op)
795    self.assertGreaterEqual(u_data[0].timestamp, 0)
796    self.assertEqual([], dump.watch_key_to_data("foo"))
797
798  def testGraphStructureLookupGivesNodeInputsAndRecipients(self):
799    u_name, v_name, w_name, dump = (
800        self._session_run_for_graph_structure_lookup())
801
802    u_read_name = u_name + "/read"
803
804    # Test the inputs lookup of the DebugDumpDir object.
805    self.assertEqual([], dump.node_inputs(u_name))
806    self.assertEqual([u_name], dump.node_inputs(u_read_name))
807    self.assertEqual([u_read_name] * 2, dump.node_inputs(v_name))
808    self.assertEqual([v_name] * 2, dump.node_inputs(w_name))
809
810    self.assertEqual([], dump.node_inputs(u_name, is_control=True))
811    self.assertEqual([], dump.node_inputs(u_read_name, is_control=True))
812    self.assertEqual([], dump.node_inputs(v_name, is_control=True))
813    self.assertEqual([], dump.node_inputs(w_name, is_control=True))
814
815    # Test the outputs recipient lookup of the DebugDumpDir object.
816    self.assertTrue(u_read_name in dump.node_recipients(u_name))
817    self.assertEqual(2, dump.node_recipients(u_read_name).count(v_name))
818    self.assertEqual(2, dump.node_recipients(v_name).count(w_name))
819
820    self.assertEqual([], dump.node_recipients(u_name, is_control=True))
821    self.assertEqual([], dump.node_recipients(u_read_name, is_control=True))
822    self.assertEqual([], dump.node_recipients(v_name, is_control=True))
823    self.assertEqual([], dump.node_recipients(w_name, is_control=True))
824
825    # Test errors raised on invalid node names.
826    with self.assertRaisesRegexp(
827        ValueError, r"None of the .* device\(s\) has a node named "):
828      dump.node_inputs(u_name + "foo")
829    with self.assertRaisesRegexp(
830        ValueError, r"None of the .* device\(s\) has a node named "):
831      dump.node_recipients(u_name + "foo")
832
833    # Test transitive_inputs().
834    self.assertEqual([], dump.transitive_inputs(u_name))
835    self.assertEqual([u_name], dump.transitive_inputs(u_read_name))
836    self.assertEqual(
837        set([u_name, u_read_name]), set(dump.transitive_inputs(v_name)))
838    self.assertEqual(
839        set([u_name, u_read_name, v_name]), set(dump.transitive_inputs(w_name)))
840
841    with self.assertRaisesRegexp(
842        ValueError, r"None of the .* device\(s\) has a node named "):
843      dump.transitive_inputs(u_name + "foo")
844
845  def testGraphStructureLookupWithoutPartitionGraphsDoesNotErrorOut(self):
846    _, _, _, dump = self._session_run_for_graph_structure_lookup()
847
848    # Now load the dump again, without the partition graphs, so we can check
849    # errors are not raised because the partition graphs are loaded from the
850    # dump directory.
851    dump = debug_data.DebugDumpDir(self._dump_root, validate=False)
852    self.assertTrue(dump.loaded_partition_graphs())
853
854  def testGraphPathFindingOnControlEdgesWorks(self):
855    with session.Session(config=no_rewrite_session_config()) as sess:
856      v1 = variables.VariableV1(1.0, name="v1")
857      v2 = variables.VariableV1(2.0, name="v2")
858      v3 = variables.VariableV1(3.0, name="v3")
859      a = math_ops.add(v1, v2, name="a")
860      with ops.control_dependencies([a]):
861        c = math_ops.subtract(v3, v3, name="c")
862
863      sess.run(variables.global_variables_initializer())
864      _, dump = self._debug_run_and_get_dump(sess, c)
865
866      self.assertEqual(["v1", "v1/read", "a", "c"],
867                       dump.find_some_path("v1", "c"))
868      self.assertIsNone(dump.find_some_path("v1", "c", include_control=False))
869
870  def testGraphPathFindingReverseRefEdgeWorks(self):
871    with session.Session(config=no_rewrite_session_config()) as sess:
872      v = variables.VariableV1(10.0, name="v")
873      delta = variables.VariableV1(1.0, name="delta")
874      inc_v = state_ops.assign_add(v, delta, name="inc_v")
875
876      sess.run(variables.global_variables_initializer())
877      _, dump = self._debug_run_and_get_dump(sess, inc_v)
878
879      self.assertEqual(
880          ["delta", "delta/read", "inc_v", "v"],
881          dump.find_some_path("delta", "v", include_reversed_ref=True))
882      self.assertIsNone(dump.find_some_path("delta", "v"))
883
884  def testCausalityCheckOnDumpsDetectsWrongTemporalOrder(self):
885    with session.Session(config=no_rewrite_session_config()) as sess:
886      u_name = "testDumpCausalityCheck/u"
887      v_name = "testDumpCausalityCheck/v"
888      w_name = "testDumpCausalityCheck/w"
889
890      u_init = constant_op.constant([2.0, 4.0])
891      u = variables.VariableV1(u_init, name=u_name)
892      v = math_ops.add(u, u, name=v_name)
893      w = math_ops.add(v, v, name=w_name)
894
895      u.initializer.run()
896
897      run_options = config_pb2.RunOptions(output_partition_graphs=True)
898      debug_utils.watch_graph(
899          run_options,
900          sess.graph,
901          debug_ops=["DebugIdentity"],
902          debug_urls=self._debug_urls())
903
904      run_metadata = config_pb2.RunMetadata()
905      sess.run(w, options=run_options, run_metadata=run_metadata)
906
907      self.assertEqual(self._expected_partition_graph_count,
908                       len(run_metadata.partition_graphs))
909
910      # First, loading the original dump without supplying the
911      # partition_graphs should not cause a LookupError, validation occurs
912      # only with partition_graphs loaded.
913      debug_data.DebugDumpDir(self._dump_root)
914
915      # Now, loading the original dump with partition graphs supplied should
916      # succeed. The validation should pass quietly.
917      dump = debug_data.DebugDumpDir(
918          self._dump_root, partition_graphs=run_metadata.partition_graphs)
919
920      # Get the dump file names and compute their timestamps.
921      self.assertEqual(
922          1, len(dump.get_tensor_file_paths(v_name, 0, "DebugIdentity")))
923      v_file_path = dump.get_tensor_file_paths(v_name, 0, "DebugIdentity")[0]
924
925      self.assertEqual(
926          1, len(dump.get_tensor_file_paths(w_name, 0, "DebugIdentity")))
927      w_file_path = dump.get_tensor_file_paths(w_name, 0, "DebugIdentity")[0]
928
929      v_timestamp = int(v_file_path[v_file_path.rindex("_") + 1:])
930      w_timestamp = int(w_file_path[w_file_path.rindex("_") + 1:])
931
932      # Swap and slightly shift the time stamps of the last two dumped tensors,
933      # to simulate "causality violation", which can happen if the dump
934      # directory contains incomplete data and/or mixes data from different
935      # Session.run() calls.
936      v_file_path_1 = v_file_path[:v_file_path.rindex(
937          "_")] + "_%d" % w_timestamp
938      w_file_path_1 = w_file_path[:w_file_path.rindex("_")] + "_%d" % (
939          v_timestamp - 1)
940
941      os.rename(v_file_path, v_file_path_1)
942      os.rename(w_file_path, w_file_path_1)
943
944      # Load the dump directory again. Now a ValueError is expected to be
945      # raised due to the timestamp swap.
946      with self.assertRaisesRegexp(ValueError, "Causality violated"):
947        dump = debug_data.DebugDumpDir(
948            self._dump_root, partition_graphs=run_metadata.partition_graphs)
949
950      # Loading the dump directory with kwarg "validate" set explicitly to
951      # False should get rid of the error.
952      dump = debug_data.DebugDumpDir(
953          self._dump_root,
954          partition_graphs=run_metadata.partition_graphs,
955          validate=False)
956
957      # Next, set the two times stamps to be the same, which should be fine.
958      v_file_path_2 = v_file_path[:v_file_path.rindex(
959          "_")] + "_%d" % w_timestamp
960      w_file_path_2 = w_file_path[:w_file_path.rindex(
961          "_")] + "_%d" % w_timestamp
962
963      os.rename(v_file_path_1, v_file_path_2)
964      os.rename(w_file_path_1, w_file_path_2)
965
966      debug_data.DebugDumpDir(
967          self._dump_root, partition_graphs=run_metadata.partition_graphs)
968
969  def testWatchingOnlyOneOfTwoOutputSlotsDoesNotLeadToCausalityFailure(self):
970    with session.Session() as sess:
971      x_name = "oneOfTwoSlots/x"
972      u_name = "oneOfTwoSlots/u"
973      v_name = "oneOfTwoSlots/v"
974      w_name = "oneOfTwoSlots/w"
975      y_name = "oneOfTwoSlots/y"
976
977      x = variables.VariableV1([1, 3, 3, 7], dtype=dtypes.int32, name=x_name)
978      sess.run(x.initializer)
979
980      unique_x, indices, _ = array_ops.unique_with_counts(x, name=u_name)
981
982      v = math_ops.add(unique_x, unique_x, name=v_name)
983      w = math_ops.add(indices, indices, name=w_name)
984      y = math_ops.add(w, w, name=y_name)
985
986      run_options = config_pb2.RunOptions(output_partition_graphs=True)
987      # Watch only the first output slot of u, even though it has two output
988      # slots.
989      debug_utils.add_debug_tensor_watch(
990          run_options, u_name, 0, debug_urls=self._debug_urls())
991      debug_utils.add_debug_tensor_watch(
992          run_options, w_name, 0, debug_urls=self._debug_urls())
993      debug_utils.add_debug_tensor_watch(
994          run_options, y_name, 0, debug_urls=self._debug_urls())
995
996      run_metadata = config_pb2.RunMetadata()
997      sess.run([v, y], options=run_options, run_metadata=run_metadata)
998
999      dump = debug_data.DebugDumpDir(
1000          self._dump_root,
1001          partition_graphs=run_metadata.partition_graphs,
1002          validate=True)
1003
1004      self.assertAllClose([1, 3, 7],
1005                          dump.get_tensors(u_name, 0, "DebugIdentity")[0])
1006
1007  def testOutputSlotWithoutOutgoingEdgeCanBeWatched(self):
1008    """Test watching output slots not attached to any outgoing edges."""
1009
1010    with session.Session(config=no_rewrite_session_config()) as sess:
1011      u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
1012      u = constant_op.constant(u_init_val, shape=[2, 2], name="u")
1013
1014      # Create a control edge from a node with an output: From u to z.
1015      # Node u will get executed only because of the control edge. The output
1016      # tensor u:0 is not attached to any outgoing edge in the graph. This test
1017      # checks that the debugger can watch such a tensor.
1018      with ops.control_dependencies([u]):
1019        z = control_flow_ops.no_op(name="z")
1020
1021      _, dump = self._debug_run_and_get_dump(sess, z)
1022
1023      # Assert that the DebugIdentity watch on u works properly.
1024      self.assertEqual(1, len(dump.dumped_tensor_data))
1025      datum = dump.dumped_tensor_data[0]
1026      self.assertEqual("u", datum.node_name)
1027      self.assertEqual(0, datum.output_slot)
1028      self.assertEqual("DebugIdentity", datum.debug_op)
1029      self.assertAllClose([[5.0, 3.0], [-1.0, 0.0]], datum.get_tensor())
1030
1031  def testWatchingVariableUpdateOpsSeesUpdatedValues(self):
1032    """Watch output slots on Variable-updating ops, with no emitted edges."""
1033
1034    with session.Session(config=no_rewrite_session_config()) as sess:
1035      u_init = constant_op.constant(10.0)
1036      u = variables.VariableV1(u_init, name="gdo/u")
1037      v_init = constant_op.constant(20.0)
1038      v = variables.VariableV1(v_init, name="gdo/v")
1039
1040      w = math_ops.multiply(u, v, name="gdo/w")
1041      # gdo stands for GradientDescentOptimizer.
1042
1043      train_op = gradient_descent.GradientDescentOptimizer(
1044          learning_rate=0.1).minimize(
1045              w, name="gdo/train")
1046
1047      u.initializer.run()
1048      v.initializer.run()
1049
1050      _, dump = self._debug_run_and_get_dump(sess, train_op)
1051
1052      update_u_data = dump.watch_key_to_data(
1053          "gdo/train/update_gdo/u/ApplyGradientDescent:0:DebugIdentity")
1054      self.assertEqual(1, len(update_u_data))
1055
1056      # Gradient descent on u: w = u * v, so dw / du = v.
1057      # Updated value of u should be:
1058      #   10.0 - learning_rate * v = 10.0 - 0.1 * 20.0 = 8.0
1059      self.assertAllClose(8.0, update_u_data[0].get_tensor())
1060
1061      update_v_data = dump.watch_key_to_data(
1062          "gdo/train/update_gdo/v/ApplyGradientDescent:0:DebugIdentity")
1063      self.assertEqual(1, len(update_v_data))
1064
1065      # Gradient descent on u: w = u * v, so dw / dv = u.
1066      # Updated value of u should be:
1067      #   20.0 - learning_rate * u = 20.0 - 0.1 * 10.0 = 19.0
1068      self.assertAllClose(19.0, update_v_data[0].get_tensor())
1069
1070      # Verify that the Variables u and v are updated properly.
1071      self.assertAllClose(8.0, sess.run(u))
1072      self.assertAllClose(19.0, sess.run(v))
1073
1074  def testAllowsWatchingUnconnectedOutputTensor(self):
1075    """Watch an output slot not emitting any edges.
1076
1077    (Not even control edges from the node.)
1078    """
1079
1080    with session.Session() as sess:
1081      x_init = constant_op.constant([2, 2, 3, 5, 5])
1082      x = variables.VariableV1(x_init, name="unconnected/x")
1083
1084      # The UniqueOp (tf.unique) has two output slots. Use only slot 0 in the
1085      # graph. Let the debugger watch the unused slot 1.
1086      unique_x, _ = array_ops.unique(x, name="unconnected/unique_x")
1087      y = math_ops.add(unique_x, [0, 1, 2], name="unconnected/y")
1088
1089      x.initializer.run()
1090
1091      # Verify that only slot 0 of unique_x has recipients, while slot 1 of the
1092      # same node does not have recipients.
1093      unique_x_slot_0_recipients = []
1094      unique_x_slot_1_recipients = []
1095      for op in sess.graph.get_operations():
1096        for inp in op.inputs:
1097          if inp.name == "unconnected/unique_x:0":
1098            unique_x_slot_0_recipients.append(op.name)
1099          elif inp.name == "unconnected/unique_x:1":
1100            unique_x_slot_1_recipients.append(op.name)
1101
1102      self.assertEqual(["unconnected/y"], unique_x_slot_0_recipients)
1103      self.assertEqual([], unique_x_slot_1_recipients)
1104
1105      y_result, dump = self._debug_run_and_get_dump(sess, y)
1106      self.assertAllClose([2, 4, 7], y_result)
1107
1108      # Assert that the connected slot (slot 0) is dumped properly.
1109      unique_x_slot_0_dumps = dump.watch_key_to_data(
1110          "unconnected/unique_x:0:DebugIdentity")
1111      self.assertEqual(1, len(unique_x_slot_0_dumps))
1112      self.assertEqual("unconnected/unique_x",
1113                       unique_x_slot_0_dumps[0].node_name)
1114      self.assertEqual(0, unique_x_slot_0_dumps[0].output_slot)
1115      self.assertAllClose([2, 3, 5], unique_x_slot_0_dumps[0].get_tensor())
1116
1117      # Assert that the unconnected slot (slot 1) is dumped properly.
1118      unique_x_slot_1_dumps = dump.watch_key_to_data(
1119          "unconnected/unique_x:1:DebugIdentity")
1120      self.assertEqual(1, len(unique_x_slot_1_dumps))
1121      self.assertEqual("unconnected/unique_x",
1122                       unique_x_slot_1_dumps[0].node_name)
1123      self.assertEqual(1, unique_x_slot_1_dumps[0].output_slot)
1124      self.assertAllClose([0, 0, 1, 2, 2],
1125                          unique_x_slot_1_dumps[0].get_tensor())
1126
1127  def testSuccessiveDebuggingRunsIncreasesCounters(self):
1128    """Test repeated Session.run() calls with debugger increments counters."""
1129
1130    with session.Session() as sess:
1131      ph = array_ops.placeholder(dtypes.float32, name="successive/ph")
1132      x = array_ops.transpose(ph, name="mismatch/x")
1133      y = array_ops.squeeze(ph, name="mismatch/y")
1134
1135      _, dump1 = self._debug_run_and_get_dump(
1136          sess, x, feed_dict={ph: np.array([[7.0, 8.0]])}, global_step=1)
1137      self.assertEqual(1, dump1.core_metadata.global_step)
1138      self.assertGreaterEqual(dump1.core_metadata.session_run_index, 0)
1139      self.assertEqual(0, dump1.core_metadata.executor_step_index)
1140      self.assertEqual([ph.name], dump1.core_metadata.input_names)
1141      self.assertEqual([x.name], dump1.core_metadata.output_names)
1142      self.assertEqual([], dump1.core_metadata.target_nodes)
1143      file_io.delete_recursively(self._dump_root)
1144
1145      # Calling run() with the same feed, same output and same debug watch
1146      # options should increment both session_run_index and
1147      # executor_step_index.
1148      _, dump2 = self._debug_run_and_get_dump(
1149          sess, x, feed_dict={ph: np.array([[7.0, 8.0]])}, global_step=2)
1150      self.assertEqual(2, dump2.core_metadata.global_step)
1151      self.assertEqual(dump1.core_metadata.session_run_index + 1,
1152                       dump2.core_metadata.session_run_index)
1153      self.assertEqual(dump1.core_metadata.executor_step_index + 1,
1154                       dump2.core_metadata.executor_step_index)
1155      self.assertEqual([ph.name], dump2.core_metadata.input_names)
1156      self.assertEqual([x.name], dump2.core_metadata.output_names)
1157      self.assertEqual([], dump2.core_metadata.target_nodes)
1158      file_io.delete_recursively(self._dump_root)
1159
1160      run_options = config_pb2.RunOptions(output_partition_graphs=True)
1161      debug_utils.watch_graph(
1162          run_options, sess.graph, debug_urls=self._debug_urls(), global_step=3)
1163
1164      # Calling run() with a different output should increment
1165      # session_run_index, but not executor_step_index.
1166      _, dump3 = self._debug_run_and_get_dump(
1167          sess, y, feed_dict={ph: np.array([[7.0, 8.0]])}, global_step=3)
1168      self.assertEqual(3, dump3.core_metadata.global_step)
1169      self.assertEqual(dump2.core_metadata.session_run_index + 1,
1170                       dump3.core_metadata.session_run_index)
1171      self.assertEqual(0, dump3.core_metadata.executor_step_index)
1172      self.assertEqual([ph.name], dump3.core_metadata.input_names)
1173      self.assertEqual([y.name], dump3.core_metadata.output_names)
1174      self.assertEqual([], dump3.core_metadata.target_nodes)
1175
1176  def testDebuggingDuringOpError(self):
1177    """Test the debug tensor dumping when error occurs in graph runtime."""
1178
1179    with session.Session() as sess:
1180      ph = array_ops.placeholder(dtypes.float32, name="mismatch/ph")
1181      x = array_ops.transpose(ph, name="mismatch/x")
1182      m = constant_op.constant(
1183          np.array(
1184              [[1.0, 2.0]], dtype=np.float32), name="mismatch/m")
1185      y = math_ops.matmul(m, x, name="mismatch/y")
1186
1187      run_options = config_pb2.RunOptions(output_partition_graphs=True)
1188      debug_utils.watch_graph(
1189          run_options,
1190          sess.graph,
1191          debug_ops=["DebugIdentity"],
1192          debug_urls=self._debug_urls())
1193
1194      with self.assertRaises(errors.OpError):
1195        sess.run(y,
1196                 options=run_options,
1197                 feed_dict={ph: np.array([[-3.0], [0.0]])})
1198
1199      dump = debug_data.DebugDumpDir(self._dump_root)
1200
1201      self.assertGreaterEqual(dump.core_metadata.session_run_index, 0)
1202      self.assertGreaterEqual(dump.core_metadata.executor_step_index, 0)
1203      self.assertEqual([ph.name], dump.core_metadata.input_names)
1204      self.assertEqual([y.name], dump.core_metadata.output_names)
1205      self.assertEqual([], dump.core_metadata.target_nodes)
1206
1207      # Despite the fact that the run() call errored out and partition_graphs
1208      # are not available via run_metadata, the partition graphs should still
1209      # have been loaded from the dump directory.
1210      self.assertTrue(dump.loaded_partition_graphs())
1211
1212      m_dumps = dump.watch_key_to_data("mismatch/m:0:DebugIdentity")
1213      self.assertEqual(1, len(m_dumps))
1214      self.assertAllClose(np.array([[1.0, 2.0]]), m_dumps[0].get_tensor())
1215
1216      x_dumps = dump.watch_key_to_data("mismatch/x:0:DebugIdentity")
1217      self.assertEqual(1, len(x_dumps))
1218      self.assertAllClose(np.array([[-3.0, 0.0]]), x_dumps[0].get_tensor())
1219
1220  def testDebugNumericSummaryOnInitializedTensorGivesCorrectResult(self):
1221    with session.Session(config=no_rewrite_session_config()) as sess:
1222      a = variables.VariableV1(
1223          [
1224              np.nan, np.nan, 0.0, 0.0, 0.0, -1.0, -3.0, 3.0, 7.0, -np.inf,
1225              -np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.nan, np.nan
1226          ],
1227          dtype=np.float32,
1228          name="numeric_summary/a")
1229      b = variables.VariableV1(
1230          [0.0] * 18, dtype=np.float32, name="numeric_summary/b")
1231      c = math_ops.add(a, b, name="numeric_summary/c")
1232
1233      sess.run(variables.global_variables_initializer())
1234
1235      _, dump = self._debug_run_and_get_dump(
1236          sess, c, debug_ops=["DebugNumericSummary"])
1237      self.assertTrue(dump.loaded_partition_graphs())
1238
1239      self.assertAllClose([[
1240          1.0, 18.0, 4.0, 2.0, 2.0, 3.0, 2.0, 5.0, -3.0, 7.0, 0.85714286,
1241          8.97959184, 1.0, 1.0, 18.0
1242      ]], dump.get_tensors("numeric_summary/a/read", 0, "DebugNumericSummary"))
1243
1244  def testDebugNumericSummaryOnUninitializedTensorGivesCorrectResult(self):
1245    with session.Session() as sess:
1246      a = variables.VariableV1(
1247          [42], dtype=np.float32, name="numeric_summary_uninit/a")
1248
1249      _, dump = self._debug_run_and_get_dump(
1250          sess, a.initializer, debug_ops=["DebugNumericSummary"])
1251
1252      self.assertTrue(dump.loaded_partition_graphs())
1253
1254      # DebugNumericSummary output should reflect the uninitialized state of
1255      # the watched tensor.
1256      numeric_summary = dump.get_tensors("numeric_summary_uninit/a", 0,
1257                                         "DebugNumericSummary")[0]
1258      self.assertAllClose([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
1259                          numeric_summary[0:8])
1260      # Check dtype (index 12), ndims (index 13) and dimension sizes (index
1261      # 14+).
1262      self.assertAllClose([1.0, 1.0, 1.0], numeric_summary[12:])
1263      self.assertTrue(np.isinf(numeric_summary[8]))
1264      self.assertGreater(numeric_summary[8], 0.0)
1265      self.assertTrue(np.isinf(numeric_summary[9]))
1266      self.assertLess(numeric_summary[9], 0.0)
1267      self.assertTrue(np.isnan(numeric_summary[10]))
1268      self.assertTrue(np.isnan(numeric_summary[11]))
1269
1270  def testDebugNumericSummaryFailureIsToleratedWhenOrdered(self):
1271    with session.Session() as sess:
1272      a = variables.VariableV1("1", name="a")
1273      b = variables.VariableV1("3", name="b")
1274      c = variables.VariableV1("2", name="c")
1275
1276      d = math_ops.add(a, b, name="d")
1277      e = math_ops.add(d, c, name="e")
1278      n = parsing_ops.string_to_number(e, name="n")
1279      m = math_ops.add(n, n, name="m")
1280
1281      sess.run(variables.global_variables_initializer())
1282
1283      # Using DebugNumericSummary on sess.run(m) with the default
1284      # tolerate_debug_op_creation_failures=False should error out due to the
1285      # presence of string-dtype Tensors in the graph.
1286      run_metadata = config_pb2.RunMetadata()
1287      run_options = config_pb2.RunOptions(output_partition_graphs=True)
1288      debug_utils.watch_graph(
1289          run_options,
1290          sess.graph,
1291          debug_ops=["DebugNumericSummary"],
1292          debug_urls=self._debug_urls())
1293      with self.assertRaises(errors.FailedPreconditionError):
1294        sess.run(m, options=run_options, run_metadata=run_metadata)
1295
1296      # Using tolerate_debug_op_creation_failures=True should get rid of the
1297      # error.
1298      m_result, dump = self._debug_run_and_get_dump(
1299          sess, m, debug_ops=["DebugNumericSummary"],
1300          tolerate_debug_op_creation_failures=True)
1301      self.assertEqual(264, m_result)
1302
1303      # The integer-dtype Tensors in the graph should have been dumped
1304      # properly.
1305      self.assertIn("n:0:DebugNumericSummary", dump.debug_watch_keys("n"))
1306      self.assertIn("m:0:DebugNumericSummary", dump.debug_watch_keys("m"))
1307
1308  def testDebugNumericSummaryInvalidAttributesStringAreCaught(self):
1309    with session.Session(config=no_rewrite_session_config()) as sess:
1310      a = variables.VariableV1(10.0, name="a")
1311      b = variables.VariableV1(0.0, name="b")
1312      c = variables.VariableV1(0.0, name="c")
1313
1314      x = math_ops.divide(a, b, name="x")
1315      y = math_ops.multiply(x, c, name="y")
1316
1317      sess.run(variables.global_variables_initializer())
1318
1319      run_metadata = config_pb2.RunMetadata()
1320      run_options = config_pb2.RunOptions(output_partition_graphs=True)
1321      debug_utils.watch_graph(
1322          run_options,
1323          sess.graph,
1324          debug_ops=["DebugNumericSummary(foo=1.0)"],
1325          debug_urls=self._debug_urls())
1326      with self.assertRaisesRegexp(
1327          errors.FailedPreconditionError,
1328          r"1 attribute key\(s\) were not valid for debug node "
1329          r"__dbg_.:0_0_DebugNumericSummary: foo"):
1330        sess.run(y, options=run_options, run_metadata=run_metadata)
1331
1332      run_options = config_pb2.RunOptions(output_partition_graphs=True)
1333      debug_utils.watch_graph(
1334          run_options,
1335          sess.graph,
1336          debug_ops=["DebugNumericSummary(foo=1.0; bar=false)"],
1337          debug_urls=self._debug_urls())
1338      with self.assertRaisesRegexp(
1339          errors.FailedPreconditionError,
1340          r"2 attribute key\(s\) were not valid for debug node "
1341          r"__dbg_.:0_0_DebugNumericSummary:"):
1342        sess.run(y, options=run_options, run_metadata=run_metadata)
1343
1344      run_options = config_pb2.RunOptions(output_partition_graphs=True)
1345      debug_utils.watch_graph(
1346          run_options,
1347          sess.graph,
1348          debug_ops=["DebugNumericSummary(foo=1.0; mute_if_healthy=true)"],
1349          debug_urls=self._debug_urls())
1350      with self.assertRaisesRegexp(
1351          errors.FailedPreconditionError,
1352          r"1 attribute key\(s\) were not valid for debug node "
1353          r"__dbg_.:0_0_DebugNumericSummary: foo"):
1354        sess.run(y, options=run_options, run_metadata=run_metadata)
1355
1356  def testDebugNumericSummaryMuteOnHealthyMutesOnlyHealthyTensorDumps(self):
1357    with session.Session(config=no_rewrite_session_config()) as sess:
1358      a = variables.VariableV1(10.0, name="a")
1359      b = variables.VariableV1(0.0, name="b")
1360      c = variables.VariableV1(0.0, name="c")
1361
1362      x = math_ops.divide(a, b, name="x")
1363      y = math_ops.multiply(x, c, name="y")
1364
1365      sess.run(variables.global_variables_initializer())
1366
1367      # Here, validate=False is necessary to avoid causality check error.
1368      # TODO(cais): Maybe let DebugDumpDir constructor automatically ignore
1369      #   debug ops with mute_if_healthy=false attribute during validation.
1370      _, dump = self._debug_run_and_get_dump(
1371          sess, y, debug_ops=["DebugNumericSummary(mute_if_healthy=true)"],
1372          validate=False)
1373
1374      self.assertLessEqual(2, dump.size)
1375      self.assertAllClose([[
1376          1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, np.inf, -np.inf, np.nan,
1377          np.nan, 1.0, 0.0
1378      ]], dump.get_tensors("x", 0, "DebugNumericSummary"))
1379      self.assertAllClose([[
1380          1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.inf, -np.inf, np.nan,
1381          np.nan, 1.0, 0.0
1382      ]], dump.get_tensors("y", 0, "DebugNumericSummary"))
1383
1384      # Another run with the default mute_if_healthy (false) value should
1385      # dump all the tensors.
1386      file_io.delete_recursively(self._dump_root)
1387      _, dump = self._debug_run_and_get_dump(
1388          sess, y, debug_ops=["DebugNumericSummary()"])
1389      self.assertLessEqual(8, dump.size)
1390
1391  def testDebugNumericSummaryMuteOnHealthyAndCustomBoundsWork(self):
1392    with session.Session() as sess:
1393      a = variables.VariableV1([10.0, 10.0], name="a")
1394      b = variables.VariableV1([10.0, 2.0], name="b")
1395
1396      x = math_ops.add(a, b, name="x")  # [20.0, 12.0]
1397      y = math_ops.divide(x, b, name="y")  # [2.0, 6.0]
1398
1399      sess.run(variables.global_variables_initializer())
1400
1401      # Here, validate=False is necessary to avoid causality check error.
1402      # TODO(cais): Maybe let DebugDumpDir constructor automatically ignore
1403      #   debug ops with mute_if_healthy=false attribute during validation.
1404      _, dump = self._debug_run_and_get_dump(
1405          sess, y, debug_ops=[
1406              "DebugNumericSummary(mute_if_healthy=true; upper_bound=11.0)"],
1407          validate=False)
1408
1409      self.assertEqual(1, dump.size)
1410      self.assertAllClose([[
1411          1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 12.0, 20.0, 16.0, 16.0, 1.0,
1412          1.0, 2.0]], dump.get_tensors("x", 0, "DebugNumericSummary"))
1413
1414  def testDebugQueueOpsDoesNotoErrorOut(self):
1415    with session.Session() as sess:
1416      q = data_flow_ops.FIFOQueue(3, "float", name="fifo_queue")
1417      q_init = q.enqueue_many(([101.0, 202.0, 303.0],), name="enqueue_many")
1418
1419      _, dump = self._debug_run_and_get_dump(sess, q_init)
1420      self.assertTrue(dump.loaded_partition_graphs())
1421
1422      fifo_queue_tensor = dump.get_tensors("fifo_queue", 0, "DebugIdentity")[0]
1423      self.assertIsInstance(fifo_queue_tensor,
1424                            debug_data.InconvertibleTensorProto)
1425      self.assertTrue(fifo_queue_tensor.initialized)
1426      self.assertAllClose(
1427          [101.0, 202.0, 303.0],
1428          dump.get_tensors("enqueue_many/component_0", 0, "DebugIdentity")[0])
1429
1430  def testLookUpNodePythonTracebackWorks(self):
1431    with session.Session() as sess:
1432      u_init = constant_op.constant(10.0)
1433      u = variables.VariableV1(u_init, name="traceback/u")
1434      v_init = constant_op.constant(20.0)
1435      v = variables.VariableV1(v_init, name="traceback/v")
1436
1437      w = math_ops.multiply(u, v, name="traceback/w")
1438
1439      sess.run(variables.global_variables_initializer())
1440      _, dump = self._debug_run_and_get_dump(sess, w)
1441
1442      # Prior to setting the Python graph, attempts to do traceback lookup
1443      # should lead to exceptions.
1444      with self.assertRaisesRegexp(
1445          LookupError, "Python graph is not available for traceback lookup"):
1446        dump.node_traceback("traceback/w")
1447
1448      dump.set_python_graph(sess.graph)
1449
1450      # After setting the Python graph, attempts to look up nonexistent nodes
1451      # should lead to exceptions.
1452      with self.assertRaisesRegexp(KeyError,
1453                                   r"Cannot find node \"foo\" in Python graph"):
1454        dump.node_traceback("foo")
1455
1456      # Lookup should work with node name input.
1457      traceback = dump.node_traceback("traceback/w")
1458      self.assertIsInstance(traceback, tuple)
1459      self.assertGreater(len(traceback), 0)
1460      for trace in traceback:
1461        self.assertIsInstance(trace, tuple)
1462
1463      # Lookup should also work with tensor name input.
1464      traceback = dump.node_traceback("traceback/w:0")
1465      self.assertIsInstance(traceback, tuple)
1466      self.assertGreater(len(traceback), 0)
1467      for trace in traceback:
1468        self.assertIsInstance(trace, tuple)
1469
1470
class DebugConcurrentRunCallsTest(test_util.TensorFlowTestCase):
  """Test for debugging concurrent Session.run() calls.

  This base class reads `self._num_concurrent_runs` and `self._dump_roots`
  but does not assign them, and `_get_concurrent_debug_urls` is abstract
  here. Presumably concrete subclasses provide all three (e.g. in setUp) --
  verify against the subclasses.
  """

  def _get_concurrent_debug_urls(self):
    """Abstract method to generate debug URLs for concurrent debugged runs.

    Returns:
      In implementations, an object indexable by the run index; the indexed
      value is passed as `debug_urls` to `debug_utils.watch_graph`.

    Raises:
      NotImplementedError: Always, in this base class.
    """
    raise NotImplementedError(
        "_get_concurrent_debug_urls is not implemented in the base test class")

  def testDebugConcurrentVariableUpdates(self):
    """Runs debugged assign_add ops from several threads and checks dumps.

    Each thread watches the graph with its own debug URLs and runs its own
    increment op 100 times. The test then checks the final variable value
    and, for each dump root, the number of dumped values of "v" and the
    ordering of the indices recorded in the core metadata files.
    """
    if test.is_gpu_available():
      self.skipTest("No testing concurrent runs on a single GPU.")

    with session.Session() as sess:
      v = variables.VariableV1(30.0, name="v")
      constants = [
          constant_op.constant(1.0, name="c%d" % i)
          for i in range(self._num_concurrent_runs)
      ]
      incs = [
          state_ops.assign_add(
              v, c, use_locking=True, name=("inc%d" % i))
          for (i, c) in enumerate(constants)
      ]
      sess.run(v.initializer)

      concurrent_debug_urls = self._get_concurrent_debug_urls()

      def inc_job(index):
        """Runs the index-th increment op 100 times with debug watching."""
        run_options = config_pb2.RunOptions(output_partition_graphs=True)
        debug_utils.watch_graph(
            run_options, sess.graph, debug_urls=concurrent_debug_urls[index])
        for _ in range(100):
          sess.run(incs[index], options=run_options)

      inc_threads = []
      for index in range(self._num_concurrent_runs):
        inc_thread = threading.Thread(target=functools.partial(inc_job, index))
        inc_thread.start()
        inc_threads.append(inc_thread)
      for inc_thread in inc_threads:
        inc_thread.join()

      self.assertAllClose(30.0 + 1.0 * self._num_concurrent_runs * 100,
                          sess.run(v))

      all_session_run_indices = []
      for index in range(self._num_concurrent_runs):
        dump = debug_data.DebugDumpDir(self._dump_roots[index])
        self.assertTrue(dump.loaded_partition_graphs())

        v_data = dump.get_tensors("v", 0, "DebugIdentity")
        self.assertEqual(100, len(v_data))

        # Examine all the core metadata files
        core_metadata_files = glob.glob(
            os.path.join(self._dump_roots[index], "_tfdbg_core*"))

        timestamps = []
        session_run_indices = []
        executor_step_indices = []
        for core_metadata_file in core_metadata_files:
          with open(core_metadata_file, "rb") as f:
            event = event_pb2.Event()
            event.ParseFromString(f.read())
            core_metadata = (
                debug_data.extract_core_metadata_from_event_proto(event))
            timestamps.append(event.wall_time)
            session_run_indices.append(core_metadata.session_run_index)
            executor_step_indices.append(core_metadata.executor_step_index)

        all_session_run_indices.extend(session_run_indices)

        # Assert that executor_step_index increases by one at a time.
        # The sort key is the timestamp only, so entries with equal
        # timestamps keep their original relative order.
        # NOTE: assertEqual is used rather than the assertEquals alias, which
        # is deprecated and was removed from unittest in Python 3.12.
        ordered_steps = [
            step for _, step in sorted(
                zip(timestamps, executor_step_indices), key=lambda x: x[0])
        ]
        for step, next_step in zip(ordered_steps, ordered_steps[1:]):
          self.assertEqual(step + 1, next_step)

        # Assert that session_run_index increase monotonically.
        ordered_run_indices = [
            run_index for _, run_index in sorted(
                zip(timestamps, session_run_indices), key=lambda x: x[0])
        ]
        for run_index, next_run_index in zip(ordered_run_indices,
                                             ordered_run_indices[1:]):
          self.assertGreater(next_run_index, run_index)

      # Assert that the session_run_indices from the concurrent run() calls are
      # all unique.
      self.assertEqual(len(all_session_run_indices),
                       len(set(all_session_run_indices)))
1561
1562
# When this module is executed as a script, hand control to googletest.main().
if __name__ == "__main__":
  googletest.main()
1565