1# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Tests for debugger functionalities in tf.Session.""" 16import collections 17import functools 18import glob 19import os 20import tempfile 21import threading 22 23import numpy as np 24 25from tensorflow.core.protobuf import config_pb2 26from tensorflow.core.protobuf import rewriter_config_pb2 27from tensorflow.core.util import event_pb2 28from tensorflow.python.client import session 29from tensorflow.python.debug.lib import debug_data 30from tensorflow.python.debug.lib import debug_graphs 31from tensorflow.python.debug.lib import debug_utils 32from tensorflow.python.framework import constant_op 33from tensorflow.python.framework import dtypes 34from tensorflow.python.framework import errors 35from tensorflow.python.framework import ops 36from tensorflow.python.framework import test_util 37from tensorflow.python.lib.io import file_io 38from tensorflow.python.ops import array_ops 39from tensorflow.python.ops import control_flow_ops 40from tensorflow.python.ops import data_flow_ops 41from tensorflow.python.ops import math_ops 42from tensorflow.python.ops import parsing_ops 43from tensorflow.python.ops import rnn 44from tensorflow.python.ops import rnn_cell_impl 45from tensorflow.python.ops import state_ops 46from tensorflow.python.ops import variables 47import tensorflow.python.ops.tensor_array_grad # pylint: disable=unused-import 48from tensorflow.python.platform import googletest 49from tensorflow.python.platform import test 50from tensorflow.python.training import gradient_descent 51 52 53def no_rewrite_session_config(): 54 rewriter_config = rewriter_config_pb2.RewriterConfig( 55 disable_model_pruning=True, 56 arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF, 57 dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF) 58 graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config) 59 return config_pb2.ConfigProto(graph_options=graph_options) 60 61 62class _RNNCellForTest(rnn_cell_impl.RNNCell): 63 """RNN cell for testing.""" 64 65 def __init__(self, input_output_size, state_size): 66 self._input_output_size = input_output_size 67 self._state_size = state_size 68 self._w = variables.VariableV1(1.0, dtype=dtypes.float32, name="w") 69 70 @property 71 def output_size(self): 72 return self._input_output_size 73 74 @property 75 def state_size(self): 76 return self._state_size 77 78 def __call__(self, input_, state, scope=None): 79 return (math_ops.multiply(self._w, input_), state) 80 81 82@test_util.run_v1_only("b/120545219") 83class SessionDebugTestBase(test_util.TensorFlowTestCase): 84 """Base class for unit tests of tfdbg running with tf.Session.""" 85 86 @classmethod 87 def setUpClass(cls): 88 if test.is_gpu_available(): 89 cls._expected_partition_graph_count = 2 90 cls._expected_num_devices = 2 91 gpu_name = test_util.gpu_device_name() 92 cls._main_device = "/job:localhost/replica:0/task:0" + gpu_name 93 else: 94 cls._expected_partition_graph_count = 1 95 cls._expected_num_devices = 1 96 cls._main_device = "/job:localhost/replica:0/task:0/device:CPU:0" 97 98 @classmethod 99 def tearDownClass(cls): 100 pass 101 102 def setUp(self): 103 self._dump_root = tempfile.mkdtemp() 104 105 def tearDown(self): 106 ops.reset_default_graph() 107 108 # Tear down temporary dump directory. 109 if os.path.isdir(self._dump_root): 110 file_io.delete_recursively(self._dump_root) 111 112 def _debug_urls(self, run_number=None): 113 raise NotImplementedError( 114 "_debug_urls() method is not implemented in the base test class.") 115 116 def _debug_dump_dir(self, run_number=None): 117 raise NotImplementedError( 118 "_debug_dump_dir() method is not implemented in the base test class.") 119 120 def _debug_run_and_get_dump(self, 121 sess, 122 fetches, 123 feed_dict=None, 124 debug_ops="DebugIdentity", 125 tolerate_debug_op_creation_failures=False, 126 global_step=-1, 127 validate=True, 128 expected_partition_graph_count=None): 129 """Run fetches with debugging and obtain DebugDumpDir. 130 131 Args: 132 sess: the tf.compat.v1.Session to be used. 133 fetches: fetches of the Session.run(). 134 feed_dict: feed dict for the Session.run(). 135 debug_ops: name(s) of the debug ops to be used. 136 tolerate_debug_op_creation_failures: whether to tolerate debug op 137 creation failures. 138 global_step: Optional global step. 139 validate: whether to validate dumped tensors against graph. 140 expected_partition_graph_count: optional count of partition graphs to 141 assert on. 142 143 Returns: 144 1. Return values of the Session.run(). 145 2. The DebugDumpDir object from the debugged run(). 146 """ 147 148 run_options = config_pb2.RunOptions(output_partition_graphs=True) 149 debug_utils.watch_graph( 150 run_options, 151 sess.graph, 152 debug_ops=debug_ops, 153 debug_urls=self._debug_urls(), 154 tolerate_debug_op_creation_failures=tolerate_debug_op_creation_failures, 155 global_step=global_step) 156 run_metadata = config_pb2.RunMetadata() 157 run_output = sess.run(fetches, 158 feed_dict=feed_dict, 159 options=run_options, 160 run_metadata=run_metadata) 161 162 if expected_partition_graph_count is not None: 163 self.assertEqual(expected_partition_graph_count, 164 len(run_metadata.partition_graphs)) 165 return run_output, debug_data.DebugDumpDir( 166 self._dump_root, partition_graphs=run_metadata.partition_graphs, 167 validate=validate) 168 169 def _generate_dump_from_simple_addition_graph(self): 170 with session.Session(config=no_rewrite_session_config()) as sess: 171 u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]]) 172 v_init_val = np.array([[2.0], [-1.0]]) 173 174 # Use node names with overlapping namespace (i.e., parent directory) to 175 # test concurrent, non-racing directory creation. 176 u_name = "u" 177 v_name = "v" 178 w_name = "w" 179 180 u_init = constant_op.constant(u_init_val, shape=[2, 2]) 181 u = variables.VariableV1(u_init, name=u_name) 182 v_init = constant_op.constant(v_init_val, shape=[2, 1]) 183 v = variables.VariableV1(v_init, name=v_name) 184 185 w = math_ops.matmul(u, v, name=w_name) 186 187 u.initializer.run() 188 v.initializer.run() 189 190 run_options = config_pb2.RunOptions(output_partition_graphs=True) 191 debug_urls = "file://%s" % self._dump_root 192 193 # Add debug tensor watch for u. 194 debug_utils.add_debug_tensor_watch( 195 run_options, "%s/read" % u_name, 0, debug_urls=debug_urls) 196 # Add debug tensor watch for v. 197 debug_utils.add_debug_tensor_watch( 198 run_options, "%s/read" % v_name, 0, debug_urls=debug_urls) 199 200 run_metadata = config_pb2.RunMetadata() 201 202 # Invoke Session.run(). 203 sess.run(w, options=run_options, run_metadata=run_metadata) 204 205 self.assertEqual(self._expected_partition_graph_count, 206 len(run_metadata.partition_graphs)) 207 208 dump = debug_data.DebugDumpDir( 209 self._dump_root, partition_graphs=run_metadata.partition_graphs) 210 211 simple_add_results = collections.namedtuple("SimpleAddResults", [ 212 "u_init_val", "v_init_val", "u", "v", "w", "u_name", "v_name", "w_name", 213 "dump" 214 ]) 215 return simple_add_results(u_init_val, v_init_val, u, v, w, u_name, v_name, 216 w_name, dump) 217 218 def testCopyNodesHaveCorrectDebugOpsAndURLsAttributeValues(self): 219 with session.Session() as sess: 220 u = variables.VariableV1(2.1, name="u") 221 v = variables.VariableV1(20.0, name="v") 222 w = math_ops.multiply(u, v, name="w") 223 224 sess.run(variables.global_variables_initializer()) 225 226 run_options = config_pb2.RunOptions(output_partition_graphs=True) 227 debug_urls = self._debug_urls() 228 debug_utils.add_debug_tensor_watch( 229 run_options, 230 "u", 231 0, ["DebugNumericSummary(gated_grpc=True)", "DebugIdentity"], 232 debug_urls=debug_urls) 233 debug_utils.add_debug_tensor_watch( 234 run_options, "v", 0, ["DebugNumericSummary"], debug_urls=debug_urls) 235 236 run_metadata = config_pb2.RunMetadata() 237 r = sess.run(w, options=run_options, run_metadata=run_metadata) 238 self.assertAllClose(42.0, r) 239 240 u_copy_node_def = None 241 v_copy_node_def = None 242 for partition_graph in run_metadata.partition_graphs: 243 for node_def in partition_graph.node: 244 if debug_graphs.is_copy_node(node_def.name): 245 if node_def.name == "__copy_u_0": 246 u_copy_node_def = node_def 247 elif node_def.name == "__copy_v_0": 248 v_copy_node_def = node_def 249 250 self.assertIsNotNone(u_copy_node_def) 251 debug_ops_spec = u_copy_node_def.attr["debug_ops_spec"].list.s 252 self.assertEqual(2, len(debug_ops_spec)) 253 self.assertEqual("DebugNumericSummary;%s;1" % debug_urls[0], 254 debug_ops_spec[0].decode("utf-8")) 255 self.assertEqual("DebugIdentity;%s;0" % debug_urls[0], 256 debug_ops_spec[1].decode("utf-8")) 257 258 self.assertIsNotNone(v_copy_node_def) 259 debug_ops_spec = v_copy_node_def.attr["debug_ops_spec"].list.s 260 self.assertEqual(1, len(debug_ops_spec)) 261 self.assertEqual("DebugNumericSummary;%s;0" % debug_urls[0], 262 debug_ops_spec[0].decode("utf-8")) 263 264 def testConcurrentDumpingToPathsWithOverlappingParentDirsWorks(self): 265 results = self._generate_dump_from_simple_addition_graph() 266 self.assertTrue(results.dump.loaded_partition_graphs()) 267 268 # Since global_step is not explicitly specified, it should take its default 269 # value: -1. 270 self.assertEqual(-1, results.dump.core_metadata.global_step) 271 self.assertGreaterEqual(results.dump.core_metadata.session_run_index, 0) 272 self.assertGreaterEqual(results.dump.core_metadata.executor_step_index, 0) 273 self.assertEqual([], results.dump.core_metadata.input_names) 274 self.assertEqual([results.w.name], results.dump.core_metadata.output_names) 275 self.assertEqual([], results.dump.core_metadata.target_nodes) 276 277 # Verify the dumped tensor values for u and v. 278 self.assertEqual(2, results.dump.size) 279 280 self.assertAllClose([results.u_init_val], 281 results.dump.get_tensors("%s/read" % results.u_name, 0, 282 "DebugIdentity")) 283 self.assertAllClose([results.v_init_val], 284 results.dump.get_tensors("%s/read" % results.v_name, 0, 285 "DebugIdentity")) 286 287 self.assertGreaterEqual( 288 results.dump.get_rel_timestamps("%s/read" % results.u_name, 0, 289 "DebugIdentity")[0], 0) 290 self.assertGreaterEqual( 291 results.dump.get_rel_timestamps("%s/read" % results.v_name, 0, 292 "DebugIdentity")[0], 0) 293 294 self.assertGreater( 295 results.dump.get_dump_sizes_bytes("%s/read" % results.u_name, 0, 296 "DebugIdentity")[0], 0) 297 self.assertGreater( 298 results.dump.get_dump_sizes_bytes("%s/read" % results.v_name, 0, 299 "DebugIdentity")[0], 0) 300 301 def testGetOpTypeWorks(self): 302 results = self._generate_dump_from_simple_addition_graph() 303 304 self.assertEqual(results.u.op.type, 305 results.dump.node_op_type(results.u_name)) 306 self.assertIn(results.v.op.type, results.dump.node_op_type(results.v_name)) 307 self.assertIn(results.w.op.type, results.dump.node_op_type(results.w_name)) 308 309 with self.assertRaisesRegexp( 310 ValueError, r"None of the .* device\(s\) has a node named "): 311 results.dump.node_op_type("foo_bar") 312 313 def testDumpStringTensorsWorks(self): 314 with session.Session(config=no_rewrite_session_config()) as sess: 315 str1_init_val = np.array(b"abc") 316 str2_init_val = np.array(b"def") 317 318 str1_init = constant_op.constant(str1_init_val) 319 str2_init = constant_op.constant(str2_init_val) 320 321 str1_name = "str1" 322 str2_name = "str2" 323 str1 = variables.VariableV1(str1_init, name=str1_name) 324 str2 = variables.VariableV1(str2_init, name=str2_name) 325 # Concatenate str1 and str2 326 str_concat = math_ops.add(str1, str2, name="str_concat") 327 328 str1.initializer.run() 329 str2.initializer.run() 330 331 run_options = config_pb2.RunOptions(output_partition_graphs=True) 332 debug_urls = self._debug_urls() 333 334 # Add debug tensor watch for u. 335 debug_utils.add_debug_tensor_watch( 336 run_options, "%s/read" % str1_name, 0, debug_urls=debug_urls) 337 # Add debug tensor watch for v. 338 debug_utils.add_debug_tensor_watch( 339 run_options, "%s/read" % str2_name, 0, debug_urls=debug_urls) 340 341 run_metadata = config_pb2.RunMetadata() 342 sess.run(str_concat, options=run_options, run_metadata=run_metadata) 343 344 # String ops are located on CPU. 345 self.assertEqual(1, len(run_metadata.partition_graphs)) 346 347 dump = debug_data.DebugDumpDir( 348 self._dump_root, partition_graphs=run_metadata.partition_graphs) 349 350 self.assertIn(str1_name, dump.nodes()) 351 self.assertIn(str2_name, dump.nodes()) 352 353 self.assertEqual(2, dump.size) 354 355 self.assertEqual([str1_init_val], 356 dump.get_tensors("%s/read" % str1_name, 0, 357 "DebugIdentity")) 358 self.assertEqual([str2_init_val], 359 dump.get_tensors("%s/read" % str2_name, 0, 360 "DebugIdentity")) 361 362 self.assertGreaterEqual( 363 dump.get_rel_timestamps("%s/read" % str1_name, 0, "DebugIdentity")[0], 364 0) 365 self.assertGreaterEqual( 366 dump.get_rel_timestamps("%s/read" % str2_name, 0, "DebugIdentity")[0], 367 0) 368 369 self.assertGreater( 370 dump.get_dump_sizes_bytes("%s/read" % str1_name, 0, 371 "DebugIdentity")[0], 0) 372 self.assertGreater( 373 dump.get_dump_sizes_bytes("%s/read" % str2_name, 0, 374 "DebugIdentity")[0], 0) 375 376 def testDumpUninitializedVariable(self): 377 op_namespace = "testDumpUninitializedVariable" 378 with session.Session() as sess: 379 u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]]) 380 s_init_val = b"str1" 381 382 u_name = "%s/u" % op_namespace 383 s_name = "%s/s" % op_namespace 384 385 u_init = constant_op.constant(u_init_val, shape=[2, 2]) 386 u = variables.VariableV1(u_init, name=u_name) 387 s_init = constant_op.constant(s_init_val) 388 s = variables.VariableV1(s_init, name=s_name) 389 390 run_options = config_pb2.RunOptions(output_partition_graphs=True) 391 debug_urls = self._debug_urls() 392 393 # Add debug tensor watch for u. 394 debug_utils.add_debug_tensor_watch( 395 run_options, u_name, 0, debug_urls=debug_urls) 396 debug_utils.add_debug_tensor_watch( 397 run_options, s_name, 0, debug_urls=debug_urls) 398 399 run_metadata = config_pb2.RunMetadata() 400 401 # Initialize u and s. 402 sess.run(variables.global_variables_initializer(), 403 options=run_options, 404 run_metadata=run_metadata) 405 406 # Verify the dump file for the uninitialized value of u. 407 dump = debug_data.DebugDumpDir( 408 self._dump_root, partition_graphs=run_metadata.partition_graphs) 409 410 self.assertEqual(2, dump.size) 411 self.assertEqual(self._expected_partition_graph_count, 412 len(run_metadata.partition_graphs)) 413 414 # Verify that the variable is properly initialized by the run() call. 415 u_vals = dump.get_tensors(u_name, 0, "DebugIdentity") 416 s_vals = dump.get_tensors(s_name, 0, "DebugIdentity") 417 self.assertEqual(1, len(u_vals)) 418 self.assertIsInstance(u_vals[0], debug_data.InconvertibleTensorProto) 419 self.assertFalse(u_vals[0].initialized) 420 self.assertEqual(1, len(s_vals)) 421 self.assertIsInstance(s_vals[0], debug_data.InconvertibleTensorProto) 422 self.assertFalse(s_vals[0].initialized) 423 424 # Call run() again, to check that u is initialized properly. 425 self.assertAllClose(u_init_val, sess.run(u)) 426 self.assertEqual(s_init_val, sess.run(s)) 427 428 def testDebugWhileLoopGeneratesMultipleDumps(self): 429 with session.Session(config=no_rewrite_session_config()) as sess: 430 num_iter = 10 431 432 # "u" is the Variable being updated in the loop. 433 u_name = "testDumpToFileWhileLoop/u" 434 u_namespace = u_name.split("/")[0] 435 436 u_init_val = np.array(11.0) 437 u_init = constant_op.constant(u_init_val) 438 u = variables.VariableV1(u_init, name=u_name) 439 440 # "v" is the increment. 441 v_name = "testDumpToFileWhileLoop/v" 442 v_namespace = v_name.split("/")[0] 443 444 v_init_val = np.array(2.0) 445 v_init = constant_op.constant(v_init_val) 446 v = variables.VariableV1(v_init, name=v_name) 447 448 u.initializer.run() 449 v.initializer.run() 450 451 i = constant_op.constant(0, name="testDumpToFileWhileLoop/i") 452 453 def cond(i): 454 return math_ops.less(i, num_iter) 455 456 def body(i): 457 new_u = state_ops.assign_add(u, v) 458 new_i = math_ops.add(i, 1) 459 op = control_flow_ops.group(new_u) 460 new_i = control_flow_ops.with_dependencies([op], new_i) 461 return [new_i] 462 463 loop = control_flow_ops.while_loop( 464 cond, body, [i], parallel_iterations=10) 465 466 # Create RunOptions for debug-watching tensors 467 run_options = config_pb2.RunOptions(output_partition_graphs=True) 468 debug_urls = self._debug_urls() 469 470 # Add debug tensor watch for u. 471 debug_utils.add_debug_tensor_watch( 472 run_options, u_name, 0, debug_urls=debug_urls) 473 # Add debug tensor watch for v. 474 debug_utils.add_debug_tensor_watch( 475 run_options, "%s/read" % v_name, 0, debug_urls=debug_urls) 476 # Add debug tensor watch for while/Identity. 477 debug_utils.add_debug_tensor_watch( 478 run_options, "while/Identity", 0, debug_urls=debug_urls) 479 # Add debug tensor watch for while/Add/y. 480 debug_utils.add_debug_tensor_watch( 481 run_options, "while/Add/y", 0, debug_urls=debug_urls) 482 483 run_metadata = config_pb2.RunMetadata() 484 r = sess.run(loop, options=run_options, run_metadata=run_metadata) 485 486 self.assertEqual(self._expected_partition_graph_count, 487 len(run_metadata.partition_graphs)) 488 489 self.assertEqual(num_iter, r) 490 u_val_final = sess.run(u) 491 self.assertAllClose(u_init_val + num_iter * v_init_val, u_val_final) 492 493 # Verify dump files 494 self.assertTrue(os.path.isdir(self._dump_root)) 495 496 u_glob_out = glob.glob(os.path.join(self._dump_root, "*", u_namespace)) 497 v_glob_out = glob.glob(os.path.join( 498 self._dump_root, "*", v_namespace, "v")) 499 self.assertTrue(os.path.isdir(u_glob_out[0])) 500 self.assertTrue(os.path.isdir(v_glob_out[0])) 501 502 dump = debug_data.DebugDumpDir( 503 self._dump_root, partition_graphs=run_metadata.partition_graphs) 504 505 # Expected dumped tensors: u, v/read, 10 iterations of while/Identity, 506 # and 10 iterations of while/Add/y. 507 self.assertEqual(1 + 1 + num_iter + num_iter, dump.size) 508 509 # Verify tensor values. 510 self.assertAllClose([u_init_val], 511 dump.get_tensors(u_name, 0, "DebugIdentity")) 512 self.assertAllClose([v_init_val], 513 dump.get_tensors("%s/read" % v_name, 0, 514 "DebugIdentity")) 515 516 while_id_tensors = dump.get_tensors("while/Identity", 0, "DebugIdentity") 517 self.assertEqual(10, len(while_id_tensors)) 518 for k in range(len(while_id_tensors)): 519 self.assertAllClose(np.array(k), while_id_tensors[k]) 520 521 # Verify ascending timestamps from the while loops. 522 while_id_rel_timestamps = dump.get_rel_timestamps("while/Identity", 0, 523 "DebugIdentity") 524 while_id_dump_sizes_bytes = dump.get_dump_sizes_bytes("while/Identity", 0, 525 "DebugIdentity") 526 self.assertEqual(10, len(while_id_rel_timestamps)) 527 prev_rel_time = 0 528 prev_dump_size_bytes = while_id_dump_sizes_bytes[0] 529 for rel_time, dump_size_bytes in zip(while_id_rel_timestamps, 530 while_id_dump_sizes_bytes): 531 self.assertGreaterEqual(rel_time, prev_rel_time) 532 self.assertEqual(dump_size_bytes, prev_dump_size_bytes) 533 prev_rel_time = rel_time 534 prev_dump_size_bytes = dump_size_bytes 535 536 # Test querying debug watch keys from node name. 537 watch_keys = dump.debug_watch_keys("while/Identity") 538 self.assertEqual(["while/Identity:0:DebugIdentity"], watch_keys) 539 540 # Test querying debug datum instances from debug watch key. 541 self.assertEqual(10, len(dump.watch_key_to_data(watch_keys[0]))) 542 self.assertEqual([], dump.watch_key_to_data("foo")) 543 544 def testDebugWhileLoopWatchingWholeGraphWorks(self): 545 with session.Session() as sess: 546 loop_body = lambda i: math_ops.add(i, 2) 547 loop_cond = lambda i: math_ops.less(i, 16) 548 549 i = constant_op.constant(10, name="i") 550 loop = control_flow_ops.while_loop(loop_cond, loop_body, [i]) 551 552 loop_result, dump = self._debug_run_and_get_dump(sess, loop) 553 self.assertEqual(16, loop_result) 554 555 self.assertEqual( 556 [[10]], dump.get_tensors("while/Enter", 0, "DebugIdentity")) 557 self.assertEqual( 558 [[12], [14], [16]], 559 dump.get_tensors("while/NextIteration", 0, "DebugIdentity")) 560 561 def testDebugTrainingDynamicRNNWorks(self): 562 with session.Session() as sess: 563 input_size = 3 564 state_size = 2 565 time_steps = 4 566 batch_size = 2 567 568 input_values = np.random.randn(time_steps, batch_size, input_size) 569 sequence_length = np.random.randint(0, time_steps, size=batch_size) 570 concat_inputs = array_ops.placeholder( 571 dtypes.float32, shape=(time_steps, batch_size, input_size)) 572 573 outputs_dynamic, _ = rnn.dynamic_rnn( 574 _RNNCellForTest(input_size, state_size), 575 inputs=concat_inputs, 576 sequence_length=sequence_length, 577 time_major=True, 578 dtype=dtypes.float32) 579 toy_loss = math_ops.reduce_sum(outputs_dynamic * outputs_dynamic) 580 train_op = gradient_descent.GradientDescentOptimizer( 581 learning_rate=0.1).minimize(toy_loss, name="train_op") 582 583 sess.run(variables.global_variables_initializer()) 584 585 run_options = config_pb2.RunOptions(output_partition_graphs=True) 586 debug_utils.watch_graph_with_denylists( 587 run_options, 588 sess.graph, 589 node_name_regex_denylist="(.*rnn/while/.*|.*TensorArray.*)", 590 debug_urls=self._debug_urls()) 591 # b/36870549: Nodes with these name patterns need to be excluded from 592 # tfdbg in order to prevent MSAN warnings of uninitialized Tensors 593 # under both file:// and grpc:// debug URL schemes. 594 595 run_metadata = config_pb2.RunMetadata() 596 sess.run(train_op, feed_dict={concat_inputs: input_values}, 597 options=run_options, run_metadata=run_metadata) 598 599 debug_data.DebugDumpDir( 600 self._dump_root, partition_graphs=run_metadata.partition_graphs) 601 602 def testDebugCondWatchingWholeGraphWorks(self): 603 with session.Session() as sess: 604 x = variables.VariableV1(10.0, name="x") 605 y = variables.VariableV1(20.0, name="y") 606 cond = control_flow_ops.cond( 607 x > y, lambda: math_ops.add(x, 1), lambda: math_ops.add(y, 1)) 608 609 sess.run(variables.global_variables_initializer()) 610 611 cond_result, dump = self._debug_run_and_get_dump(sess, cond) 612 self.assertEqual(21, cond_result) 613 614 self.assertAllClose( 615 [21.0], dump.get_tensors("cond/Merge", 0, "DebugIdentity")) 616 617 def testFindNodesWithBadTensorValues(self): 618 with session.Session() as sess: 619 u_name = "testFindNodesWithBadTensorValues/u" 620 v_name = "testFindNodesWithBadTensorValues/v" 621 w_name = "testFindNodesWithBadTensorValues/w" 622 x_name = "testFindNodesWithBadTensorValues/x" 623 y_name = "testFindNodesWithBadTensorValues/y" 624 z_name = "testFindNodesWithBadTensorValues/z" 625 626 u_init = constant_op.constant([2.0, 4.0]) 627 u = variables.VariableV1(u_init, name=u_name) 628 v_init = constant_op.constant([2.0, 1.0]) 629 v = variables.VariableV1(v_init, name=v_name) 630 631 # Expected output: [0.0, 3.0] 632 w = math_ops.subtract(u, v, name=w_name) 633 634 # Expected output: [inf, 1.3333] 635 x = math_ops.div(u, w, name=x_name) 636 637 # Expected output: [nan, 4.0] 638 y = math_ops.multiply(w, x, name=y_name) 639 640 z = math_ops.multiply(y, y, name=z_name) 641 642 u.initializer.run() 643 v.initializer.run() 644 645 _, dump = self._debug_run_and_get_dump( 646 sess, z, 647 expected_partition_graph_count=self._expected_partition_graph_count) 648 649 def has_bad_value(_, tensor): 650 return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor)) 651 652 # Find all "offending tensors". 653 bad_data = dump.find(has_bad_value) 654 655 # Verify that the nodes with bad values are caught through running find 656 # on the debug dump. 657 self.assertLessEqual(3, len(bad_data)) 658 node_names = [datum.node_name for datum in bad_data] 659 self.assertIn(x_name, node_names) 660 self.assertIn(y_name, node_names) 661 self.assertIn(z_name, node_names) 662 663 # Test first_n kwarg of find(): Find the first offending tensor. 664 first_bad_datum = dump.find(has_bad_value, first_n=1) 665 self.assertEqual(1, len(first_bad_datum)) 666 667 def testFindInfOrNanWithOpNameExclusion(self): 668 with session.Session() as sess: 669 u_name = "testFindInfOrNanWithOpNameExclusion/u" 670 v_name = "testFindInfOrNanWithOpNameExclusion/v" 671 w_name = "testFindInfOrNanWithOpNameExclusion/w" 672 x_name = "testFindInfOrNanWithOpNameExclusion/x" 673 y_name = "testFindInfOrNanWithOpNameExclusion/y" 674 z_name = "testFindInfOrNanWithOpNameExclusion/z" 675 676 u_init = constant_op.constant([2.0, 4.0]) 677 u = variables.VariableV1(u_init, name=u_name) 678 v_init = constant_op.constant([2.0, 1.0]) 679 v = variables.VariableV1(v_init, name=v_name) 680 681 # Expected output: [0.0, 3.0] 682 w = math_ops.subtract(u, v, name=w_name) 683 684 # Expected output: [inf, 1.3333] 685 x = math_ops.div(u, w, name=x_name) 686 687 # Expected output: [nan, 4.0] 688 y = math_ops.multiply(w, x, name=y_name) 689 690 z = math_ops.multiply(y, y, name=z_name) 691 692 u.initializer.run() 693 v.initializer.run() 694 695 _, dump = self._debug_run_and_get_dump( 696 sess, z, 697 expected_partition_graph_count=self._expected_partition_graph_count) 698 699 # Find all "offending tensors". 700 bad_data = dump.find(debug_data.has_inf_or_nan, 701 exclude_node_names=".*/x$") 702 703 # Verify that the nodes with bad values are caught through running find 704 # on the debug dump. 705 self.assertLessEqual(2, len(bad_data)) 706 # Assert that the node `x` should have been excluded. 707 node_names = [datum.node_name for datum in bad_data] 708 self.assertIn(y_name, node_names) 709 self.assertIn(z_name, node_names) 710 711 first_bad_datum = dump.find( 712 debug_data.has_inf_or_nan, first_n=1, exclude_node_names=".*/x$") 713 self.assertEqual(1, len(first_bad_datum)) 714 715 def _session_run_for_graph_structure_lookup(self): 716 with session.Session(config=no_rewrite_session_config()) as sess: 717 u_name = "testDumpGraphStructureLookup/u" 718 v_name = "testDumpGraphStructureLookup/v" 719 w_name = "testDumpGraphStructureLookup/w" 720 721 u_init = constant_op.constant([2.0, 4.0]) 722 u = variables.VariableV1(u_init, name=u_name) 723 v = math_ops.add(u, u, name=v_name) 724 w = math_ops.add(v, v, name=w_name) 725 726 u.initializer.run() 727 728 _, dump = self._debug_run_and_get_dump( 729 sess, w, 730 expected_partition_graph_count=self._expected_partition_graph_count) 731 732 return u_name, v_name, w_name, dump 733 734 def testGraphStructureLookupGivesDevicesAndNodesInfo(self): 735 u_name, _, _, dump = self._session_run_for_graph_structure_lookup() 736 737 # Test num_devices(). 738 self.assertEqual(self._expected_num_devices, len(dump.devices())) 739 740 # Test node_device(). 741 self.assertEqual(self._main_device, dump.node_device(u_name)) 742 743 with self.assertRaisesRegexp(ValueError, 744 "does not exist in partition graphs"): 745 dump.node_device(u_name + "foo") 746 747 # Test node_exists(). 748 self.assertTrue(dump.node_exists(u_name)) 749 self.assertTrue(dump.node_exists(u_name + "/read")) 750 self.assertFalse(dump.node_exists(u_name + "/read" + "/foo")) 751 752 def testGraphStructureLookupGivesNodesAndAttributes(self): 753 u_name, _, _, dump = self._session_run_for_graph_structure_lookup() 754 755 u_read_name = u_name + "/read" 756 757 # Test node name list lookup of the DebugDumpDir object. 758 if test_util.gpu_device_name(): 759 node_names = dump.nodes( 760 device_name="/job:localhost/replica:0/task:0/device:GPU:0") 761 else: 762 node_names = dump.nodes() 763 self.assertTrue(u_name in node_names) 764 self.assertTrue(u_read_name in node_names) 765 766 # Test querying node attributes. 767 u_attr = dump.node_attributes(u_name) 768 self.assertEqual(dtypes.float32, u_attr["dtype"].type) 769 self.assertEqual(1, len(u_attr["shape"].shape.dim)) 770 self.assertEqual(2, u_attr["shape"].shape.dim[0].size) 771 772 with self.assertRaisesRegexp( 773 ValueError, r"None of the .* device\(s\) has a node named "): 774 dump.node_attributes("foo") 775 776 def testGraphStructureLookupGivesDebugWatchKeys(self): 777 u_name, v_name, w_name, dump = ( 778 self._session_run_for_graph_structure_lookup()) 779 780 # Test querying the debug watch keys with node names. 781 self.assertEqual(["%s:0:DebugIdentity" % u_name], 782 dump.debug_watch_keys(u_name)) 783 self.assertEqual(["%s:0:DebugIdentity" % v_name], 784 dump.debug_watch_keys(v_name)) 785 self.assertEqual(["%s:0:DebugIdentity" % w_name], 786 dump.debug_watch_keys(w_name)) 787 self.assertEqual([], dump.debug_watch_keys("foo")) 788 789 # Test querying debug datum instances from debug watch. 790 u_data = dump.watch_key_to_data(dump.debug_watch_keys(u_name)[0]) 791 self.assertEqual(1, len(u_data)) 792 self.assertEqual(u_name, u_data[0].node_name) 793 self.assertEqual(0, u_data[0].output_slot) 794 self.assertEqual("DebugIdentity", u_data[0].debug_op) 795 self.assertGreaterEqual(u_data[0].timestamp, 0) 796 self.assertEqual([], dump.watch_key_to_data("foo")) 797 798 def testGraphStructureLookupGivesNodeInputsAndRecipients(self): 799 u_name, v_name, w_name, dump = ( 800 self._session_run_for_graph_structure_lookup()) 801 802 u_read_name = u_name + "/read" 803 804 # Test the inputs lookup of the DebugDumpDir object. 805 self.assertEqual([], dump.node_inputs(u_name)) 806 self.assertEqual([u_name], dump.node_inputs(u_read_name)) 807 self.assertEqual([u_read_name] * 2, dump.node_inputs(v_name)) 808 self.assertEqual([v_name] * 2, dump.node_inputs(w_name)) 809 810 self.assertEqual([], dump.node_inputs(u_name, is_control=True)) 811 self.assertEqual([], dump.node_inputs(u_read_name, is_control=True)) 812 self.assertEqual([], dump.node_inputs(v_name, is_control=True)) 813 self.assertEqual([], dump.node_inputs(w_name, is_control=True)) 814 815 # Test the outputs recipient lookup of the DebugDumpDir object. 816 self.assertTrue(u_read_name in dump.node_recipients(u_name)) 817 self.assertEqual(2, dump.node_recipients(u_read_name).count(v_name)) 818 self.assertEqual(2, dump.node_recipients(v_name).count(w_name)) 819 820 self.assertEqual([], dump.node_recipients(u_name, is_control=True)) 821 self.assertEqual([], dump.node_recipients(u_read_name, is_control=True)) 822 self.assertEqual([], dump.node_recipients(v_name, is_control=True)) 823 self.assertEqual([], dump.node_recipients(w_name, is_control=True)) 824 825 # Test errors raised on invalid node names. 826 with self.assertRaisesRegexp( 827 ValueError, r"None of the .* device\(s\) has a node named "): 828 dump.node_inputs(u_name + "foo") 829 with self.assertRaisesRegexp( 830 ValueError, r"None of the .* device\(s\) has a node named "): 831 dump.node_recipients(u_name + "foo") 832 833 # Test transitive_inputs(). 834 self.assertEqual([], dump.transitive_inputs(u_name)) 835 self.assertEqual([u_name], dump.transitive_inputs(u_read_name)) 836 self.assertEqual( 837 set([u_name, u_read_name]), set(dump.transitive_inputs(v_name))) 838 self.assertEqual( 839 set([u_name, u_read_name, v_name]), set(dump.transitive_inputs(w_name))) 840 841 with self.assertRaisesRegexp( 842 ValueError, r"None of the .* device\(s\) has a node named "): 843 dump.transitive_inputs(u_name + "foo") 844 845 def testGraphStructureLookupWithoutPartitionGraphsDoesNotErrorOut(self): 846 _, _, _, dump = self._session_run_for_graph_structure_lookup() 847 848 # Now load the dump again, without the partition graphs, so we can check 849 # errors are not raised because the partition graphs are loaded from the 850 # dump directory. 851 dump = debug_data.DebugDumpDir(self._dump_root, validate=False) 852 self.assertTrue(dump.loaded_partition_graphs()) 853 854 def testGraphPathFindingOnControlEdgesWorks(self): 855 with session.Session(config=no_rewrite_session_config()) as sess: 856 v1 = variables.VariableV1(1.0, name="v1") 857 v2 = variables.VariableV1(2.0, name="v2") 858 v3 = variables.VariableV1(3.0, name="v3") 859 a = math_ops.add(v1, v2, name="a") 860 with ops.control_dependencies([a]): 861 c = math_ops.subtract(v3, v3, name="c") 862 863 sess.run(variables.global_variables_initializer()) 864 _, dump = self._debug_run_and_get_dump(sess, c) 865 866 self.assertEqual(["v1", "v1/read", "a", "c"], 867 dump.find_some_path("v1", "c")) 868 self.assertIsNone(dump.find_some_path("v1", "c", include_control=False)) 869 870 def testGraphPathFindingReverseRefEdgeWorks(self): 871 with session.Session(config=no_rewrite_session_config()) as sess: 872 v = variables.VariableV1(10.0, name="v") 873 delta = variables.VariableV1(1.0, name="delta") 874 inc_v = state_ops.assign_add(v, delta, name="inc_v") 875 876 sess.run(variables.global_variables_initializer()) 877 _, dump = self._debug_run_and_get_dump(sess, inc_v) 878 879 self.assertEqual( 880 ["delta", "delta/read", "inc_v", "v"], 881 dump.find_some_path("delta", "v", include_reversed_ref=True)) 882 self.assertIsNone(dump.find_some_path("delta", "v")) 883 884 def testCausalityCheckOnDumpsDetectsWrongTemporalOrder(self): 885 with session.Session(config=no_rewrite_session_config()) as sess: 886 u_name = "testDumpCausalityCheck/u" 887 v_name = "testDumpCausalityCheck/v" 888 w_name = "testDumpCausalityCheck/w" 889 890 u_init = constant_op.constant([2.0, 4.0]) 891 u = variables.VariableV1(u_init, name=u_name) 892 v = math_ops.add(u, u, name=v_name) 893 w = math_ops.add(v, v, name=w_name) 894 895 u.initializer.run() 896 897 run_options = config_pb2.RunOptions(output_partition_graphs=True) 898 debug_utils.watch_graph( 899 run_options, 900 sess.graph, 901 debug_ops=["DebugIdentity"], 902 debug_urls=self._debug_urls()) 903 904 run_metadata = config_pb2.RunMetadata() 905 sess.run(w, options=run_options, run_metadata=run_metadata) 906 907 self.assertEqual(self._expected_partition_graph_count, 908 len(run_metadata.partition_graphs)) 909 910 # First, loading the original dump without supplying the 911 # partition_graphs should not cause a LookupError, validation occurs 912 # only with partition_graphs loaded. 913 debug_data.DebugDumpDir(self._dump_root) 914 915 # Now, loading the original dump with partition graphs supplied should 916 # succeed. The validation should pass quietly. 917 dump = debug_data.DebugDumpDir( 918 self._dump_root, partition_graphs=run_metadata.partition_graphs) 919 920 # Get the dump file names and compute their timestamps. 921 self.assertEqual( 922 1, len(dump.get_tensor_file_paths(v_name, 0, "DebugIdentity"))) 923 v_file_path = dump.get_tensor_file_paths(v_name, 0, "DebugIdentity")[0] 924 925 self.assertEqual( 926 1, len(dump.get_tensor_file_paths(w_name, 0, "DebugIdentity"))) 927 w_file_path = dump.get_tensor_file_paths(w_name, 0, "DebugIdentity")[0] 928 929 v_timestamp = int(v_file_path[v_file_path.rindex("_") + 1:]) 930 w_timestamp = int(w_file_path[w_file_path.rindex("_") + 1:]) 931 932 # Swap and slightly shift the time stamps of the last two dumped tensors, 933 # to simulate "causality violation", which can happen if the dump 934 # directory contains incomplete data and/or mixes data from different 935 # Session.run() calls. 936 v_file_path_1 = v_file_path[:v_file_path.rindex( 937 "_")] + "_%d" % w_timestamp 938 w_file_path_1 = w_file_path[:w_file_path.rindex("_")] + "_%d" % ( 939 v_timestamp - 1) 940 941 os.rename(v_file_path, v_file_path_1) 942 os.rename(w_file_path, w_file_path_1) 943 944 # Load the dump directory again. Now a ValueError is expected to be 945 # raised due to the timestamp swap. 946 with self.assertRaisesRegexp(ValueError, "Causality violated"): 947 dump = debug_data.DebugDumpDir( 948 self._dump_root, partition_graphs=run_metadata.partition_graphs) 949 950 # Loading the dump directory with kwarg "validate" set explicitly to 951 # False should get rid of the error. 952 dump = debug_data.DebugDumpDir( 953 self._dump_root, 954 partition_graphs=run_metadata.partition_graphs, 955 validate=False) 956 957 # Next, set the two times stamps to be the same, which should be fine. 958 v_file_path_2 = v_file_path[:v_file_path.rindex( 959 "_")] + "_%d" % w_timestamp 960 w_file_path_2 = w_file_path[:w_file_path.rindex( 961 "_")] + "_%d" % w_timestamp 962 963 os.rename(v_file_path_1, v_file_path_2) 964 os.rename(w_file_path_1, w_file_path_2) 965 966 debug_data.DebugDumpDir( 967 self._dump_root, partition_graphs=run_metadata.partition_graphs) 968 969 def testWatchingOnlyOneOfTwoOutputSlotsDoesNotLeadToCausalityFailure(self): 970 with session.Session() as sess: 971 x_name = "oneOfTwoSlots/x" 972 u_name = "oneOfTwoSlots/u" 973 v_name = "oneOfTwoSlots/v" 974 w_name = "oneOfTwoSlots/w" 975 y_name = "oneOfTwoSlots/y" 976 977 x = variables.VariableV1([1, 3, 3, 7], dtype=dtypes.int32, name=x_name) 978 sess.run(x.initializer) 979 980 unique_x, indices, _ = array_ops.unique_with_counts(x, name=u_name) 981 982 v = math_ops.add(unique_x, unique_x, name=v_name) 983 w = math_ops.add(indices, indices, name=w_name) 984 y = math_ops.add(w, w, name=y_name) 985 986 run_options = config_pb2.RunOptions(output_partition_graphs=True) 987 # Watch only the first output slot of u, even though it has two output 988 # slots. 989 debug_utils.add_debug_tensor_watch( 990 run_options, u_name, 0, debug_urls=self._debug_urls()) 991 debug_utils.add_debug_tensor_watch( 992 run_options, w_name, 0, debug_urls=self._debug_urls()) 993 debug_utils.add_debug_tensor_watch( 994 run_options, y_name, 0, debug_urls=self._debug_urls()) 995 996 run_metadata = config_pb2.RunMetadata() 997 sess.run([v, y], options=run_options, run_metadata=run_metadata) 998 999 dump = debug_data.DebugDumpDir( 1000 self._dump_root, 1001 partition_graphs=run_metadata.partition_graphs, 1002 validate=True) 1003 1004 self.assertAllClose([1, 3, 7], 1005 dump.get_tensors(u_name, 0, "DebugIdentity")[0]) 1006 1007 def testOutputSlotWithoutOutgoingEdgeCanBeWatched(self): 1008 """Test watching output slots not attached to any outgoing edges.""" 1009 1010 with session.Session(config=no_rewrite_session_config()) as sess: 1011 u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]]) 1012 u = constant_op.constant(u_init_val, shape=[2, 2], name="u") 1013 1014 # Create a control edge from a node with an output: From u to z. 1015 # Node u will get executed only because of the control edge. The output 1016 # tensor u:0 is not attached to any outgoing edge in the graph. This test 1017 # checks that the debugger can watch such a tensor. 1018 with ops.control_dependencies([u]): 1019 z = control_flow_ops.no_op(name="z") 1020 1021 _, dump = self._debug_run_and_get_dump(sess, z) 1022 1023 # Assert that the DebugIdentity watch on u works properly. 1024 self.assertEqual(1, len(dump.dumped_tensor_data)) 1025 datum = dump.dumped_tensor_data[0] 1026 self.assertEqual("u", datum.node_name) 1027 self.assertEqual(0, datum.output_slot) 1028 self.assertEqual("DebugIdentity", datum.debug_op) 1029 self.assertAllClose([[5.0, 3.0], [-1.0, 0.0]], datum.get_tensor()) 1030 1031 def testWatchingVariableUpdateOpsSeesUpdatedValues(self): 1032 """Watch output slots on Variable-updating ops, with no emitted edges.""" 1033 1034 with session.Session(config=no_rewrite_session_config()) as sess: 1035 u_init = constant_op.constant(10.0) 1036 u = variables.VariableV1(u_init, name="gdo/u") 1037 v_init = constant_op.constant(20.0) 1038 v = variables.VariableV1(v_init, name="gdo/v") 1039 1040 w = math_ops.multiply(u, v, name="gdo/w") 1041 # gdo stands for GradientDescentOptimizer. 1042 1043 train_op = gradient_descent.GradientDescentOptimizer( 1044 learning_rate=0.1).minimize( 1045 w, name="gdo/train") 1046 1047 u.initializer.run() 1048 v.initializer.run() 1049 1050 _, dump = self._debug_run_and_get_dump(sess, train_op) 1051 1052 update_u_data = dump.watch_key_to_data( 1053 "gdo/train/update_gdo/u/ApplyGradientDescent:0:DebugIdentity") 1054 self.assertEqual(1, len(update_u_data)) 1055 1056 # Gradient descent on u: w = u * v, so dw / du = v. 1057 # Updated value of u should be: 1058 # 10.0 - learning_rate * v = 10.0 - 0.1 * 20.0 = 8.0 1059 self.assertAllClose(8.0, update_u_data[0].get_tensor()) 1060 1061 update_v_data = dump.watch_key_to_data( 1062 "gdo/train/update_gdo/v/ApplyGradientDescent:0:DebugIdentity") 1063 self.assertEqual(1, len(update_v_data)) 1064 1065 # Gradient descent on u: w = u * v, so dw / dv = u. 1066 # Updated value of u should be: 1067 # 20.0 - learning_rate * u = 20.0 - 0.1 * 10.0 = 19.0 1068 self.assertAllClose(19.0, update_v_data[0].get_tensor()) 1069 1070 # Verify that the Variables u and v are updated properly. 1071 self.assertAllClose(8.0, sess.run(u)) 1072 self.assertAllClose(19.0, sess.run(v)) 1073 1074 def testAllowsWatchingUnconnectedOutputTensor(self): 1075 """Watch an output slot not emitting any edges. 1076 1077 (Not even control edges from the node.) 1078 """ 1079 1080 with session.Session() as sess: 1081 x_init = constant_op.constant([2, 2, 3, 5, 5]) 1082 x = variables.VariableV1(x_init, name="unconnected/x") 1083 1084 # The UniqueOp (tf.unique) has two output slots. Use only slot 0 in the 1085 # graph. Let the debugger watch the unused slot 1. 1086 unique_x, _ = array_ops.unique(x, name="unconnected/unique_x") 1087 y = math_ops.add(unique_x, [0, 1, 2], name="unconnected/y") 1088 1089 x.initializer.run() 1090 1091 # Verify that only slot 0 of unique_x has recipients, while slot 1 of the 1092 # same node does not have recipients. 1093 unique_x_slot_0_recipients = [] 1094 unique_x_slot_1_recipients = [] 1095 for op in sess.graph.get_operations(): 1096 for inp in op.inputs: 1097 if inp.name == "unconnected/unique_x:0": 1098 unique_x_slot_0_recipients.append(op.name) 1099 elif inp.name == "unconnected/unique_x:1": 1100 unique_x_slot_1_recipients.append(op.name) 1101 1102 self.assertEqual(["unconnected/y"], unique_x_slot_0_recipients) 1103 self.assertEqual([], unique_x_slot_1_recipients) 1104 1105 y_result, dump = self._debug_run_and_get_dump(sess, y) 1106 self.assertAllClose([2, 4, 7], y_result) 1107 1108 # Assert that the connected slot (slot 0) is dumped properly. 1109 unique_x_slot_0_dumps = dump.watch_key_to_data( 1110 "unconnected/unique_x:0:DebugIdentity") 1111 self.assertEqual(1, len(unique_x_slot_0_dumps)) 1112 self.assertEqual("unconnected/unique_x", 1113 unique_x_slot_0_dumps[0].node_name) 1114 self.assertEqual(0, unique_x_slot_0_dumps[0].output_slot) 1115 self.assertAllClose([2, 3, 5], unique_x_slot_0_dumps[0].get_tensor()) 1116 1117 # Assert that the unconnected slot (slot 1) is dumped properly. 1118 unique_x_slot_1_dumps = dump.watch_key_to_data( 1119 "unconnected/unique_x:1:DebugIdentity") 1120 self.assertEqual(1, len(unique_x_slot_1_dumps)) 1121 self.assertEqual("unconnected/unique_x", 1122 unique_x_slot_1_dumps[0].node_name) 1123 self.assertEqual(1, unique_x_slot_1_dumps[0].output_slot) 1124 self.assertAllClose([0, 0, 1, 2, 2], 1125 unique_x_slot_1_dumps[0].get_tensor()) 1126 1127 def testSuccessiveDebuggingRunsIncreasesCounters(self): 1128 """Test repeated Session.run() calls with debugger increments counters.""" 1129 1130 with session.Session() as sess: 1131 ph = array_ops.placeholder(dtypes.float32, name="successive/ph") 1132 x = array_ops.transpose(ph, name="mismatch/x") 1133 y = array_ops.squeeze(ph, name="mismatch/y") 1134 1135 _, dump1 = self._debug_run_and_get_dump( 1136 sess, x, feed_dict={ph: np.array([[7.0, 8.0]])}, global_step=1) 1137 self.assertEqual(1, dump1.core_metadata.global_step) 1138 self.assertGreaterEqual(dump1.core_metadata.session_run_index, 0) 1139 self.assertEqual(0, dump1.core_metadata.executor_step_index) 1140 self.assertEqual([ph.name], dump1.core_metadata.input_names) 1141 self.assertEqual([x.name], dump1.core_metadata.output_names) 1142 self.assertEqual([], dump1.core_metadata.target_nodes) 1143 file_io.delete_recursively(self._dump_root) 1144 1145 # Calling run() with the same feed, same output and same debug watch 1146 # options should increment both session_run_index and 1147 # executor_step_index. 1148 _, dump2 = self._debug_run_and_get_dump( 1149 sess, x, feed_dict={ph: np.array([[7.0, 8.0]])}, global_step=2) 1150 self.assertEqual(2, dump2.core_metadata.global_step) 1151 self.assertEqual(dump1.core_metadata.session_run_index + 1, 1152 dump2.core_metadata.session_run_index) 1153 self.assertEqual(dump1.core_metadata.executor_step_index + 1, 1154 dump2.core_metadata.executor_step_index) 1155 self.assertEqual([ph.name], dump2.core_metadata.input_names) 1156 self.assertEqual([x.name], dump2.core_metadata.output_names) 1157 self.assertEqual([], dump2.core_metadata.target_nodes) 1158 file_io.delete_recursively(self._dump_root) 1159 1160 run_options = config_pb2.RunOptions(output_partition_graphs=True) 1161 debug_utils.watch_graph( 1162 run_options, sess.graph, debug_urls=self._debug_urls(), global_step=3) 1163 1164 # Calling run() with a different output should increment 1165 # session_run_index, but not executor_step_index. 1166 _, dump3 = self._debug_run_and_get_dump( 1167 sess, y, feed_dict={ph: np.array([[7.0, 8.0]])}, global_step=3) 1168 self.assertEqual(3, dump3.core_metadata.global_step) 1169 self.assertEqual(dump2.core_metadata.session_run_index + 1, 1170 dump3.core_metadata.session_run_index) 1171 self.assertEqual(0, dump3.core_metadata.executor_step_index) 1172 self.assertEqual([ph.name], dump3.core_metadata.input_names) 1173 self.assertEqual([y.name], dump3.core_metadata.output_names) 1174 self.assertEqual([], dump3.core_metadata.target_nodes) 1175 1176 def testDebuggingDuringOpError(self): 1177 """Test the debug tensor dumping when error occurs in graph runtime.""" 1178 1179 with session.Session() as sess: 1180 ph = array_ops.placeholder(dtypes.float32, name="mismatch/ph") 1181 x = array_ops.transpose(ph, name="mismatch/x") 1182 m = constant_op.constant( 1183 np.array( 1184 [[1.0, 2.0]], dtype=np.float32), name="mismatch/m") 1185 y = math_ops.matmul(m, x, name="mismatch/y") 1186 1187 run_options = config_pb2.RunOptions(output_partition_graphs=True) 1188 debug_utils.watch_graph( 1189 run_options, 1190 sess.graph, 1191 debug_ops=["DebugIdentity"], 1192 debug_urls=self._debug_urls()) 1193 1194 with self.assertRaises(errors.OpError): 1195 sess.run(y, 1196 options=run_options, 1197 feed_dict={ph: np.array([[-3.0], [0.0]])}) 1198 1199 dump = debug_data.DebugDumpDir(self._dump_root) 1200 1201 self.assertGreaterEqual(dump.core_metadata.session_run_index, 0) 1202 self.assertGreaterEqual(dump.core_metadata.executor_step_index, 0) 1203 self.assertEqual([ph.name], dump.core_metadata.input_names) 1204 self.assertEqual([y.name], dump.core_metadata.output_names) 1205 self.assertEqual([], dump.core_metadata.target_nodes) 1206 1207 # Despite the fact that the run() call errored out and partition_graphs 1208 # are not available via run_metadata, the partition graphs should still 1209 # have been loaded from the dump directory. 1210 self.assertTrue(dump.loaded_partition_graphs()) 1211 1212 m_dumps = dump.watch_key_to_data("mismatch/m:0:DebugIdentity") 1213 self.assertEqual(1, len(m_dumps)) 1214 self.assertAllClose(np.array([[1.0, 2.0]]), m_dumps[0].get_tensor()) 1215 1216 x_dumps = dump.watch_key_to_data("mismatch/x:0:DebugIdentity") 1217 self.assertEqual(1, len(x_dumps)) 1218 self.assertAllClose(np.array([[-3.0, 0.0]]), x_dumps[0].get_tensor()) 1219 1220 def testDebugNumericSummaryOnInitializedTensorGivesCorrectResult(self): 1221 with session.Session(config=no_rewrite_session_config()) as sess: 1222 a = variables.VariableV1( 1223 [ 1224 np.nan, np.nan, 0.0, 0.0, 0.0, -1.0, -3.0, 3.0, 7.0, -np.inf, 1225 -np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.nan, np.nan 1226 ], 1227 dtype=np.float32, 1228 name="numeric_summary/a") 1229 b = variables.VariableV1( 1230 [0.0] * 18, dtype=np.float32, name="numeric_summary/b") 1231 c = math_ops.add(a, b, name="numeric_summary/c") 1232 1233 sess.run(variables.global_variables_initializer()) 1234 1235 _, dump = self._debug_run_and_get_dump( 1236 sess, c, debug_ops=["DebugNumericSummary"]) 1237 self.assertTrue(dump.loaded_partition_graphs()) 1238 1239 self.assertAllClose([[ 1240 1.0, 18.0, 4.0, 2.0, 2.0, 3.0, 2.0, 5.0, -3.0, 7.0, 0.85714286, 1241 8.97959184, 1.0, 1.0, 18.0 1242 ]], dump.get_tensors("numeric_summary/a/read", 0, "DebugNumericSummary")) 1243 1244 def testDebugNumericSummaryOnUninitializedTensorGivesCorrectResult(self): 1245 with session.Session() as sess: 1246 a = variables.VariableV1( 1247 [42], dtype=np.float32, name="numeric_summary_uninit/a") 1248 1249 _, dump = self._debug_run_and_get_dump( 1250 sess, a.initializer, debug_ops=["DebugNumericSummary"]) 1251 1252 self.assertTrue(dump.loaded_partition_graphs()) 1253 1254 # DebugNumericSummary output should reflect the uninitialized state of 1255 # the watched tensor. 1256 numeric_summary = dump.get_tensors("numeric_summary_uninit/a", 0, 1257 "DebugNumericSummary")[0] 1258 self.assertAllClose([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 1259 numeric_summary[0:8]) 1260 # Check dtype (index 12), ndims (index 13) and dimension sizes (index 1261 # 14+). 1262 self.assertAllClose([1.0, 1.0, 1.0], numeric_summary[12:]) 1263 self.assertTrue(np.isinf(numeric_summary[8])) 1264 self.assertGreater(numeric_summary[8], 0.0) 1265 self.assertTrue(np.isinf(numeric_summary[9])) 1266 self.assertLess(numeric_summary[9], 0.0) 1267 self.assertTrue(np.isnan(numeric_summary[10])) 1268 self.assertTrue(np.isnan(numeric_summary[11])) 1269 1270 def testDebugNumericSummaryFailureIsToleratedWhenOrdered(self): 1271 with session.Session() as sess: 1272 a = variables.VariableV1("1", name="a") 1273 b = variables.VariableV1("3", name="b") 1274 c = variables.VariableV1("2", name="c") 1275 1276 d = math_ops.add(a, b, name="d") 1277 e = math_ops.add(d, c, name="e") 1278 n = parsing_ops.string_to_number(e, name="n") 1279 m = math_ops.add(n, n, name="m") 1280 1281 sess.run(variables.global_variables_initializer()) 1282 1283 # Using DebugNumericSummary on sess.run(m) with the default 1284 # tolerate_debug_op_creation_failures=False should error out due to the 1285 # presence of string-dtype Tensors in the graph. 1286 run_metadata = config_pb2.RunMetadata() 1287 run_options = config_pb2.RunOptions(output_partition_graphs=True) 1288 debug_utils.watch_graph( 1289 run_options, 1290 sess.graph, 1291 debug_ops=["DebugNumericSummary"], 1292 debug_urls=self._debug_urls()) 1293 with self.assertRaises(errors.FailedPreconditionError): 1294 sess.run(m, options=run_options, run_metadata=run_metadata) 1295 1296 # Using tolerate_debug_op_creation_failures=True should get rid of the 1297 # error. 1298 m_result, dump = self._debug_run_and_get_dump( 1299 sess, m, debug_ops=["DebugNumericSummary"], 1300 tolerate_debug_op_creation_failures=True) 1301 self.assertEqual(264, m_result) 1302 1303 # The integer-dtype Tensors in the graph should have been dumped 1304 # properly. 1305 self.assertIn("n:0:DebugNumericSummary", dump.debug_watch_keys("n")) 1306 self.assertIn("m:0:DebugNumericSummary", dump.debug_watch_keys("m")) 1307 1308 def testDebugNumericSummaryInvalidAttributesStringAreCaught(self): 1309 with session.Session(config=no_rewrite_session_config()) as sess: 1310 a = variables.VariableV1(10.0, name="a") 1311 b = variables.VariableV1(0.0, name="b") 1312 c = variables.VariableV1(0.0, name="c") 1313 1314 x = math_ops.divide(a, b, name="x") 1315 y = math_ops.multiply(x, c, name="y") 1316 1317 sess.run(variables.global_variables_initializer()) 1318 1319 run_metadata = config_pb2.RunMetadata() 1320 run_options = config_pb2.RunOptions(output_partition_graphs=True) 1321 debug_utils.watch_graph( 1322 run_options, 1323 sess.graph, 1324 debug_ops=["DebugNumericSummary(foo=1.0)"], 1325 debug_urls=self._debug_urls()) 1326 with self.assertRaisesRegexp( 1327 errors.FailedPreconditionError, 1328 r"1 attribute key\(s\) were not valid for debug node " 1329 r"__dbg_.:0_0_DebugNumericSummary: foo"): 1330 sess.run(y, options=run_options, run_metadata=run_metadata) 1331 1332 run_options = config_pb2.RunOptions(output_partition_graphs=True) 1333 debug_utils.watch_graph( 1334 run_options, 1335 sess.graph, 1336 debug_ops=["DebugNumericSummary(foo=1.0; bar=false)"], 1337 debug_urls=self._debug_urls()) 1338 with self.assertRaisesRegexp( 1339 errors.FailedPreconditionError, 1340 r"2 attribute key\(s\) were not valid for debug node " 1341 r"__dbg_.:0_0_DebugNumericSummary:"): 1342 sess.run(y, options=run_options, run_metadata=run_metadata) 1343 1344 run_options = config_pb2.RunOptions(output_partition_graphs=True) 1345 debug_utils.watch_graph( 1346 run_options, 1347 sess.graph, 1348 debug_ops=["DebugNumericSummary(foo=1.0; mute_if_healthy=true)"], 1349 debug_urls=self._debug_urls()) 1350 with self.assertRaisesRegexp( 1351 errors.FailedPreconditionError, 1352 r"1 attribute key\(s\) were not valid for debug node " 1353 r"__dbg_.:0_0_DebugNumericSummary: foo"): 1354 sess.run(y, options=run_options, run_metadata=run_metadata) 1355 1356 def testDebugNumericSummaryMuteOnHealthyMutesOnlyHealthyTensorDumps(self): 1357 with session.Session(config=no_rewrite_session_config()) as sess: 1358 a = variables.VariableV1(10.0, name="a") 1359 b = variables.VariableV1(0.0, name="b") 1360 c = variables.VariableV1(0.0, name="c") 1361 1362 x = math_ops.divide(a, b, name="x") 1363 y = math_ops.multiply(x, c, name="y") 1364 1365 sess.run(variables.global_variables_initializer()) 1366 1367 # Here, validate=False is necessary to avoid causality check error. 1368 # TODO(cais): Maybe let DebugDumpDir constructor automatically ignore 1369 # debug ops with mute_if_healthy=false attribute during validation. 1370 _, dump = self._debug_run_and_get_dump( 1371 sess, y, debug_ops=["DebugNumericSummary(mute_if_healthy=true)"], 1372 validate=False) 1373 1374 self.assertLessEqual(2, dump.size) 1375 self.assertAllClose([[ 1376 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, np.inf, -np.inf, np.nan, 1377 np.nan, 1.0, 0.0 1378 ]], dump.get_tensors("x", 0, "DebugNumericSummary")) 1379 self.assertAllClose([[ 1380 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.inf, -np.inf, np.nan, 1381 np.nan, 1.0, 0.0 1382 ]], dump.get_tensors("y", 0, "DebugNumericSummary")) 1383 1384 # Another run with the default mute_if_healthy (false) value should 1385 # dump all the tensors. 1386 file_io.delete_recursively(self._dump_root) 1387 _, dump = self._debug_run_and_get_dump( 1388 sess, y, debug_ops=["DebugNumericSummary()"]) 1389 self.assertLessEqual(8, dump.size) 1390 1391 def testDebugNumericSummaryMuteOnHealthyAndCustomBoundsWork(self): 1392 with session.Session() as sess: 1393 a = variables.VariableV1([10.0, 10.0], name="a") 1394 b = variables.VariableV1([10.0, 2.0], name="b") 1395 1396 x = math_ops.add(a, b, name="x") # [20.0, 12.0] 1397 y = math_ops.divide(x, b, name="y") # [2.0, 6.0] 1398 1399 sess.run(variables.global_variables_initializer()) 1400 1401 # Here, validate=False is necessary to avoid causality check error. 1402 # TODO(cais): Maybe let DebugDumpDir constructor automatically ignore 1403 # debug ops with mute_if_healthy=false attribute during validation. 1404 _, dump = self._debug_run_and_get_dump( 1405 sess, y, debug_ops=[ 1406 "DebugNumericSummary(mute_if_healthy=true; upper_bound=11.0)"], 1407 validate=False) 1408 1409 self.assertEqual(1, dump.size) 1410 self.assertAllClose([[ 1411 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 12.0, 20.0, 16.0, 16.0, 1.0, 1412 1.0, 2.0]], dump.get_tensors("x", 0, "DebugNumericSummary")) 1413 1414 def testDebugQueueOpsDoesNotoErrorOut(self): 1415 with session.Session() as sess: 1416 q = data_flow_ops.FIFOQueue(3, "float", name="fifo_queue") 1417 q_init = q.enqueue_many(([101.0, 202.0, 303.0],), name="enqueue_many") 1418 1419 _, dump = self._debug_run_and_get_dump(sess, q_init) 1420 self.assertTrue(dump.loaded_partition_graphs()) 1421 1422 fifo_queue_tensor = dump.get_tensors("fifo_queue", 0, "DebugIdentity")[0] 1423 self.assertIsInstance(fifo_queue_tensor, 1424 debug_data.InconvertibleTensorProto) 1425 self.assertTrue(fifo_queue_tensor.initialized) 1426 self.assertAllClose( 1427 [101.0, 202.0, 303.0], 1428 dump.get_tensors("enqueue_many/component_0", 0, "DebugIdentity")[0]) 1429 1430 def testLookUpNodePythonTracebackWorks(self): 1431 with session.Session() as sess: 1432 u_init = constant_op.constant(10.0) 1433 u = variables.VariableV1(u_init, name="traceback/u") 1434 v_init = constant_op.constant(20.0) 1435 v = variables.VariableV1(v_init, name="traceback/v") 1436 1437 w = math_ops.multiply(u, v, name="traceback/w") 1438 1439 sess.run(variables.global_variables_initializer()) 1440 _, dump = self._debug_run_and_get_dump(sess, w) 1441 1442 # Prior to setting the Python graph, attempts to do traceback lookup 1443 # should lead to exceptions. 1444 with self.assertRaisesRegexp( 1445 LookupError, "Python graph is not available for traceback lookup"): 1446 dump.node_traceback("traceback/w") 1447 1448 dump.set_python_graph(sess.graph) 1449 1450 # After setting the Python graph, attempts to look up nonexistent nodes 1451 # should lead to exceptions. 1452 with self.assertRaisesRegexp(KeyError, 1453 r"Cannot find node \"foo\" in Python graph"): 1454 dump.node_traceback("foo") 1455 1456 # Lookup should work with node name input. 1457 traceback = dump.node_traceback("traceback/w") 1458 self.assertIsInstance(traceback, tuple) 1459 self.assertGreater(len(traceback), 0) 1460 for trace in traceback: 1461 self.assertIsInstance(trace, tuple) 1462 1463 # Lookup should also work with tensor name input. 1464 traceback = dump.node_traceback("traceback/w:0") 1465 self.assertIsInstance(traceback, tuple) 1466 self.assertGreater(len(traceback), 0) 1467 for trace in traceback: 1468 self.assertIsInstance(trace, tuple) 1469 1470 1471class DebugConcurrentRunCallsTest(test_util.TensorFlowTestCase): 1472 """Test for debugging concurrent Session.run() calls.""" 1473 1474 def _get_concurrent_debug_urls(self): 1475 """Abstract method to generate debug URLs for concurrent debugged runs.""" 1476 raise NotImplementedError( 1477 "_get_concurrent_debug_urls is not implemented in the base test class") 1478 1479 def testDebugConcurrentVariableUpdates(self): 1480 if test.is_gpu_available(): 1481 self.skipTest("No testing concurrent runs on a single GPU.") 1482 1483 with session.Session() as sess: 1484 v = variables.VariableV1(30.0, name="v") 1485 constants = [] 1486 for i in range(self._num_concurrent_runs): 1487 constants.append(constant_op.constant(1.0, name="c%d" % i)) 1488 incs = [ 1489 state_ops.assign_add( 1490 v, c, use_locking=True, name=("inc%d" % i)) 1491 for (i, c) in enumerate(constants) 1492 ] 1493 sess.run(v.initializer) 1494 1495 concurrent_debug_urls = self._get_concurrent_debug_urls() 1496 1497 def inc_job(index): 1498 run_options = config_pb2.RunOptions(output_partition_graphs=True) 1499 debug_utils.watch_graph( 1500 run_options, sess.graph, debug_urls=concurrent_debug_urls[index]) 1501 for _ in range(100): 1502 sess.run(incs[index], options=run_options) 1503 1504 inc_threads = [] 1505 for index in range(self._num_concurrent_runs): 1506 inc_thread = threading.Thread(target=functools.partial(inc_job, index)) 1507 inc_thread.start() 1508 inc_threads.append(inc_thread) 1509 for inc_thread in inc_threads: 1510 inc_thread.join() 1511 1512 self.assertAllClose(30.0 + 1.0 * self._num_concurrent_runs * 100, 1513 sess.run(v)) 1514 1515 all_session_run_indices = [] 1516 for index in range(self._num_concurrent_runs): 1517 dump = debug_data.DebugDumpDir(self._dump_roots[index]) 1518 self.assertTrue(dump.loaded_partition_graphs()) 1519 1520 v_data = dump.get_tensors("v", 0, "DebugIdentity") 1521 self.assertEqual(100, len(v_data)) 1522 1523 # Examine all the core metadata files 1524 core_metadata_files = glob.glob( 1525 os.path.join(self._dump_roots[index], "_tfdbg_core*")) 1526 1527 timestamps = [] 1528 session_run_indices = [] 1529 executor_step_indices = [] 1530 for core_metadata_file in core_metadata_files: 1531 with open(core_metadata_file, "rb") as f: 1532 event = event_pb2.Event() 1533 event.ParseFromString(f.read()) 1534 core_metadata = ( 1535 debug_data.extract_core_metadata_from_event_proto(event)) 1536 timestamps.append(event.wall_time) 1537 session_run_indices.append(core_metadata.session_run_index) 1538 executor_step_indices.append(core_metadata.executor_step_index) 1539 1540 all_session_run_indices.extend(session_run_indices) 1541 1542 # Assert that executor_step_index increases by one at a time. 1543 executor_step_indices = zip(timestamps, executor_step_indices) 1544 executor_step_indices = sorted( 1545 executor_step_indices, key=lambda x: x[0]) 1546 for i in range(len(executor_step_indices) - 1): 1547 self.assertEquals(executor_step_indices[i][1] + 1, 1548 executor_step_indices[i + 1][1]) 1549 1550 # Assert that session_run_index increase monotonically. 1551 session_run_indices = zip(timestamps, session_run_indices) 1552 session_run_indices = sorted(session_run_indices, key=lambda x: x[0]) 1553 for i in range(len(session_run_indices) - 1): 1554 self.assertGreater(session_run_indices[i + 1][1], 1555 session_run_indices[i][1]) 1556 1557 # Assert that the session_run_indices from the concurrent run() calls are 1558 # all unique. 1559 self.assertEqual(len(all_session_run_indices), 1560 len(set(all_session_run_indices))) 1561 1562 1563if __name__ == "__main__": 1564 googletest.main() 1565