1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/compiler/xla/service/dump.h"
17
18 #include <functional>
19 #include <memory>
20 #include <queue>
21 #include <utility>
22
23 #include "absl/strings/ascii.h"
24 #include "absl/strings/str_cat.h"
25 #include "llvm/ADT/SmallString.h"
26 #include "llvm/Support/ToolOutputFile.h"
27 #include "mlir/Support/FileUtilities.h" // from @llvm-project
28 #include "mlir/Transforms/LocationSnapshot.h" // from @llvm-project
29 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
30 #include "tensorflow/compiler/xla/service/hlo_module.h"
31 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
32 #include "tensorflow/compiler/xla/util.h"
33 #include "tensorflow/core/lib/core/status.h"
34 #include "tensorflow/core/lib/io/zlib_compression_options.h"
35 #include "tensorflow/core/lib/io/zlib_outputbuffer.h"
36 #include "tensorflow/core/lib/strings/proto_serialization.h"
37 #include "tensorflow/core/platform/env.h"
38 #include "tensorflow/core/platform/path.h"
39 #include "tensorflow/core/platform/regexp.h"
40
41 namespace xla {
42
43 namespace {
44
45 using absl::StrCat;
46 using absl::StrFormat;
47 using absl::string_view;
48
// Canonicalized view of the --xla_dump_* flags: reads the raw DebugOptions
// once, fills in values the user implied but did not state explicitly, and
// exposes ready-to-use predicates for "should we dump this module/pass/
// pipeline?".
struct CanonicalDebugOptions {
  explicit CanonicalDebugOptions(const DebugOptions& opts)
      : dump_to(opts.xla_dump_to()),
        dump_as_text(opts.xla_dump_hlo_as_text()),
        dump_as_proto(opts.xla_dump_hlo_as_proto()),
        dump_as_dot(opts.xla_dump_hlo_as_dot()),
        dump_as_html(opts.xla_dump_hlo_as_html()),
        dump_as_url(opts.xla_dump_hlo_as_url()),
        dump_fusion_visualization(opts.xla_dump_fusion_visualization()),
        dump_snapshots(opts.xla_dump_hlo_snapshots()),
        dump_include_timestamp(opts.xla_dump_include_timestamp()),
        dump_max_hlo_modules(opts.xla_dump_max_hlo_modules()),
        dump_module_metadata(opts.xla_dump_module_metadata()),
        dump_compress_protos(opts.xla_dump_compress_protos()),
        dump_hlo_metadata(!opts.xla_dump_disable_metadata()),
        dump_as_long_text(opts.xla_dump_hlo_as_long_text()) {
    // This constructor examines the values in `opts` and turns on other flags
    // based on what we think is the user's intent.  To reduce confusion about
    // what was a user-specified value versus an extrapolated value, within this
    // function we treat this struct's members as write-only, and read only from
    // `opts`.

    // Did the user specify an explicit format for dumping?
    bool output_format_other_than_url_specified =
        opts.xla_dump_hlo_as_text() || opts.xla_dump_hlo_as_proto() ||
        opts.xla_dump_hlo_as_dot() || opts.xla_dump_hlo_as_html() ||
        opts.xla_dump_hlo_snapshots();
    bool output_format_specified =
        output_format_other_than_url_specified || opts.xla_dump_hlo_as_url();

    // If we haven't specified an output format, default to dumping as text.
    if (!output_format_specified) {
      dump_as_text = true;
    }

    // Disable dumping if specified by the user.
    if (!opts.xla_detailed_logging_and_dumping()) {
      dump_to = "";
    }

    // If dump_to is empty, default to dumping to stdout ("-"), so long as some
    // dump format other than dump-as-url was specified.  If the user only
    // specified --xla_dump_hlo_as_url, then don't dump to stdout, that is
    // likely noise they don't want.
    if (opts.xla_dump_to().empty() && output_format_other_than_url_specified) {
      dump_to = "-";
    }

    // If we specified a regular expression restricting which modules to dump,
    // respect that.
    //
    // If we didn't specify which modules to dump but we passed some other flag
    // which implies dumping modules, dump all modules.
    //
    // Otherwise, don't dump any HLO modules.
    if (!opts.xla_dump_hlo_module_re().empty()) {
      // RE2 object is not copyable, and we can't capture "by move", so we
      // resort to this hack: copy the pattern string into the lambda and
      // rebuild the RE2 on each call via PartialMatch.
      std::string pattern = opts.xla_dump_hlo_module_re();
      should_dump_module = [pattern](string_view module_name) {
        return RE2::PartialMatch(module_name, pattern);
      };
    } else if (!opts.xla_dump_hlo_pass_re().empty() ||
               !opts.xla_dump_to().empty() || output_format_specified) {
      should_dump_module = [](string_view) { return true; };
    } else {
      should_dump_module = [](string_view) { return false; };
    }

    // Initialize should_dump_pass.  This one is easy: We only dump per-pass
    // data if the user asked for it explicitly.
    if (!opts.xla_dump_hlo_pass_re().empty()) {
      std::string pattern = opts.xla_dump_hlo_pass_re();
      should_dump_pass = [pattern](string_view pass_name) {
        return RE2::PartialMatch(pass_name, pattern);
      };
    } else {
      should_dump_pass = [](string_view) { return false; };
    }

    // Initialize should_dump_pipeline.  If the option was not specified, dump
    // all pipelines.  Otherwise dump only those pipelines that user asked for
    // explicitly.
    if (!opts.xla_dump_hlo_pipeline_re().empty()) {
      std::string pattern = opts.xla_dump_hlo_pipeline_re();
      should_dump_pipeline = [pattern](string_view pipeline_name) {
        return RE2::PartialMatch(pipeline_name, pattern);
      };
    } else {
      should_dump_pipeline = [](string_view) { return true; };
    }

    // Output dirs "sponge" and "test_undeclared_outputs_dir" (case-insensitive)
    // have a special meaning: Dump into the directory specified by the
    // environment variable TEST_UNDECLARED_OUTPUTS_DIR.
    std::string dump_to_lower = absl::AsciiStrToLower(dump_to);
    if (dump_to_lower == "sponge" ||
        dump_to_lower == "test_undeclared_outputs_dir") {
      if (!tensorflow::io::GetTestUndeclaredOutputsDir(&dump_to)) {
        LOG(ERROR) << "--xla_dump_to=" << opts.xla_dump_to()
                   << ", but environment variable TEST_UNDECLARED_OUTPUTS_DIR "
                      "is not set, so cannot dump anywhere.";
        // With nowhere to dump, disable all three predicates.
        should_dump_module = [](string_view) { return false; };
        should_dump_pass = [](string_view) { return false; };
        should_dump_pipeline = [](string_view) { return false; };
      }
    }
  }

  // True when dumps should go to stdout rather than to files.
  bool dumping_to_stdout() const { return dump_to == "-"; }

  // Destination directory, "-" for stdout, or "" when dumping is disabled.
  std::string dump_to;
  // Per-name filters derived from the --xla_dump_hlo_*_re flags.
  std::function<bool(string_view module_name)> should_dump_module;
  std::function<bool(string_view pass_name)> should_dump_pass;
  std::function<bool(string_view pipeline_name)> should_dump_pipeline;

  // dump_ir isn't present here because this file is mostly concerned with
  // dumping HLO.
  bool dump_as_text;
  bool dump_as_proto;
  bool dump_as_dot;
  bool dump_as_html;
  bool dump_as_url;
  bool dump_fusion_visualization;
  bool dump_snapshots;
  bool dump_include_timestamp;
  int64_t dump_max_hlo_modules;
  bool dump_module_metadata;
  bool dump_compress_protos;
  bool dump_hlo_metadata;
  bool dump_as_long_text;
};
181
182 // Helper class to hold a list of functions that produces data to be written to
183 // a file in multiple stages, so that we can lower the peak memory usage.
184 // Ideally we should migrate this whole file to use an I/O stream style API.
class DataProducer {
 public:
  // Enqueues a producer; its string is emitted after all previously
  // appended producers.
  void Append(std::function<std::string()> produce_func) {
    pending_.push(std::move(produce_func));
  }

  // Dequeues and returns the next producer, or nullptr once exhausted.
  std::function<std::string()> Next() {
    if (pending_.empty()) {
      return nullptr;
    }
    std::function<std::string()> front = std::move(pending_.front());
    pending_.pop();
    return front;
  }

 private:
  // Producers in FIFO order; each is invoked exactly once by the consumer.
  std::queue<std::function<std::string()>> pending_;
};
203
// Writes the stages produced by `data_producer`, in order, to `fname`,
// optionally gzip-compressing the stream.  Consuming the producer stage by
// stage keeps peak memory below that of materializing one big string.
static Status WriteStringToFile(tensorflow::Env* env, const std::string& fname,
                                DataProducer& data_producer, bool compressed) {
  std::unique_ptr<tensorflow::WritableFile> file;
  TF_RETURN_IF_ERROR(env->NewWritableFile(fname, &file));
  if (compressed) {
    // Wrap the raw file in a gzip stream; `file` must outlive `gz_file`.
    auto gz_opts = tensorflow::io::ZlibCompressionOptions::GZIP();
    tensorflow::io::ZlibOutputBuffer gz_file(
        file.get(), gz_opts.input_buffer_size, gz_opts.output_buffer_size,
        gz_opts);
    TF_RETURN_IF_ERROR(gz_file.Init());
    while (auto next_producer = data_producer.Next()) {
      TF_RETURN_IF_ERROR(gz_file.Append(next_producer()));
    }
    return gz_file.Close();
  } else {
    while (auto next_producer = data_producer.Next()) {
      TF_RETURN_IF_ERROR(file->Append(next_producer()));
    }
    return file->Close();
  }
}
225
// Writes `data` to `fname`, gzip-compressing it when `compressed` is true.
// The uncompressed case delegates to tensorflow::WriteStringToFile.
static Status WriteStringToFile(tensorflow::Env* env, const std::string& fname,
                                absl::string_view data, bool compressed) {
  if (!compressed) {
    return tensorflow::WriteStringToFile(env, fname, data);
  }
  std::unique_ptr<tensorflow::WritableFile> file;
  TF_RETURN_IF_ERROR(env->NewWritableFile(fname, &file));
  auto gz_opts = tensorflow::io::ZlibCompressionOptions::GZIP();
  tensorflow::io::ZlibOutputBuffer gz_file(file.get(),
                                           gz_opts.input_buffer_size,
                                           gz_opts.output_buffer_size, gz_opts);
  TF_RETURN_IF_ERROR(gz_file.Init());
  TF_RETURN_IF_ERROR(gz_file.Append(data));
  return gz_file.Close();
}
241
GetDumpFilePath(string_view filename,const CanonicalDebugOptions & opts)242 static std::optional<std::string> GetDumpFilePath(
243 string_view filename, const CanonicalDebugOptions& opts) {
244 if (opts.dumping_to_stdout()) {
245 LOG(ERROR) << "Refusing to write " << filename
246 << " to stdout. Pass --xla_dump_to=<path> to write to a file.";
247 return std::nullopt;
248 }
249
250 if (opts.dump_to.empty()) {
251 return std::nullopt;
252 }
253
254 const std::string& dir = opts.dump_to;
255 VLOG(1) << "Dumping " << filename << " to " << dir;
256
257 tensorflow::Env* env = tensorflow::Env::Default();
258 // Two threads can race to observe the absence of the dump directory and
259 // simultaneously try to create it, causing the "losing" thread to get a
260 // "directory already exists" error. We can work around this by checking
261 // again whether the dir exists.
262 if (!env->IsDirectory(dir).ok()) {
263 auto status = env->RecursivelyCreateDir(dir);
264 if (!status.ok() && !env->IsDirectory(dir).ok()) {
265 LOG(ERROR) << "Could not create directory " << dir
266 << " for dumping XLA debug data: " << status;
267 return std::nullopt;
268 }
269 }
270
271 // Make sure we are not going to dump more modules than the user has asked.
272 if (opts.dump_max_hlo_modules > 0) {
273 std::vector<std::string> matches;
274 auto pattern = tensorflow::io::JoinPath(dir, "*module_*.*");
275 auto status = env->GetMatchingPaths(pattern, &matches);
276 if (!status.ok()) {
277 LOG(ERROR) << "Could not get matching paths for pattern " << pattern
278 << ": " << status;
279 }
280 static const LazyRE2 module_id_regex = {R"(.*module_(\d+)\..*)"};
281 absl::flat_hash_set<int64_t> dumped_module_ids;
282 for (const std::string& match : matches) {
283 int64_t dumped_module_id;
284 if (RE2::FullMatch(match, *module_id_regex, &dumped_module_id)) {
285 dumped_module_ids.insert(dumped_module_id);
286 }
287 }
288 if (dumped_module_ids.size() >= opts.dump_max_hlo_modules) {
289 int64_t module_id;
290 if (RE2::FullMatch(filename, *module_id_regex, &module_id) &&
291 !dumped_module_ids.contains(module_id)) {
292 LOG(ERROR) << "Have already dumped " << dumped_module_ids.size()
293 << " modules, more than the limit of "
294 << opts.dump_max_hlo_modules;
295 return std::nullopt;
296 }
297 }
298 }
299
300 return tensorflow::io::JoinPath(dir, SanitizeFileName(std::string(filename)));
301 }
302
DumpToFileInDirImpl(string_view filename,string_view contents,const CanonicalDebugOptions & opts,bool compress=false)303 static std::optional<std::string> DumpToFileInDirImpl(
304 string_view filename, string_view contents,
305 const CanonicalDebugOptions& opts, bool compress = false) {
306 auto file_path = GetDumpFilePath(filename, opts);
307 if (!file_path) return std::nullopt;
308
309 auto status = WriteStringToFile(tensorflow::Env::Default(), *file_path,
310 contents, compress);
311 if (!status.ok()) {
312 LOG(ERROR) << "Could not write XLA debug data to " << *file_path << ": "
313 << status;
314 return std::nullopt;
315 }
316
317 return file_path;
318 }
319
DumpToFileInDirImpl(string_view filename,DataProducer & data_producer,const CanonicalDebugOptions & opts,bool compress=false)320 static std::optional<std::string> DumpToFileInDirImpl(
321 string_view filename, DataProducer& data_producer,
322 const CanonicalDebugOptions& opts, bool compress = false) {
323 auto file_path = GetDumpFilePath(filename, opts);
324 if (!file_path) return std::nullopt;
325
326 auto status = WriteStringToFile(tensorflow::Env::Default(), *file_path,
327 data_producer, compress);
328 if (!status.ok()) {
329 LOG(ERROR) << "Could not write XLA debug data to " << *file_path << ": "
330 << status;
331 return std::nullopt;
332 }
333
334 return file_path;
335 }
336
DumpToFileInDirOrStdoutImpl(string_view filename,string_view contents,const CanonicalDebugOptions & opts)337 static std::optional<std::string> DumpToFileInDirOrStdoutImpl(
338 string_view filename, string_view contents,
339 const CanonicalDebugOptions& opts) {
340 // Dump to stdout if that's called for.
341 if (opts.dumping_to_stdout()) {
342 std::cout << "*** Begin " << filename << " ***\n"
343 << contents << "\n*** End " << filename << " ***" << std::endl;
344 return std::nullopt;
345 }
346
347 // Otherwise, dump to a file.
348 return DumpToFileInDirImpl(filename, contents, opts);
349 }
350
DumpToFileInDirOrStdoutImpl(string_view filename,DataProducer & data_producer,const CanonicalDebugOptions & opts)351 static std::optional<std::string> DumpToFileInDirOrStdoutImpl(
352 string_view filename, DataProducer& data_producer,
353 const CanonicalDebugOptions& opts) {
354 // Dump to stdout if that's called for.
355 if (opts.dumping_to_stdout()) {
356 std::cout << "*** Begin " << filename << " ***\n";
357 while (auto next_producer = data_producer.Next()) {
358 std::cout << next_producer();
359 }
360 std::cout << "\n*** End " << filename << " ***" << std::endl;
361 return std::nullopt;
362 }
363
364 // Otherwise, dump to a file.
365 return DumpToFileInDirImpl(filename, data_producer, opts);
366 }
367
368 // Returns whether the computation is trivial enough not to warrant dumping.
369 // Currently skips instructions where the root instruction has only parameters
370 // as operands AND is not a fusion.
IsTrivial(const HloComputation & computation)371 static bool IsTrivial(const HloComputation& computation) {
372 const HloInstruction* root = computation.root_instruction();
373 return absl::c_all_of(root->operands(),
374 [&](const HloInstruction* op) {
375 return op->opcode() == HloOpcode::kParameter;
376 }) &&
377 root->opcode() != HloOpcode::kFusion;
378 }
379
380 // Returns full file paths of all dumps of the module.
// Dumps `module` in every format enabled in `opts` (text, buffer-assignment
// text, proto, dot, html, fusion visualization, url), using `prefix`/`suffix`
// to build the filenames.  `buffer_assn` and `profile` may be null.  Returns
// the full paths of all files actually written.
static std::vector<std::string> DumpHloModuleImpl(
    const HloModule& module, const BufferAssignment* buffer_assn,
    const HloExecutionProfile* profile, string_view prefix, string_view suffix,
    const CanonicalDebugOptions& opts) {
  std::string filename = FilenameFor(module, prefix, suffix);

  // Each entry is nullopt when that particular dump went to stdout or failed.
  std::vector<std::optional<std::string>> file_paths;

  if (opts.dump_as_text) {
    // Long text uses the default (verbose) print options; otherwise use the
    // short parsable form.
    auto print_options = opts.dump_as_long_text
                             ? HloPrintOptions()
                             : HloPrintOptions::ShortParsable();
    print_options.set_print_large_constants(false);
    print_options.set_print_control_dependencies(true);
    print_options.set_print_operand_index_annotation_interval(5);
    print_options.set_print_backend_config(true);
    print_options.set_print_metadata(opts.dump_hlo_metadata);
    file_paths.push_back(DumpToFileInDirOrStdoutImpl(
        StrCat(filename, ".txt"), module.ToString(print_options), opts));
    if (buffer_assn) {
      // Stage the buffer-assignment dump through a DataProducer to avoid
      // materializing both (potentially large) strings at once.
      DataProducer data_producer;
      data_producer.Append([&] { return buffer_assn->ToString(); });
      data_producer.Append([&] { return "\n\n"; });
      data_producer.Append(
          [&] { return buffer_assn->hlo_live_range().ToString(); });
      file_paths.push_back(DumpToFileInDirOrStdoutImpl(
          StrCat(filename, "-buffer-assignment.txt"), data_producer, opts));
    }
  }

  if (opts.dump_as_proto) {
    HloProto module_proto =
        buffer_assn ? MakeHloProto(module, *buffer_assn) : MakeHloProto(module);
    std::string pb;
    if (!tensorflow::SerializeToStringDeterministic(module_proto, &pb)) {
      // Dump the error message in place of the proto so the failure is
      // visible in the output directory.
      pb = "Failed to serialize HLO module proto.";
    }
    file_paths.push_back(DumpToFileInDirImpl(
        StrCat(filename, opts.dump_compress_protos ? ".hlo.pb.gz" : ".hlo.pb"),
        pb, opts, opts.dump_compress_protos));
  }

  // Renders the entry computation in `format`, or returns an error string for
  // dumping if rendering failed.
  auto render_graph = [&](RenderedGraphFormat format) {
    StatusOr<std::string> rendered_graph = RenderGraph(
        *module.entry_computation(),
        /*label=*/filename, module.config().debug_options(), format, profile);
    if (rendered_graph.ok()) {
      return std::move(rendered_graph).ValueOrDie();
    }
    return StrFormat("Error rendering graph: %s",
                     rendered_graph.status().ToString());
  };

  if (opts.dump_as_dot) {
    file_paths.push_back(
        DumpToFileInDirImpl(StrFormat("%s.dot", filename),
                            render_graph(RenderedGraphFormat::kDot), opts));
  }

  if (opts.dump_as_html) {
    file_paths.push_back(
        DumpToFileInDirImpl(StrFormat("%s.html", filename),
                            render_graph(RenderedGraphFormat::kHtml), opts));
  }

  if (opts.dump_fusion_visualization) {
    // One fusion-explorer page per non-trivial, non-fusion computation.
    for (const HloComputation* computation :
         module.MakeNonfusionComputations()) {
      if (IsTrivial(*computation)) {
        VLOG(1) << "Skipping computation " << computation->name()
                << " as trivial";
        continue;
      }

      StatusOr<std::string> rendered_graph = WrapFusionExplorer(*computation);
      if (!rendered_graph.ok()) {
        VLOG(1) << "Skipping fusion visualization"
                << " for computation " << computation->name()
                << " due to: " << rendered_graph.status().ToString();
        continue;
      }
      file_paths.push_back(DumpToFileInDirImpl(
          FilenameFor(module, computation->name(), "_fusion.html"),
          *rendered_graph, opts));
    }
  }

  // Special case for rendering graphs as URLs.  We'll dump them to a file
  // because why not, but we always log them to stdout as well.
  if (opts.dump_as_url) {
    std::string url = render_graph(RenderedGraphFormat::kUrl);
    std::cout << filename << " --> " << url << std::endl;
    if (!opts.dumping_to_stdout()) {
      file_paths.push_back(
          DumpToFileInDirImpl(StrFormat("%s.url", filename), url, opts));
    }
  }

  // Keep only the dumps that actually produced a file.
  std::vector<std::string> dumped_file_paths;
  for (const std::optional<std::string>& path : file_paths) {
    if (path.has_value()) {
      dumped_file_paths.push_back(*path);
    }
  }
  return dumped_file_paths;
}
487
DumpHloModuleMetadata(const HloModuleMetadataProto & metadata,const CanonicalDebugOptions & opts,absl::flat_hash_set<int64_t> * dumped_module_ids)488 static void DumpHloModuleMetadata(
489 const HloModuleMetadataProto& metadata, const CanonicalDebugOptions& opts,
490 absl::flat_hash_set<int64_t>* dumped_module_ids) {
491 // Return if metadata for this module has already been dumped.
492 if (!dumped_module_ids->insert(metadata.canonical_module_id()).second) {
493 return;
494 }
495 std::string filename = absl::StrFormat("module_%04d.metadata.textproto",
496 metadata.canonical_module_id());
497 std::string content;
498 if (tensorflow::protobuf::TextFormat::PrintToString(metadata, &content)) {
499 DumpToFileInDirImpl(filename, content, opts);
500 } else {
501 LOG(ERROR) << "Failed to convert HloModuleMetadataProto to text.";
502 }
503 }
504
// Guards the step-number and timestamp maps below.
static absl::Mutex mu(absl::kConstInit);

// Maps a module's unique ID to a counter indicating how many times we've
// dumped this module during the compilation pipeline.  This lets us keep the
// filenames ordered nicely.
//
// Entries added here leak forever; we have no way to GC them when a module
// dies.  But we only add an entry if dumping is enabled for this module, and
// dumping a module leaks buffer space in stdout or bytes on disk *way* faster
// than this hashtable leaks memory.
static auto& module_id_to_step_number ABSL_GUARDED_BY(mu) =
    *new absl::flat_hash_map<int64_t, int64_t>();

// Maps a module's unique ID to a timestamp indicating when we've first dumped
// this module during the compilation pipeline and when we first started
// compiling this module.  This lets us keep the filenames ordered nicely.
//
// Entries added here leak forever; we have no way to GC them when a module
// dies.  But we only add an entry if dumping is enabled for this module, and
// dumping a module leaks buffer space in stdout or bytes on disk *way* faster
// than this hashtable leaks memory.
static auto& module_id_to_timestamp ABSL_GUARDED_BY(mu) =
    *new absl::flat_hash_map<int64_t, uint64_t>();

// Returns and post-increments the dump counter for `module`, so successive
// dumps of the same module get increasing step numbers.  Thread-safe.
int64_t StepNumberForModule(const HloModule& module) {
  absl::MutexLock lock(&mu);
  return module_id_to_step_number[module.unique_id()]++;
}
533
534 } // namespace
535
536 // Get a timestamp which we can use as a filename prefix specific to this
537 // module.
std::string TimestampFor(const HloModule& module) {
  // Timestamps in filenames are opt-in.
  if (!module.config().debug_options().xla_dump_include_timestamp()) {
    return "";
  }
  absl::MutexLock lock(&mu);
  // First caller for this module id records "now"; later callers reuse the
  // stored value so all of a module's dumps share one timestamp prefix.
  auto timestamp_emplace = module_id_to_timestamp.try_emplace(
      module.unique_id(), tensorflow::Env::Default()->NowMicros());
  return std::to_string(timestamp_emplace.first->second);
}
547
FilenameFor(int unique_id,string_view module_name,string_view prefix,string_view suffix)548 static std::string FilenameFor(int unique_id, string_view module_name,
549 string_view prefix, string_view suffix) {
550 std::string filename;
551 if (!prefix.empty()) {
552 absl::StrAppend(&filename, prefix, ".");
553 }
554 absl::StrAppendFormat(&filename, "module_%04d", unique_id);
555 if (!module_name.empty()) {
556 absl::StrAppend(&filename, ".", module_name);
557 }
558 absl::StrAppend(&filename, ".", suffix);
559 // Skip the module name if the resulting length is too long.
560 if (!module_name.empty() && filename.size() > 255) {
561 return FilenameFor(unique_id, "", prefix, suffix);
562 }
563 return filename;
564 }
565
// Convenience overload: builds the dump filename from the module's own
// unique id and name.
std::string FilenameFor(const HloModule& module, string_view prefix,
                        string_view suffix) {
  return FilenameFor(module.unique_id(), module.name(), prefix, suffix);
}
570
// Dumps `contents` into the module's dump directory under a name derived
// from the module and the given prefix/suffix.
void DumpToFileInDir(const HloModule& module, string_view file_prefix,
                     string_view file_suffix, string_view contents) {
  DumpToFileInDir(module.config().debug_options(),
                  FilenameFor(module, file_prefix, file_suffix), contents);
}
576
// Dumps `contents` under `filename` in the directory selected by
// `debug_options` (no stdout fallback — refuses "-" destinations).
void DumpToFileInDir(const DebugOptions& debug_options,
                     absl::string_view filename, absl::string_view contents) {
  DumpToFileInDirImpl(filename, contents, CanonicalDebugOptions(debug_options));
}
581
// Dumps `contents` to the module's dump directory, or to stdout when
// --xla_dump_to=-.
void DumpToFileInDirOrStdout(const HloModule& module, string_view file_prefix,
                             string_view file_suffix, string_view contents) {
  DumpToFileInDirOrStdoutImpl(
      FilenameFor(module, file_prefix, file_suffix), contents,
      CanonicalDebugOptions(module.config().debug_options()));
}
588
// As above, but for callers that have a module id/name rather than an
// HloModule instance.
void DumpToFileInDirOrStdout(const DebugOptions& debug_options, int unique_id,
                             string_view module_name, string_view file_prefix,
                             string_view file_suffix, string_view contents) {
  DumpToFileInDirOrStdoutImpl(
      FilenameFor(unique_id, module_name, file_prefix, file_suffix), contents,
      CanonicalDebugOptions(debug_options));
}
596
// Dumps the MLIR operation `op` to "<module>.<file_prefix>.mlir" in the dump
// directory, or via op->dump() when dumping to stdout was requested.
void DumpToFileInDirOrStdout(const HloModule& module, string_view file_prefix,
                             mlir::Operation* op) {
  CanonicalDebugOptions opts(module.config().debug_options());
  // op->dump() prints the op itself; both dump() and this function return
  // void, so the `return` just exits early.
  if (opts.dumping_to_stdout()) return op->dump();

  auto file_path =
      GetDumpFilePath(FilenameFor(module, file_prefix, "mlir"), opts);
  if (!file_path) return;

  std::string error;
  std::unique_ptr<llvm::ToolOutputFile> outputFile =
      mlir::openOutputFile(llvm::SmallString<32>(*file_path), &error);
  if (!outputFile) {
    LOG(ERROR) << "Error: " << error << std::endl
               << "Failed to open file: " << *file_path;
    return;
  }

  op->print(outputFile->os(), mlir::OpPrintingFlags().useLocalScope());
  // keep() prevents ToolOutputFile from deleting the file on destruction.
  outputFile->keep();
}
618
// Writes `proto` into the dump directory as "<filename>.txt" (textproto) when
// dump_as_text is set, else as "<filename>.pb" (binary).
//
// NOTE(review): unlike the HLO dump paths above, this does not special-case
// an empty dump_to or the "-" (stdout) destination; with those values
// RecursivelyCreateDir is attempted on ""/"-" — confirm that is intended.
void DumpProtobufToFile(const tensorflow::protobuf::Message& proto,
                        const DebugOptions& debug_options,
                        absl::string_view filename) {
  CanonicalDebugOptions opts(debug_options);
  tensorflow::Env* env = tensorflow::Env::Default();
  const std::string& dir = opts.dump_to;
  if (!env->IsDirectory(dir).ok()) {
    auto status = env->RecursivelyCreateDir(dir);
    if (!status.ok()) {
      LOG(ERROR) << "Could not create directory " << dir
                 << " for dumping XLA execution options: " << status;
      return;
    }
  }
  // Re-check: the directory may have been created above or by another thread.
  if (env->IsDirectory(dir).ok()) {
    const std::string path = tensorflow::io::JoinPath(dir, filename);
    Status status;
    if (opts.dump_as_text) {
      status =
          tensorflow::WriteTextProto(env, absl::StrCat(path, ".txt"), proto);
    } else {
      status =
          tensorflow::WriteBinaryProto(env, absl::StrCat(path, ".pb"), proto);
    }
    if (!status.ok()) {
      LOG(ERROR) << "Could not write XLA debug data to " << filename << ": "
                 << status;
    }
  }
}
649
// Dumps `proto` with a per-module filename: the module's id/name plus its
// dump timestamp, followed by `name`.
void DumpPerModuleProtobufToFile(const HloModule& module,
                                 const tensorflow::protobuf::Message& proto,
                                 const DebugOptions& debug_options,
                                 absl::string_view name) {
  const std::string filename = FilenameFor(module, TimestampFor(module), name);
  DumpProtobufToFile(proto, debug_options, filename);
}
657
// Dumps `module` in all enabled formats if its name passes the module filter.
void DumpHloModuleIfEnabled(const HloModule& module, string_view name) {
  CanonicalDebugOptions opts(module.config().debug_options());
  if (opts.should_dump_module(module.name())) {
    DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, /*profile=*/nullptr,
                      TimestampFor(module), name, opts);
  }
}
665
// As above, additionally dumping the buffer assignment alongside the module.
void DumpHloModuleIfEnabled(const HloModule& module,
                            const BufferAssignment& buffer_assn,
                            string_view name) {
  CanonicalDebugOptions opts(module.config().debug_options());
  if (opts.should_dump_module(module.name())) {
    DumpHloModuleImpl(module, &buffer_assn, /*profile=*/nullptr,
                      TimestampFor(module), name, opts);
  }
}
675
// As above, additionally passing an execution profile to the graph renderer.
void DumpHloModuleIfEnabled(const HloModule& module,
                            const HloExecutionProfile& profile,
                            string_view name) {
  CanonicalDebugOptions opts(module.config().debug_options());
  if (opts.should_dump_module(module.name())) {
    DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, &profile,
                      TimestampFor(module), name, opts);
  }
}
685
// Returns true if `opts` would dump a module with the given name.
bool DumpingEnabledForHloModule(string_view hlo_module_name,
                                const DebugOptions& opts) {
  return CanonicalDebugOptions(opts).should_dump_module(hlo_module_name);
}
690
// Returns true if `opts` resolves to dumping to stdout ("-").
bool DumpingToStdout(const DebugOptions& opts) {
  return CanonicalDebugOptions(opts).dumping_to_stdout();
}
694
DumpHloModuleBetweenPassesIfEnabled(string_view pipeline_name,string_view before_pass_name,string_view after_pass_name,const HloModule & module)695 std::vector<std::string> DumpHloModuleBetweenPassesIfEnabled(
696 string_view pipeline_name, string_view before_pass_name,
697 string_view after_pass_name, const HloModule& module) {
698 CanonicalDebugOptions opts(module.config().debug_options());
699 if (!opts.should_dump_module(module.name())) {
700 return {};
701 }
702
703 if (!opts.should_dump_pass(before_pass_name) &&
704 !opts.should_dump_pass(after_pass_name)) {
705 return {};
706 }
707
708 if (!opts.should_dump_pipeline(pipeline_name)) {
709 return {};
710 }
711
712 int64_t step_number = StepNumberForModule(module);
713 std::string timestamp = TimestampFor(module);
714
715 std::string filename_suffix =
716 StrFormat("%04d.%s.after_%s.before_%s", step_number, pipeline_name,
717 after_pass_name, before_pass_name);
718 return DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, /*profile=*/nullptr,
719 timestamp, filename_suffix, opts);
720 }
721
// Dumps `module` mid-pass (at step `step_name` of `pass_name`) when both the
// module filter and the pass filter match.
void DumpHloModuleDuringPassIfEnabled(string_view pass_name,
                                      string_view step_name,
                                      const HloModule& module) {
  CanonicalDebugOptions opts(module.config().debug_options());
  if (!opts.should_dump_module(module.name()) ||
      !opts.should_dump_pass(pass_name)) {
    return;
  }

  int64_t step_number = StepNumberForModule(module);
  std::string timestamp = TimestampFor(module);

  // Step number first so lexicographic filename order matches pipeline order.
  std::string filename_suffix =
      StrFormat("%04d.%s.%s", step_number, pass_name, step_name);
  DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, /*profile=*/nullptr,
                    timestamp, filename_suffix, opts);
}
739
// Dumps an execution snapshot of `module` as a binary proto, numbering
// successive executions of the same module so filenames stay ordered.
void DumpHloSnapshotIfEnabled(const HloModule& module,
                              const HloSnapshot& snapshot) {
  CanonicalDebugOptions opts(module.config().debug_options());
  if (!opts.should_dump_module(module.name()) || !opts.dump_snapshots) {
    return;
  }
  int64_t execution_count;
  uint64_t timestamp;
  {
    // Per-module execution counter; leaks like the other maps above, which is
    // acceptable for the same reason (dumping itself leaks output far faster).
    static auto& module_id_to_execution_count ABSL_GUARDED_BY(mu) =
        *new absl::flat_hash_map<int64_t, int64_t>();
    absl::MutexLock lock(&mu);
    execution_count = module_id_to_execution_count[module.unique_id()]++;
    auto timestamp_emplace = module_id_to_timestamp.try_emplace(
        module.unique_id(), tensorflow::Env::Default()->NowMicros());
    timestamp = timestamp_emplace.first->second;
  }
  std::string filename =
      StrCat(FilenameFor(module, std::to_string(timestamp),
                         StrFormat("execution_%04d", execution_count)),
             ".hlo_snapshot.pb");
  if (opts.dumping_to_stdout()) {
    LOG(ERROR) << "Refusing to write HLO snapshot proto for " << filename
               << " to stdout. Pass --xla_dump_to=<path> to write to a file.";
    return;
  }
  std::string pb;
  if (!tensorflow::SerializeToStringDeterministic(snapshot, &pb)) {
    LOG(ERROR) << "Failed to serialize HLO snapshot proto " << filename;
  }
  // NOTE(review): on serialization failure `pb` is left empty yet still
  // written below, producing an empty .pb file — confirm that is intended
  // (the module-proto path above writes an error message instead).
  DumpToFileInDirImpl(filename, pb, opts);
}
772
// As above, for callers without an HloModule instance; keyed by the snapshot's
// module name instead of a unique id.
void DumpHloSnapshotIfEnabled(const HloSnapshot& snapshot,
                              const DebugOptions& opts) {
  CanonicalDebugOptions canonical_opts(opts);
  std::string name = snapshot.hlo().hlo_module().name();
  if (!canonical_opts.should_dump_module(name) ||
      !canonical_opts.dump_snapshots) {
    return;
  }

  // We don't have a unique id for an HloSnapshot, so in this overload we just
  // have to use its name.
  int64_t execution_count;
  {
    static auto& module_name_to_execution_count ABSL_GUARDED_BY(mu) =
        *new absl::flat_hash_map<std::string, int64_t>();
    absl::MutexLock lock(&mu);
    execution_count = module_name_to_execution_count[name]++;
  }
  std::string filename = StrFormat("module_%s.execution_%04d.hlo_snapshot.pb",
                                   name, execution_count);
  if (canonical_opts.dumping_to_stdout()) {
    LOG(ERROR) << "Refusing to write HLO snapshot proto for " << filename
               << " to stdout. Pass --xla_dump_to=<path> to write to a file.";
    return;
  }
  std::string pb;
  if (!tensorflow::SerializeToStringDeterministic(snapshot, &pb)) {
    LOG(ERROR) << "Failed to serialize HLO snapshot proto " << filename;
  }
  // NOTE(review): as in the overload above, a failed serialization still
  // writes an empty file — confirm intended.
  DumpToFileInDirImpl(filename, pb, canonical_opts);
}
804
// Dumps compilation metadata for each module that has metadata dumping
// enabled, de-duplicating by canonical module id across all `modules` (a
// module and its prepartitioning ancestor can share an id).
void DumpHloModuleMetadataIfEnabled(const std::vector<HloModule*>& modules) {
  absl::flat_hash_set<int64_t> dumped_module_ids;
  for (const HloModule* module : modules) {
    CanonicalDebugOptions opts(module->config().debug_options());
    if (!opts.dump_module_metadata) {
      continue;
    }
    DumpHloModuleMetadata(module->metadata().proto(), opts, &dumped_module_ids);
    // Also dump the metadata recorded before SPMD partitioning, if any.
    const std::optional<HloModuleMetadataProto>& prepartitioning_metadata =
        module->metadata().prepartitioning_metadata();
    if (prepartitioning_metadata.has_value()) {
      DumpHloModuleMetadata(*prepartitioning_metadata, opts,
                            &dumped_module_ids);
    }
  }
}
821
822 } // namespace xla
823