/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"

#include <fstream>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <variant>
#include <vector>

#include "absl/base/call_once.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Linker/Linker.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
#include "tensorflow/compiler/xla/service/gpu/metrics.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_command_line_options.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_type_conversion_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"
#include "tensorflow/core/platform/random.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/util/env_var.h"

#if !defined(PLATFORM_GOOGLE) && TENSORFLOW_USE_ROCM
#include "rocm/rocm_config.h"
#endif

namespace xla {
namespace gpu {
namespace {

static llvm::codegen::RegisterCodeGenFlags CGF;

// Inline threshold value to use in LLVM AMDGPU backend.
const int kAMDGPUInlineThreshold = 0x100000;

// Default inline threshold value to use in llvm.
const int kDefaultInlineThreshold = 1100;

// Gets the GPU name as it's known to LLVM for a given compute
// capability. If we see an unrecognized compute capability, we
// return the highest one that is known and below the selected device.
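// For example, an unrecognized compute capability 8.9 falls back to sm_86,
// the newest entry in the list below that does not exceed it.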
static std::string GetSmName(se::CudaComputeCapability compute_capability) {
  int compute_capability_version =
      compute_capability.major * 10 + compute_capability.minor;
  int sm_version = 30;
  // If the current compute capability isn't known, fall back to the
  // most recent version before it.
  int supported_versions[] = {86, 80, 75, 72, 70, 62, 61, 60,
                              53, 52, 50, 37, 35, 32, 30};
  for (int v : supported_versions) {
    if (v <= compute_capability_version) {
      sm_version = v;
      break;
    }
  }

  // If the current CC isn't supported by LLVM and it is newer than
  // the max supported LLVM version, do not warn about it. The end
  // user can't do anything about this. E.g., PTX compiled for SM75 will
  // run on SM80 too.
  if (sm_version != compute_capability_version &&
      compute_capability_version < supported_versions[0]) {
    LOG(WARNING) << "Unknown compute capability "
                 << compute_capability.ToString()
                 << ". Defaulting to telling LLVM that we're compiling for sm_"
                 << sm_version;
  }
  return absl::StrCat("sm_", sm_version);
}

// Convenience function for producing a name of a temporary compilation product
// from the input filename.
std::string MakeNameForTempProduct(absl::string_view input_filename,
                                   absl::string_view extension) {
  return ReplaceFilenameExtension(tensorflow::io::Basename(input_filename),
                                  extension);
}

// Initializes LLVM passes. Uses the PassRegistry mechanism.
void InitializePasses(llvm::PassRegistry* pass_registry) {
  llvm::initializeCore(*pass_registry);
  llvm::initializeCodeGen(*pass_registry);
  llvm::initializeScalarOpts(*pass_registry);
  llvm::initializeObjCARCOpts(*pass_registry);
  llvm::initializeVectorization(*pass_registry);
  llvm::initializeIPO(*pass_registry);
  llvm::initializeAnalysis(*pass_registry);
  llvm::initializeTransformUtils(*pass_registry);
  llvm::initializeInstCombine(*pass_registry);
  llvm::initializeInstrumentation(*pass_registry);
  llvm::initializeTarget(*pass_registry);
  llvm::initializeCodeGenPreparePass(*pass_registry);
}

// Returns the TargetMachine, given a triple.
std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
    llvm::Triple triple, absl::string_view cpu_name,
    const HloModuleConfig& hlo_module_config, absl::string_view feature_str) {
  std::string error;
  const llvm::Target* target =
      llvm::TargetRegistry::lookupTarget("", triple, error);
  if (target == nullptr) {
    LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'"
               << " -- " << error;
    return nullptr;
  }

  llvm::TargetOptions target_options =
      llvm::codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple());

  // Set the verbose assembly options.
  target_options.MCOptions.AsmVerbose = false;

  // The selection of codegen optimization level is copied from function
  // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp.
  llvm::CodeGenOpt::Level codegen_opt_level;
  switch (hlo_module_config.debug_options().xla_backend_optimization_level()) {
    case 1:
      codegen_opt_level = llvm::CodeGenOpt::Less;
      break;
    case 2:
      codegen_opt_level = llvm::CodeGenOpt::Default;
      break;
    case 3:
      codegen_opt_level = llvm::CodeGenOpt::Aggressive;
      break;
    default:
      codegen_opt_level = llvm::CodeGenOpt::None;
  }
  return absl::WrapUnique(target->createTargetMachine(
      triple.str(), llvm_ir::AsStringRef(cpu_name),
      llvm_ir::AsStringRef(feature_str), target_options,
      llvm::codegen::getExplicitRelocModel(),
      llvm::codegen::getExplicitCodeModel(), codegen_opt_level));
}

// Adds the standard LLVM optimization passes, based on the speed optimization
// level (opt_level) and size optimization level (size_level). Both module
// and function-level passes are added, so two pass managers are passed in and
// modified by this function.
void AddOptimizationPasses(unsigned opt_level, unsigned size_level,
                           llvm::TargetMachine* target_machine,
                           llvm::legacy::PassManagerBase* module_passes,
                           llvm::legacy::FunctionPassManager* function_passes,
                           int inline_threshold) {
  llvm::PassManagerBuilder builder;
  builder.OptLevel = opt_level;
  builder.SizeLevel = size_level;

  if (opt_level > 1) {
    builder.Inliner = llvm::createFunctionInliningPass(inline_threshold);
  } else {
    // Only inline functions marked with "alwaysinline".
    builder.Inliner = llvm::createAlwaysInlinerLegacyPass();
  }

  builder.DisableUnrollLoops = opt_level == 0;
  builder.LoopVectorize = opt_level > 0;
  builder.SLPVectorize = opt_level > 1 && size_level < 2;

  // NVPTX's early-as-possible passes include NVVM reflect.
  target_machine->adjustPassManager(builder);

  builder.populateFunctionPassManager(*function_passes);
  builder.populateModulePassManager(*module_passes);
}

// Emits the given module to a bit code file.
void EmitBitcodeToFile(const llvm::Module& module, absl::string_view filename) {
  std::error_code error_code;
  llvm::ToolOutputFile outfile(std::string(filename).c_str(), error_code,
                               llvm::sys::fs::OF_None);
  if (error_code) {
    LOG(FATAL) << "opening bitcode file for writing: " << error_code.message();
  }

  llvm::WriteBitcodeToFile(module, outfile.os());
  outfile.keep();
}

// Emits the given module to PTX. target_machine is an initialized
// TargetMachine for the NVPTX target.
std::string EmitModuleToPTX(llvm::Module* module,
                            llvm::TargetMachine* target_machine) {
  std::string ptx;
  {
    llvm::raw_string_ostream stream(ptx);
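    // buffer_ostream wraps the string stream in the raw_pwrite_stream
    // interface that addPassesToEmitFile expects.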
    llvm::buffer_ostream pstream(stream);
    // The extension is stripped by IrDumpingPassManager, so we need to
    // get creative to add a suffix.
    IrDumpingPassManager codegen_passes(
        MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"),
        "", false);
    codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
        llvm::Triple(module->getTargetTriple())));

    target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr,
                                        llvm::CGFT_AssemblyFile);
    codegen_passes.run(*module);
  }

  return ptx;
}

// LLVM has an extensive flags mechanism of its own, which is only accessible
// through the command line. Internal libraries within LLVM register parsers
// for flags, with no other way to configure them except to pass these flags.
// To do this programmatically, we invoke ParseCommandLineOptions manually with
// a "fake argv".
// Note: setting flags with this method is stateful, since flags are just
// static globals within LLVM libraries.
void FeedLLVMWithFlags(const std::vector<std::string>& cl_opts) {
  std::vector<const char*> fake_argv = {""};
  for (const std::string& cl_opt : cl_opts) {
    fake_argv.push_back(cl_opt.c_str());
  }
  llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
}

// Returns whether the module could use any device bitcode library functions.
bool CouldNeedDeviceBitcode(const llvm::Module& module) {
  for (const llvm::Function& function : module.functions()) {
    // The list of prefixes should be in sync with library functions used in
    // target_util.cc.
    if (!function.isIntrinsic() && function.isDeclaration() &&
        (function.getName().startswith("__nv_") ||
         function.getName().startswith("__ocml_") ||
         function.getName().startswith("__ockl_"))) {
      return true;
    }
  }
  return false;
}

// Links the module with a vector of paths to bitcode modules.
// The caller must guarantee that the paths exist.
Status LinkWithBitcodeVector(
    llvm::Module* module, const std::vector<std::string>& bitcode_path_vector) {
  llvm::Linker linker(*module);

  for (auto& bitcode_path : bitcode_path_vector) {
    if (!tensorflow::Env::Default()->FileExists(bitcode_path).ok()) {
      LOG(ERROR) << "bitcode module is required by this HLO module but was "
                    "not found at "
                 << bitcode_path;
      return xla::InternalError("bitcode module not found at %s", bitcode_path);
    }

    std::unique_ptr<llvm::Module> bitcode_module =
        LoadIRModule(bitcode_path, &module->getContext());
    // Ignore the data layout of the module we're importing. This avoids a
    // warning from the linker.
    bitcode_module->setDataLayout(module->getDataLayout());
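    // Link in only the symbols this module actually references, then
    // internalize everything that was not explicitly requested so that
    // later passes can strip unused library code.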
    if (linker.linkInModule(
            std::move(bitcode_module), llvm::Linker::Flags::LinkOnlyNeeded,
            [](llvm::Module& M, const llvm::StringSet<>& GVS) {
              internalizeModule(M, [&GVS](const llvm::GlobalValue& GV) {
                return !GV.hasName() || (GVS.count(GV.getName()) == 0);
              });
            })) {
      return xla::InternalError("Error linking bitcode module from %s",
                                bitcode_path);
    }
  }
  return OkStatus();
}

// Links libdevice into the given module if the module needs libdevice.
Status LinkLibdeviceIfNecessary(llvm::Module* module,
                                const std::string& libdevice_dir_path) {
  if (!CouldNeedDeviceBitcode(*module)) {
    return OkStatus();
  }

  // CUDA 9+ uses a single libdevice file for all devices, and we don't support
  // older CUDAs.
  std::string libdevice_path =
      tensorflow::io::JoinPath(libdevice_dir_path, "libdevice.10.bc");
  if (!tensorflow::Env::Default()->FileExists(libdevice_path).ok()) {
    LOG(WARNING)
        << "libdevice is required by this HLO module but was not found at "
        << libdevice_path;
    return xla::InternalError("libdevice not found at %s", libdevice_path);
  }

  VLOG(1) << "Linking with libdevice from: " << libdevice_path;
  return LinkWithBitcodeVector(module, {libdevice_path});
}

Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version,
                               const HloModuleConfig& hlo_module_config,
                               const std::string& device_bitcode_dir_path) {
  // Link the input module with libdevice, to pull in implementations of some
  // builtins.
  TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(module, device_bitcode_dir_path));

  // Set the flush-denormals-to-zero flag on the module so the NVVM reflect
  // pass can access it.
  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
                        hlo_module_config.debug_options().xla_gpu_ftz());

  // If ftz is enabled, set it as an attribute on every function in the module.
  if (hlo_module_config.debug_options().xla_gpu_ftz()) {
    for (llvm::Function& fn : *module) {
      fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
    }
  }

  return OkStatus();
}

std::unique_ptr<llvm::TargetMachine> NVPTXGetTargetMachine(
    llvm::Triple target_triple, se::CudaComputeCapability compute_capability,
    const HloModuleConfig& hlo_module_config) {
  // Figure out the exact name of the processor as known to the NVPTX backend
  // from the gpu_architecture flag.
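  // The "+ptx60" feature pins the PTX ISA version the backend emits to 6.0.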
  return GetTargetMachine(target_triple, GetSmName(compute_capability),
                          hlo_module_config, "+ptx60");
}

using TargetModuleLinker = std::function<Status(
    llvm::Module*, GpuVersion, const HloModuleConfig&, const std::string&)>;

Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
                             const HloModuleConfig& hlo_module_config,
                             const std::string& device_bitcode_dir_path,
                             TargetModuleLinker module_linker,
                             llvm::Triple default_target_triple,
                             llvm::TargetMachine* target_machine,
                             int inline_threshold) {
  TF_RETURN_IF_ERROR(module_linker(module, gpu_version, hlo_module_config,
                                   device_bitcode_dir_path));

  bool dump_ir = hlo_module_config.debug_options().xla_gpu_dump_llvmir();
  std::string outputs_dir;
  tensorflow::io::GetTestUndeclaredOutputsDir(&outputs_dir);
  IrDumpingPassManager module_passes(module->getModuleIdentifier(), outputs_dir,
                                     dump_ir);

  // Add an appropriate TargetLibraryInfo pass for the module's triple.
  llvm::TargetLibraryInfoWrapperPass* tliwp =
      new llvm::TargetLibraryInfoWrapperPass(
          llvm::Triple(module->getTargetTriple()));
  module_passes.add(tliwp);

  // Try to fetch the target triple from the module. If not present, set a
  // default target triple.
  llvm::Triple target_triple = llvm::Triple(module->getTargetTriple());
  if (target_triple.getArch() == llvm::Triple::UnknownArch) {
    LOG(WARNING) << "target triple not found in the module";
    target_triple = default_target_triple;
  }

  module_passes.add(llvm::createTargetTransformInfoWrapperPass(
      target_machine->getTargetIRAnalysis()));

  // The LLVM IR verifier performs sanity checking on the IR. This helps
  // discover problems and report them in a meaningful manner, rather than let
  // later passes report obscure assertions because of unfulfilled invariants.
  module_passes.add(llvm::createVerifierPass());

  // Create the function-level pass manager. It needs data layout information
  // too.
  llvm::legacy::FunctionPassManager function_passes(module);

  int32_t opt_level =
      hlo_module_config.debug_options().xla_backend_optimization_level();

  if (opt_level < 2) {
    LOG(ERROR) << std::string(80, '*');
    LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
                  "generation but ";
    LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level
               << "!";
    LOG(ERROR) << "(Supported configuration is "
                  "--xla_backend_optimization_level >= 2.)";
    LOG(ERROR) << std::string(80, '*');
  }

  // Add optimization passes, and set inliner threshold.
  AddOptimizationPasses(opt_level,
                        /*size_level=*/0, target_machine, &module_passes,
                        &function_passes, inline_threshold);

  // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
  // again after the standard optimization passes [http://b/13329423].
  // TODO(jingyue): SROA may further expose more optimization opportunities
  // such as more precise alias analysis and more function inlining (SROA may
  // change the inlining cost of a function). For now, running SROA already
  // emits good enough code for the evaluated benchmarks. We may want to run
  // more optimizations later.
  if (opt_level > 0) {
    // LLVM's optimizer turns on SROA when the optimization level is greater
    // than 0. We mimic this behavior here.
    module_passes.add(llvm::createSROAPass());
  }

  // Verify that the module is well formed after optimizations ran.
  module_passes.add(llvm::createVerifierPass());

  // Done populating the pass managers. Now run them.

  function_passes.doInitialization();
  for (auto func = module->begin(); func != module->end(); ++func) {
    function_passes.run(*func);
  }
  function_passes.doFinalization();
  module_passes.run(*module);

  return OkStatus();
}

// One-time module initializer.
// Must be called only once -- DO NOT CALL DIRECTLY.
void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) {
  // Feed all customized flags here, so we can override them with llvm_cl_opts
  // without redeploying the compiler during development.

  // This flag tunes a threshold in branch folding. The default threshold,
  // which is one, is not suitable for CUDA programs where branches are more
  // expensive than for CPU programs. Setting the threshold to 2 improves the
  // latency of TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not
  // affect the latency of other benchmarks so far.
  //
  // I also tried setting this threshold to other values:
  // * 3-6 gives similar results as 2;
  // * >6 starts hurting the performance of at least dot product kernels.
  //
  // TODO(jingyue): The current threshold only considers the number of IR
  // instructions, which does not accurately reflect the true cost. We need a
  // better cost model.
  FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
  // Increase limit when scanning memory dependencies. This helps to reduce
  // more redundant load instructions.
  //
  // The specific value is currently large enough for s3d in shoc benchmark,
  // which contains a lot of load instructions and many arithmetic instructions
  // between those loads.
  FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});

  // Use div.full -- it matters for some float-division heavy benchmarks.
  // Using div.approx produces incorrect result for float32(max)/float32(max).
  FeedLLVMWithFlags({"-nvptx-prec-divf32=1"});

  llvm_ir::InitializeLLVMCommandLineOptions(
      hlo_module_config.debug_options().xla_backend_extra_options());

  // Initialize the NVPTX target; it's the only target we link with, so call
  // its specific initialization functions instead of the catch-all
  // InitializeAll*.
  LLVMInitializeNVPTXTarget();
  LLVMInitializeNVPTXTargetInfo();
  LLVMInitializeNVPTXTargetMC();
  LLVMInitializeNVPTXAsmPrinter();

  // Initialize the LLVM optimization passes.
  llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
  InitializePasses(registry);
}

}  // namespace

namespace nvptx {

StatusOr<std::string> CompileToPtx(
    llvm::Module* module, GpuVersion gpu_version,
    const HloModuleConfig& hlo_module_config,
    const std::string& libdevice_dir_path,
    std::function<void(llvm::TargetMachine*)> configure_target) {
  static absl::once_flag backend_init_flag;
  absl::call_once(backend_init_flag, NVPTXBackendInit, hlo_module_config);

  std::string ptx;
  {
    tensorflow::profiler::TraceMe activity(
        [&] { return absl::StrCat("Compiling IR:", module->getName().str()); },
        tensorflow::profiler::TraceMeLevel::kInfo);
    XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());

    // If the module has no functions or globals, there's nothing to compile.
    // Just return an empty string.
    if (module->empty() && module->global_empty()) {
      VLOG(2) << "Module '" << module->getName().str()
              << "' is empty. Skipping compilation.";
      return std::string();
    }

    auto compute_capability =
        std::get_if<se::CudaComputeCapability>(&gpu_version);
    if (!compute_capability) {
      return xla::InternalError(
          "Incompatible compute capability was specified.");
    }

    llvm::Triple default_target_triple("nvptx64-unknown-unknown");
    // Construct LLVM TargetMachine for NVPTX.
    std::unique_ptr<llvm::TargetMachine> target_machine = NVPTXGetTargetMachine(
        default_target_triple, *compute_capability, hlo_module_config);

    // Apply target machine configuration from call-back if available.
    if (configure_target) {
      configure_target(target_machine.get());
    }

    uint64_t start_usecs = tensorflow::Env::Default()->NowMicros();

    // Link with libdevice, and optimize the LLVM module.
    TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
        module, gpu_version, hlo_module_config, libdevice_dir_path,
        NVPTXTargetModuleLinker, default_target_triple, target_machine.get(),
        kDefaultInlineThreshold));

    uint64_t end_usecs = tensorflow::Env::Default()->NowMicros();
    RecordLlvmPassesDuration(end_usecs - start_usecs);

    start_usecs = tensorflow::Env::Default()->NowMicros();

    // Lower optimized LLVM module to PTX.
    ptx = EmitModuleToPTX(module, target_machine.get());

    end_usecs = tensorflow::Env::Default()->NowMicros();
    RecordLlvmToPtxDuration(end_usecs - start_usecs);
  }
  return ptx;
}

}  // namespace nvptx

namespace {

// Gets the ROCm-Device-Libs filenames for a particular AMDGPU version.
std::vector<std::string> GetROCDLPaths(std::string gcn_arch_name,
                                       const std::string& rocdl_dir_path) {
  // AMDGPU version-neutral bitcodes.
  static std::vector<std::string>* rocdl_filenames =
      new std::vector<std::string>(
          {"opencl.bc", "ocml.bc", "ockl.bc", "oclc_finite_only_off.bc",
           "oclc_daz_opt_off.bc", "oclc_correctly_rounded_sqrt_on.bc",
           "oclc_unsafe_math_off.bc", "oclc_wavefrontsize64_on.bc"});

  // Construct full path to ROCDL bitcode libraries.
  std::vector<std::string> result;
  result.reserve(rocdl_filenames->size() + 1);
  for (auto& filename : *rocdl_filenames) {
    result.push_back(tensorflow::io::JoinPath(rocdl_dir_path, filename));
  }

  // Add AMDGPU version-specific bitcodes.
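  // gcn_arch_name looks like "gfx908:sramecc+:xnack-"; the numeric part of
  // the leading "gfxNNN" token selects the oclc_isa_version bitcode.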
  std::vector<std::string> tokens = absl::StrSplit(gcn_arch_name, ':');
  std::string amdgpu_version = gcn_arch_name;
  if (!tokens.empty() && tokens[0].size() >= 3) {
    amdgpu_version = tokens[0].substr(3);
  }
  result.push_back(tensorflow::io::JoinPath(
      rocdl_dir_path,
      absl::StrCat("oclc_isa_version_", amdgpu_version, ".bc")));
  return result;
}

struct HsacoCacheEntry {
  uint64_t hash;
  std::string ir;
  std::string gfx;
  std::vector<uint8_t> hsaco;
};

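// Process-wide cache from (hash of IR, gfx arch) to a previously compiled
// HSA code object, guarded by an internal mutex.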
struct HsacoCache {
 protected:
  std::vector<HsacoCacheEntry> cache;
  std::mutex m_mutex;
  int request_count = 0;
  int hit_count = 0;

 public:
  static bool Find(const std::string& ir, uint64_t& hash,
                   const std::string& gfx, std::vector<uint8_t>& hsaco);
  static void Add(const std::string& ir, uint64_t hash, const std::string& gfx,
                  const std::vector<uint8_t>& hsaco);
};

static HsacoCache g_hsacoCache;

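// Computes the hash of `ir` (returned through `hash` so that Add() can reuse
// it) and looks up a previously compiled code object for the same IR and gfx.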
bool HsacoCache::Find(const std::string& ir, uint64_t& hash,
                      const std::string& gfx, std::vector<uint8_t>& hsaco) {
  std::lock_guard<std::mutex> lg(g_hsacoCache.m_mutex);
  hash = std::hash<std::string>{}(ir);
  bool hit = false;
  for (auto& x : g_hsacoCache.cache) {
    if (x.hash != hash) continue;
    if (x.gfx != gfx) continue;
    if (x.ir != ir) continue;
    hsaco = x.hsaco;
    hit = true;
    break;
  }
  g_hsacoCache.request_count++;
  if (hit) g_hsacoCache.hit_count++;
  if (!(g_hsacoCache.request_count % 50))
    VLOG(1) << "HSACO cache: " << g_hsacoCache.request_count << " requests, "
            << g_hsacoCache.hit_count << " hits";
  return hit;
}

void HsacoCache::Add(const std::string& ir, uint64_t hash,
                     const std::string& gfx,
                     const std::vector<uint8_t>& hsaco) {
  std::lock_guard<std::mutex> lg(g_hsacoCache.m_mutex);
  g_hsacoCache.cache.resize(g_hsacoCache.cache.size() + 1);
  g_hsacoCache.cache.back().ir = ir;
  g_hsacoCache.cache.back().hash = hash;
  g_hsacoCache.cache.back().gfx = gfx;
  g_hsacoCache.cache.back().hsaco = hsaco;
}

// Emits the given module to HSA Code Object. target_machine is an initialized
// TargetMachine for the AMDGPU target.
StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
    llvm::Module* module, llvm::TargetMachine* target_machine) {
  auto* env = tensorflow::Env::Default();
  std::vector<std::string> tempdir_vector;
  env->GetLocalTempDirectories(&tempdir_vector);
  if (tempdir_vector.empty()) {
    return xla::InternalError(
        "Unable to locate a temporary directory for compile-time artifacts.");
  }
  std::string tempdir_name = tempdir_vector.front();
  VLOG(1) << "Compile-time artifacts located at: " << tempdir_name;

  bool keep_tempfiles = false;
  TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_ROCM_KEEP_XLA_TEMPFILES",
                                             /*default_val=*/false,
                                             &keep_tempfiles));
  // Prepare filenames for all stages of compilation:
  // IR, binary ISA, and HSACO.
  std::string random_number = std::to_string(tensorflow::random::New64());
  std::string ir_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + ".ll");
  std::string ir_path = tensorflow::io::JoinPath(tempdir_name, ir_filename);

  std::string ir_opt_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + "_opt.ll");
  std::string ir_opt_path =
      tensorflow::io::JoinPath(tempdir_name, ir_opt_filename);

  std::string isabin_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + ".o");
  std::string isabin_path =
      tensorflow::io::JoinPath(tempdir_name, isabin_filename);

  std::string hsaco_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + ".hsaco");
  std::string hsaco_path =
      tensorflow::io::JoinPath(tempdir_name, hsaco_filename);

  std::error_code ec;

  // Dump LLVM IR.
  std::unique_ptr<llvm::raw_fd_ostream> ir_fs(
      new llvm::raw_fd_ostream(ir_path, ec, llvm::sys::fs::OF_None));
  module->print(*ir_fs, nullptr);
  ir_fs->flush();

  // Emit GCN ISA binary.
  // The extension is stripped by IrDumpingPassManager, so we need to
  // get creative to add a suffix.
  std::string module_id = module->getModuleIdentifier();
  IrDumpingPassManager codegen_passes(
      ReplaceFilenameExtension(tensorflow::io::Basename(module_id),
                               random_number + "-amdgpu.dummy"),
      "", false);
  codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
      llvm::Triple(module->getTargetTriple())));
  llvm::SmallVector<char, 0> stream;
  llvm::raw_svector_ostream pstream(stream);
  std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
      new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
  module->setDataLayout(target_machine->createDataLayout());
  target_machine->addPassesToEmitFile(codegen_passes, *isabin_fs, nullptr,
                                      llvm::CGFT_ObjectFile);
  codegen_passes.run(*module);
  isabin_fs->flush();

  if (keep_tempfiles) {
    std::unique_ptr<llvm::raw_fd_ostream> ir_fs(
        new llvm::raw_fd_ostream(ir_opt_path, ec, llvm::sys::fs::OF_None));
    module->print(*ir_fs, nullptr);
    ir_fs->flush();
  }
  // Locate lld.
  // TODO([email protected]): change to tensorflow::ROCmRoot() after
  // ROCm-Device-Libs PR.
  std::string lld_path = tensorflow::io::JoinPath("/opt/rocm", "llvm/bin");
  auto lld_program = llvm::sys::findProgramByName("ld.lld", {lld_path});
  if (!lld_program) {
    return xla::InternalError("unable to find ld.lld in PATH: %s",
                              lld_program.getError().message());
  }
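  // ld.lld links the GCN ISA object into a shared object; the resulting file
  // is the HSA code object (hsaco) that the ROCm runtime loads.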
  std::vector<llvm::StringRef> lld_args{
      llvm_ir::AsStringRef("ld.lld"),    llvm_ir::AsStringRef("-flavor"),
      llvm_ir::AsStringRef("gnu"),       llvm_ir::AsStringRef("-shared"),
      llvm_ir::AsStringRef(isabin_path), llvm_ir::AsStringRef("-o"),
      llvm_ir::AsStringRef(hsaco_path),
  };

  std::string error_message;
  int lld_result =
      llvm::sys::ExecuteAndWait(*lld_program, llvm_ir::AsArrayRef(lld_args),
                                llvm::None, {}, 0, 0, &error_message);
  if (lld_result) {
    return xla::InternalError("ld.lld execution failed: %s, error code %d",
                              error_message, lld_result);
  }

  // Read HSACO.
  std::ifstream hsaco_file(hsaco_path, std::ios::binary | std::ios::ate);
  std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();

  std::vector<uint8_t> hsaco(hsaco_file_size);
  hsaco_file.seekg(0, std::ios::beg);
  hsaco_file.read(reinterpret_cast<char*>(&hsaco[0]), hsaco_file_size);
  hsaco_file.close();
  if (!keep_tempfiles) {
    remove(ir_path.c_str());
    remove(isabin_path.c_str());
    remove(hsaco_path.c_str());
  }
  return hsaco;
}

// Links ROCm-Device-Libs into the given module if the module needs it.
Status LinkROCDLIfNecessary(llvm::Module* module, std::string gcn_arch_name,
                            const std::string& rocdl_dir_path) {
  if (!CouldNeedDeviceBitcode(*module)) {
    return OkStatus();
  }

  return LinkWithBitcodeVector(module,
                               GetROCDLPaths(gcn_arch_name, rocdl_dir_path));
}

Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version,
                                const HloModuleConfig& hlo_module_config,
                                const std::string& device_bitcode_dir_path) {
  // Link the input module with ROCDL.

  auto compute_capability =
      std::get_if<se::RocmComputeCapability>(&gpu_version);
  if (!compute_capability) {
    return xla::InternalError("Incompatible compute capability was specified.");
  }

  std::string gcn_arch_name = compute_capability->gcn_arch_name();
  TF_RETURN_IF_ERROR(
      LinkROCDLIfNecessary(module, gcn_arch_name, device_bitcode_dir_path));

  // If ftz is enabled, set it as an attribute on every function in the module.
  if (hlo_module_config.debug_options().xla_gpu_ftz()) {
    for (llvm::Function& fn : *module) {
      fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
    }
  }

  return OkStatus();
}

// The following routine takes a feature token extracted from the
// hipDeviceProp_t::gcnArchName string and maps it to a valid feature_str
// to be used for creating the AMDGPUTarget.
// This mapping is currently in a state of flux because TF XLA uses its
// own copy of LLVM, which is different from the LLVM version used by
// hipcc/runtime in the ROCm install. Ordinarily this is not a problem,
// but right now, the LLVM version used by hipcc/runtime has "targetID"
// related changes which have not yet been upstreamed (to the LLVM repo).
// When that upstreaming happens (and the TF LLVM pointer moves past the
// upstream commit), the following mapping will need to change.
std::string MapGCNArchNameTokenToFeatureStr(const std::string& token) {
  if (token == "sramecc+") {
    return "+sramecc";
  } else if (token == "sramecc-") {
    return "-sramecc";
  } else if (token == "xnack+") {
    return "+xnack";
  } else if (token == "xnack-") {
    return "-xnack";
  }
  return "";
}

std::pair<std::string, std::string> GetFeatureStrFromGCNArchName(
    const std::string& gcn_arch_name) {
  std::string feature_str;

  std::string gfx = gcn_arch_name;
  // For ROCm versions 4.0 and greater, we need to specify the correct
  // feature str, based on the underlying GPU HW, to get max performance.
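  // e.g. "gfx908:sramecc+:xnack-" maps to ("gfx908", "+sramecc,-xnack").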
  std::vector<std::string> tokens = absl::StrSplit(gcn_arch_name, ':');
  std::vector<std::string> mapped_tokens;
  if (!tokens.empty()) gfx = tokens[0];
  for (auto it = tokens.begin(); it != tokens.end(); it++) {
    // Skip the first token, which is the gfxNNN str.
    // The rest of the tokens are the feature/targetid strings.
    if (it != tokens.begin()) {
      std::string token(*it);
      std::string mapped_token = MapGCNArchNameTokenToFeatureStr(token);
      mapped_tokens.push_back(mapped_token);
    }
  }
  feature_str = absl::StrJoin(mapped_tokens, ",");

  return std::make_pair(gfx, feature_str);
}

std::unique_ptr<llvm::TargetMachine> AMDGPUGetTargetMachine(
    llvm::Triple target_triple, GpuVersion gpu_version,
    const HloModuleConfig& hlo_module_config) {
  auto compute_capability =
      std::get_if<se::RocmComputeCapability>(&gpu_version);

  std::string gcn_arch_name = compute_capability->gcn_arch_name();
  auto arch = GetFeatureStrFromGCNArchName(gcn_arch_name);
  return GetTargetMachine(std::move(target_triple), arch.first,
                          hlo_module_config, arch.second);
}

void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) {
  llvm_ir::InitializeLLVMCommandLineOptions(
      hlo_module_config.debug_options().xla_backend_extra_options());

  // Initialize the AMDGPU target; it's the only target we link with, so call
  // its specific initialization functions instead of the catch-all
  // InitializeAll*.
#if TENSORFLOW_USE_ROCM
  LLVMInitializeAMDGPUTarget();
  LLVMInitializeAMDGPUTargetInfo();
  LLVMInitializeAMDGPUTargetMC();
  LLVMInitializeAMDGPUAsmPrinter();
#endif

  llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
  InitializePasses(registry);
}

}  // namespace

namespace amdgpu {
StatusOr<std::vector<uint8_t>> CompileToHsaco(
    llvm::Module* module, GpuVersion gpu_version,
    const HloModuleConfig& hlo_module_config,
    const std::string& rocdl_dir_path) {
  static absl::once_flag backend_init_flag;
  absl::call_once(backend_init_flag, AMDGPUBackendInit, hlo_module_config);

  std::vector<uint8_t> hsaco;
  std::string str;
  llvm::raw_string_ostream stream(str);
  stream << *module;
  // Delete the first two lines, since they usually vary even when the rest of
  // the code is the same (but verify that they are what we expect).
  if (str.size() >= 13 && str.substr(0, 13) == "; ModuleID = ") {
    auto pos = str.find('\n');
    if (pos != std::string::npos) str = str.substr(pos + 1);
  }
  if (str.size() >= 18 && str.substr(0, 18) == "source_filename = ") {
    auto pos = str.find('\n');
    if (pos != std::string::npos) str = str.substr(pos + 1);
  }
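  // Mix the config's compilation cache key into the cached string so that
  // modules with identical IR but different compilation options do not
  // collide in the HSACO cache.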
  str += hlo_module_config.compilation_cache_key();
  {
    tensorflow::profiler::TraceMe activity(
        [&] { return absl::StrCat("Compiling IR", module->getName().str()); },
        tensorflow::profiler::TraceMeLevel::kInfo);
    XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());

    auto compute_capability =
        std::get_if<se::RocmComputeCapability>(&gpu_version);
    if (!compute_capability) {
      return xla::InternalError(
          "Incompatible compute capability was specified.");
    }

    std::string gcn_arch_name = compute_capability->gcn_arch_name();

    uint64_t hash;
    if (HsacoCache::Find(str, hash, gcn_arch_name, hsaco)) {
      VLOG(1) << "HSACO cache hit";
      return hsaco;
    }
    VLOG(1) << "HSACO cache miss";
    bool dump_lls = false;
    if (dump_lls) {
      static int hsaco_count = 0;
      std::string name = "/tmp/" + std::to_string(hsaco_count) + ".ll";
      hsaco_count++;
      std::ofstream ofs(name);
      ofs << str;
      ofs.close();
    }

    llvm::Triple default_target_triple("amdgcn--amdhsa-amdgiz");
    // Construct LLVM TargetMachine for AMDGPU.
    std::unique_ptr<llvm::TargetMachine> target_machine =
        AMDGPUGetTargetMachine(default_target_triple, gpu_version,
                               hlo_module_config);

    // Link with ROCm-Device-Libs, and optimize the LLVM module.
    TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
        module, gpu_version, hlo_module_config, rocdl_dir_path,
        AMDGPUTargetModuleLinker, default_target_triple, target_machine.get(),
        kAMDGPUInlineThreshold));

    // Lower optimized LLVM module to HSA code object.
    TF_ASSIGN_OR_RETURN(hsaco, EmitModuleToHsaco(module, target_machine.get()));
    HsacoCache::Add(str, hash, gcn_arch_name, hsaco);
  }
  return hsaco;
}

}  // namespace amdgpu

}  // namespace gpu
}  // namespace xla