/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"

#include <cstdint>
#include <fstream>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <variant>
#include <vector>

#include "absl/base/call_once.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Linker/Linker.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
#include "tensorflow/compiler/xla/service/gpu/metrics.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_command_line_options.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_type_conversion_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"
#include "tensorflow/core/platform/random.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/util/env_var.h"

#if !defined(PLATFORM_GOOGLE) && TENSORFLOW_USE_ROCM
#include "rocm/rocm_config.h"
#endif

namespace xla {
namespace gpu {
namespace {

static llvm::codegen::RegisterCodeGenFlags CGF;

// Inline threshold value to use in the LLVM AMDGPU backend.
const int kAMDGPUInlineThreshold = 0x100000;
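// Note: this threshold (1 MiB) is several orders of magnitude above LLVM's
// default, so in practice it asks the inliner to inline almost every eligible
// call site; AMDGPU kernels are presumed to benefit from this aggressive
// flattening.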

// Default inline threshold value to use in LLVM.
const int kDefaultInlineThreshold = 1100;

// Gets the GPU name as it's known to LLVM for a given compute
// capability. If we see an unrecognized compute capability, we
// return the highest known capability below the selected device.
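// For example, CudaComputeCapability{8, 0} maps to "sm_80", while an unlisted
// capability such as {8, 9} falls back to the next-highest known entry,
// "sm_86".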
static std::string GetSmName(se::CudaComputeCapability compute_capability) {
  int compute_capability_version =
      compute_capability.major * 10 + compute_capability.minor;
  int sm_version = 30;
  // If the current compute capability isn't known, fall back to the
  // most recent version before it.
  int supported_versions[] = {86, 80, 75, 72, 70, 62, 61, 60,
                              53, 52, 50, 37, 35, 32, 30};
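  // Note: the list must stay sorted in descending order; the scan below picks
  // the first (i.e. highest) entry that does not exceed the device's version.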
  for (int v : supported_versions) {
    if (v <= compute_capability_version) {
      sm_version = v;
      break;
    }
  }

  // If the current CC isn't supported by LLVM and it is newer than
  // the newest version LLVM supports, do not warn about it. The end
  // user can't do anything about this. E.g., PTX compiled for SM75 will
  // run on SM80 too.
  if (sm_version != compute_capability_version &&
      compute_capability_version < supported_versions[0]) {
    LOG(WARNING) << "Unknown compute capability "
                 << compute_capability.ToString()
                 << ". Defaulting to telling LLVM that we're compiling for sm_"
                 << sm_version;
  }
  return absl::StrCat("sm_", sm_version);
}

// Convenience function for producing a name of a temporary compilation product
// from the input filename.
std::string MakeNameForTempProduct(absl::string_view input_filename,
                                   absl::string_view extension) {
  return ReplaceFilenameExtension(tensorflow::io::Basename(input_filename),
                                  extension);
}

// Initializes LLVM passes. Uses the PassRegistry mechanism.
void InitializePasses(llvm::PassRegistry* pass_registry) {
  llvm::initializeCore(*pass_registry);
  llvm::initializeCodeGen(*pass_registry);
  llvm::initializeScalarOpts(*pass_registry);
  llvm::initializeObjCARCOpts(*pass_registry);
  llvm::initializeVectorization(*pass_registry);
  llvm::initializeIPO(*pass_registry);
  llvm::initializeAnalysis(*pass_registry);
  llvm::initializeTransformUtils(*pass_registry);
  llvm::initializeInstCombine(*pass_registry);
  llvm::initializeInstrumentation(*pass_registry);
  llvm::initializeTarget(*pass_registry);
  llvm::initializeCodeGenPreparePass(*pass_registry);
}

// Returns the TargetMachine, given a triple.
std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
    llvm::Triple triple, absl::string_view cpu_name,
    const HloModuleConfig& hlo_module_config, absl::string_view feature_str) {
  std::string error;
  const llvm::Target* target =
      llvm::TargetRegistry::lookupTarget("", triple, error);
  if (target == nullptr) {
    LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'"
               << " -- " << error;
    return nullptr;
  }

  llvm::TargetOptions target_options =
      llvm::codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple());

  // Set the verbose assembly options.
  target_options.MCOptions.AsmVerbose = false;

  // The selection of codegen optimization level is copied from function
  // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp.
  llvm::CodeGenOpt::Level codegen_opt_level;
  switch (hlo_module_config.debug_options().xla_backend_optimization_level()) {
    case 1:
      codegen_opt_level = llvm::CodeGenOpt::Less;
      break;
    case 2:
      codegen_opt_level = llvm::CodeGenOpt::Default;
      break;
    case 3:
      codegen_opt_level = llvm::CodeGenOpt::Aggressive;
      break;
    default:
      codegen_opt_level = llvm::CodeGenOpt::None;
  }
  return absl::WrapUnique(target->createTargetMachine(
      triple.str(), llvm_ir::AsStringRef(cpu_name),
      llvm_ir::AsStringRef(feature_str), target_options,
      llvm::codegen::getExplicitRelocModel(),
      llvm::codegen::getExplicitCodeModel(), codegen_opt_level));
}

// Adds the standard LLVM optimization passes, based on the speed optimization
// level (opt_level) and size optimization level (size_level). Both module
// and function-level passes are added, so two pass managers are passed in and
// modified by this function.
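// Within this file, inline_threshold is kDefaultInlineThreshold when
// compiling for NVPTX (see CompileToPtx) and kAMDGPUInlineThreshold when
// compiling for AMDGPU (see CompileToHsaco).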
void AddOptimizationPasses(unsigned opt_level, unsigned size_level,
                           llvm::TargetMachine* target_machine,
                           llvm::legacy::PassManagerBase* module_passes,
                           llvm::legacy::FunctionPassManager* function_passes,
                           int inline_threshold) {
  llvm::PassManagerBuilder builder;
  builder.OptLevel = opt_level;
  builder.SizeLevel = size_level;

  if (opt_level > 1) {
    builder.Inliner = llvm::createFunctionInliningPass(inline_threshold);
  } else {
    // Only inline functions marked with "alwaysinline".
    builder.Inliner = llvm::createAlwaysInlinerLegacyPass();
  }

  builder.DisableUnrollLoops = opt_level == 0;
  builder.LoopVectorize = opt_level > 0;
  builder.SLPVectorize = opt_level > 1 && size_level < 2;

  // NVPTX's early-as-possible passes include NVVM reflect.
  target_machine->adjustPassManager(builder);

  builder.populateFunctionPassManager(*function_passes);
  builder.populateModulePassManager(*module_passes);
}

// Emits the given module to a bitcode file.
void EmitBitcodeToFile(const llvm::Module& module, absl::string_view filename) {
  std::error_code error_code;
  llvm::ToolOutputFile outfile(std::string(filename).c_str(), error_code,
                               llvm::sys::fs::OF_None);
  if (error_code) {
    LOG(FATAL) << "opening bitcode file for writing: " << error_code.message();
  }

  llvm::WriteBitcodeToFile(module, outfile.os());
  outfile.keep();
}

// Emits the given module to PTX. target_machine is an initialized TargetMachine
// for the NVPTX target.
std::string EmitModuleToPTX(llvm::Module* module,
                            llvm::TargetMachine* target_machine) {
  std::string ptx;
  {
    llvm::raw_string_ostream stream(ptx);
    llvm::buffer_ostream pstream(stream);
    // The extension is stripped by IrDumpingPassManager, so we need to
    // get creative to add a suffix.
    IrDumpingPassManager codegen_passes(
        MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"),
        "", false);
    codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
        llvm::Triple(module->getTargetTriple())));

    target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr,
                                        llvm::CGFT_AssemblyFile);
    codegen_passes.run(*module);
  }

  return ptx;
}

// LLVM has an extensive flags mechanism of its own, which is only accessible
// through the command line. Internal libraries within LLVM register parsers
// for flags, and there is no way to configure them other than passing these
// flags. To do this programmatically, we invoke ParseCommandLineOptions
// manually with a "fake argv".
// Note: setting flags with this method is stateful, since flags are just
// static globals within LLVM libraries.
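// For example, FeedLLVMWithFlags({"-bonus-inst-threshold=2"}) has the same
// effect as passing -bonus-inst-threshold=2 on the command line of an LLVM
// tool such as opt.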
void FeedLLVMWithFlags(const std::vector<std::string>& cl_opts) {
  std::vector<const char*> fake_argv = {""};
  for (const std::string& cl_opt : cl_opts) {
    fake_argv.push_back(cl_opt.c_str());
  }
  llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
}

// Returns whether the module could use any device bitcode library functions.
bool CouldNeedDeviceBitcode(const llvm::Module& module) {
  for (const llvm::Function& function : module.functions()) {
    // The list of prefixes should be in sync with library functions used in
    // target_util.cc.
    if (!function.isIntrinsic() && function.isDeclaration() &&
        (function.getName().startswith("__nv_") ||
         function.getName().startswith("__ocml_") ||
         function.getName().startswith("__ockl_"))) {
      return true;
    }
  }
  return false;
}

// Links the module with a vector of paths to bitcode modules.
// The caller must guarantee that the paths exist.
Status LinkWithBitcodeVector(
    llvm::Module* module, const std::vector<std::string>& bitcode_path_vector) {
  llvm::Linker linker(*module);

  for (auto& bitcode_path : bitcode_path_vector) {
    if (!tensorflow::Env::Default()->FileExists(bitcode_path).ok()) {
      LOG(ERROR) << "bitcode module is required by this HLO module but was "
                    "not found at "
                 << bitcode_path;
      return xla::InternalError("bitcode module not found at %s", bitcode_path);
    }

    std::unique_ptr<llvm::Module> bitcode_module =
        LoadIRModule(bitcode_path, &module->getContext());
    // Ignore the data layout of the module we're importing. This avoids a
    // warning from the linker.
    bitcode_module->setDataLayout(module->getDataLayout());
    if (linker.linkInModule(
            std::move(bitcode_module), llvm::Linker::Flags::LinkOnlyNeeded,
            [](llvm::Module& M, const llvm::StringSet<>& GVS) {
              internalizeModule(M, [&GVS](const llvm::GlobalValue& GV) {
                return !GV.hasName() || (GVS.count(GV.getName()) == 0);
              });
            })) {
      return xla::InternalError("Error linking bitcode module from %s",
                                bitcode_path);
    }
  }
  return OkStatus();
}

// Links libdevice into the given module if the module needs libdevice.
Status LinkLibdeviceIfNecessary(llvm::Module* module,
                                const std::string& libdevice_dir_path) {
  if (!CouldNeedDeviceBitcode(*module)) {
    return OkStatus();
  }

  // CUDA 9+ uses a single libdevice file for all devices, and we don't support
  // older CUDAs.
  std::string libdevice_path =
      tensorflow::io::JoinPath(libdevice_dir_path, "libdevice.10.bc");
  if (!tensorflow::Env::Default()->FileExists(libdevice_path).ok()) {
    LOG(WARNING)
        << "libdevice is required by this HLO module but was not found at "
        << libdevice_path;
    return xla::InternalError("libdevice not found at %s", libdevice_path);
  }

  VLOG(1) << "Linking with libdevice from: " << libdevice_path;
  return LinkWithBitcodeVector(module, {libdevice_path});
}

Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version,
                               const HloModuleConfig& hlo_module_config,
                               const std::string& device_bitcode_dir_path) {
  // Link the input module with libdevice, to pull in implementations of some
  // builtins.
  TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(module, device_bitcode_dir_path));

  // Set the flush-denormals-to-zero flag on the module so the NVVM reflect
  // pass can access it.
  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
                        hlo_module_config.debug_options().xla_gpu_ftz());

  // If ftz is enabled, set it as an attribute on every function in the module.
  if (hlo_module_config.debug_options().xla_gpu_ftz()) {
    for (llvm::Function& fn : *module) {
      fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
    }
  }

  return OkStatus();
}

std::unique_ptr<llvm::TargetMachine> NVPTXGetTargetMachine(
    llvm::Triple target_triple, se::CudaComputeCapability compute_capability,
    const HloModuleConfig& hlo_module_config) {
  // Figure out the exact name of the processor as known to the NVPTX backend
  // from the gpu_architecture flag.
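  // The "+ptx60" feature string requests PTX ISA 6.0 output which, to our
  // knowledge, corresponds to the PTX version introduced with CUDA 9.0.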
  return GetTargetMachine(target_triple, GetSmName(compute_capability),
                          hlo_module_config, "+ptx60");
}

using TargetModuleLinker = std::function<Status(
    llvm::Module*, GpuVersion, const HloModuleConfig&, const std::string&)>;

Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
                             const HloModuleConfig& hlo_module_config,
                             const std::string& device_bitcode_dir_path,
                             TargetModuleLinker module_linker,
                             llvm::Triple default_target_triple,
                             llvm::TargetMachine* target_machine,
                             int inline_threshold) {
  TF_RETURN_IF_ERROR(module_linker(module, gpu_version, hlo_module_config,
                                   device_bitcode_dir_path));

  bool dump_ir = hlo_module_config.debug_options().xla_gpu_dump_llvmir();
  std::string outputs_dir;
  tensorflow::io::GetTestUndeclaredOutputsDir(&outputs_dir);
  IrDumpingPassManager module_passes(module->getModuleIdentifier(), outputs_dir,
                                     dump_ir);

  // Add an appropriate TargetLibraryInfo pass for the module's triple.
  llvm::TargetLibraryInfoWrapperPass* tliwp =
      new llvm::TargetLibraryInfoWrapperPass(
          llvm::Triple(module->getTargetTriple()));
  module_passes.add(tliwp);

  // Try to fetch the target triple from the module. If not present, set a
  // default target triple.
  llvm::Triple target_triple = llvm::Triple(module->getTargetTriple());
  if (target_triple.getArch() == llvm::Triple::UnknownArch) {
    LOG(WARNING) << "target triple not found in the module";
    target_triple = default_target_triple;
  }

  module_passes.add(llvm::createTargetTransformInfoWrapperPass(
      target_machine->getTargetIRAnalysis()));

  // The LLVM IR verifier performs sanity checks on the IR. This helps
  // discover problems and report them in a meaningful manner, rather than
  // letting later passes report obscure assertions because of unfulfilled
  // invariants.
  module_passes.add(llvm::createVerifierPass());

  // Create the function-level pass manager. It needs data layout information
  // too.
  llvm::legacy::FunctionPassManager function_passes(module);

  int32_t opt_level =
      hlo_module_config.debug_options().xla_backend_optimization_level();

  if (opt_level < 2) {
    LOG(ERROR) << std::string(80, '*');
    LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
                  "generation but ";
    LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level
               << "!";
    LOG(ERROR) << "(Supported configuration is "
                  "--xla_backend_optimization_level >= 2.)";
    LOG(ERROR) << std::string(80, '*');
  }

  // Add optimization passes, and set inliner threshold.
  AddOptimizationPasses(opt_level,
                        /*size_level=*/0, target_machine, &module_passes,
                        &function_passes, inline_threshold);

  // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
  // again after the standard optimization passes [http://b/13329423].
  // TODO(jingyue): SROA may further expose more optimization opportunities such
  // as more precise alias analysis and more function inlining (SROA may change
  // the inlining cost of a function). For now, running SROA already emits good
  // enough code for the evaluated benchmarks. We may want to run more
  // optimizations later.
  if (opt_level > 0) {
    // LLVM's optimizer turns on SROA when the optimization level is greater
    // than 0. We mimic this behavior here.
    module_passes.add(llvm::createSROAPass());
  }

  // Verify that the module is well formed after the optimizations have run.
  module_passes.add(llvm::createVerifierPass());

  // Done populating the pass managers. Now run them.

  function_passes.doInitialization();
  for (auto func = module->begin(); func != module->end(); ++func) {
    function_passes.run(*func);
  }
  function_passes.doFinalization();
  module_passes.run(*module);

  return OkStatus();
}

// One-time module initializer.
// Must be called only once -- DO NOT CALL DIRECTLY.
void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) {
  // Feed all customized flags here, so we can override them with llvm_cl_opts
  // without redeploying the compiler during development.

  // This flag tunes a threshold in branch folding. The default threshold, which
  // is one, is not suitable for CUDA programs where branches are more expensive
  // than for CPU programs. Setting the threshold to 2 improves the latency of
  // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the
  // latency of other benchmarks so far.
  //
  // I also tried setting this threshold to other values:
  // * 3-6 gives similar results as 2;
  // * >6 starts hurting the performance of at least dot product kernels.
  //
  // TODO(jingyue): The current threshold only considers the number of IR
  // instructions, which does not accurately reflect the true cost. We need a
  // better cost model.
  FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
  // Increase the limit used when scanning memory dependencies. This helps
  // eliminate more redundant load instructions.
  //
  // The specific value is currently large enough for s3d in the shoc
  // benchmark, which contains a lot of load instructions and many arithmetic
  // instructions between those loads.
  FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});

  // Use div.full -- it matters for some float-division heavy benchmarks.
  // Using div.approx produces an incorrect result for
  // float32(max)/float32(max).
  FeedLLVMWithFlags({"-nvptx-prec-divf32=1"});

  llvm_ir::InitializeLLVMCommandLineOptions(
      hlo_module_config.debug_options().xla_backend_extra_options());

  // Initialize the NVPTX target; it's the only target we link with, so call its
  // specific initialization functions instead of the catch-all InitializeAll*.
  LLVMInitializeNVPTXTarget();
  LLVMInitializeNVPTXTargetInfo();
  LLVMInitializeNVPTXTargetMC();
  LLVMInitializeNVPTXAsmPrinter();

  // Initialize the LLVM optimization passes.
  llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
  InitializePasses(registry);
}

}  // namespace

namespace nvptx {

StatusOr<std::string> CompileToPtx(
    llvm::Module* module, GpuVersion gpu_version,
    const HloModuleConfig& hlo_module_config,
    const std::string& libdevice_dir_path,
    std::function<void(llvm::TargetMachine*)> configure_target) {
  static absl::once_flag backend_init_flag;
  absl::call_once(backend_init_flag, NVPTXBackendInit, hlo_module_config);

  std::string ptx;
  {
    tensorflow::profiler::TraceMe activity(
        [&] { return absl::StrCat("Compiling IR:", module->getName().str()); },
        tensorflow::profiler::TraceMeLevel::kInfo);
    XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());

    // If the module has no functions or globals, there's nothing to compile.
    // Just return an empty string.
    if (module->empty() && module->global_empty()) {
      VLOG(2) << "Module '" << module->getName().str()
              << "' is empty. Skipping compilation.";
      return std::string();
    }

    auto compute_capability =
        std::get_if<se::CudaComputeCapability>(&gpu_version);
    if (!compute_capability) {
      return xla::InternalError(
          "Incompatible compute capability was specified.");
    }

    llvm::Triple default_target_triple("nvptx64-unknown-unknown");
    // Construct LLVM TargetMachine for NVPTX.
    std::unique_ptr<llvm::TargetMachine> target_machine = NVPTXGetTargetMachine(
        default_target_triple, *compute_capability, hlo_module_config);

    // Apply target machine configuration from the callback if available.
    if (configure_target) {
      configure_target(target_machine.get());
    }

    uint64_t start_usecs = tensorflow::Env::Default()->NowMicros();

    // Link with libdevice, and optimize the LLVM module.
    TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
        module, gpu_version, hlo_module_config, libdevice_dir_path,
        NVPTXTargetModuleLinker, default_target_triple, target_machine.get(),
        kDefaultInlineThreshold));

    uint64_t end_usecs = tensorflow::Env::Default()->NowMicros();
    RecordLlvmPassesDuration(end_usecs - start_usecs);

    start_usecs = tensorflow::Env::Default()->NowMicros();

    // Lower the optimized LLVM module to PTX.
    ptx = EmitModuleToPTX(module, target_machine.get());

    end_usecs = tensorflow::Env::Default()->NowMicros();
    RecordLlvmToPtxDuration(end_usecs - start_usecs);
  }
  return ptx;
}

}  // namespace nvptx

namespace {

// Gets the ROCm-Device-Libs filenames for a particular AMDGPU version.
std::vector<std::string> GetROCDLPaths(std::string gcn_arch_name,
                                       const std::string& rocdl_dir_path) {
  // AMDGPU version-neutral bitcodes.
  static std::vector<std::string>* rocdl_filenames =
      new std::vector<std::string>(
          {"opencl.bc", "ocml.bc", "ockl.bc", "oclc_finite_only_off.bc",
           "oclc_daz_opt_off.bc", "oclc_correctly_rounded_sqrt_on.bc",
           "oclc_unsafe_math_off.bc", "oclc_wavefrontsize64_on.bc"});

  // Construct full path to ROCDL bitcode libraries.
  std::vector<std::string> result;
  result.reserve(rocdl_filenames->size() + 1);
  for (auto& filename : *rocdl_filenames) {
    result.push_back(tensorflow::io::JoinPath(rocdl_dir_path, filename));
  }

  // Add AMDGPU version-specific bitcodes.
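  // For example, gcn_arch_name "gfx906:sramecc+:xnack-" tokenizes to
  // {"gfx906", "sramecc+", "xnack-"}; amdgpu_version becomes "906" and the
  // file oclc_isa_version_906.bc is appended.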
  std::vector<std::string> tokens = absl::StrSplit(gcn_arch_name, ':');
  std::string amdgpu_version = gcn_arch_name;
  if (!tokens.empty() && tokens[0].size() >= 3) {
    amdgpu_version = tokens[0].substr(3);
  }
  result.push_back(tensorflow::io::JoinPath(
      rocdl_dir_path,
      absl::StrCat("oclc_isa_version_", amdgpu_version, ".bc")));
  return result;
}

struct HsacoCacheEntry {
  uint64_t hash;
  std::string ir;
  std::string gfx;
  std::vector<uint8_t> hsaco;
};

struct HsacoCache {
 protected:
  std::vector<HsacoCacheEntry> cache;
  std::mutex m_mutex;
  int request_count = 0;
  int hit_count = 0;

 public:
  static bool Find(const std::string& ir, uint64_t& hash,
                   const std::string& gfx, std::vector<uint8_t>& hsaco);
  static void Add(const std::string& ir, uint64_t hash, const std::string& gfx,
                  const std::vector<uint8_t>& hsaco);
};

static HsacoCache g_hsacoCache;
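
// Both Find and Add below lock g_hsacoCache.m_mutex, so the cache can safely
// be shared by concurrent compilations.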
bool HsacoCache::Find(const std::string& ir, uint64_t& hash,
                      const std::string& gfx, std::vector<uint8_t>& hsaco) {
  std::lock_guard<std::mutex> lg(g_hsacoCache.m_mutex);
  hash = std::hash<std::string>{}(ir);
  bool hit = false;
  for (auto& x : g_hsacoCache.cache) {
    if (x.hash != hash) continue;
    if (x.gfx != gfx) continue;
    if (x.ir != ir) continue;
    hsaco = x.hsaco;
    hit = true;
    break;
  }
  g_hsacoCache.request_count++;
  if (hit) g_hsacoCache.hit_count++;
  if (!(g_hsacoCache.request_count % 50))
    VLOG(1) << "HSACO cache: " << g_hsacoCache.request_count << " requests, "
            << g_hsacoCache.hit_count << " hits";
  return hit;
}

void HsacoCache::Add(const std::string& ir, uint64_t hash,
                     const std::string& gfx,
                     const std::vector<uint8_t>& hsaco) {
  std::lock_guard<std::mutex> lg(g_hsacoCache.m_mutex);
  g_hsacoCache.cache.resize(g_hsacoCache.cache.size() + 1);
  g_hsacoCache.cache.back().ir = ir;
  g_hsacoCache.cache.back().hash = hash;
  g_hsacoCache.cache.back().gfx = gfx;
  g_hsacoCache.cache.back().hsaco = hsaco;
}

// Emits the given module to HSA Code Object. target_machine is an initialized
// TargetMachine for the AMDGPU target.
StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
    llvm::Module* module, llvm::TargetMachine* target_machine) {
  auto* env = tensorflow::Env::Default();
  std::vector<std::string> tempdir_vector;
  env->GetLocalTempDirectories(&tempdir_vector);
  if (tempdir_vector.empty()) {
    return xla::InternalError(
        "Unable to locate a temporary directory for compile-time artifacts.");
  }
  std::string tempdir_name = tempdir_vector.front();
  VLOG(1) << "Compile-time artifacts located at: " << tempdir_name;

  bool keep_tempfiles = false;
  TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_ROCM_KEEP_XLA_TEMPFILES",
                                             /*default_val=*/false,
                                             &keep_tempfiles));
  // Prepare filenames for all stages of compilation:
  // IR, binary ISA, and HSACO.
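  // A random suffix keeps concurrent compilations that share this temp
  // directory from clobbering each other's intermediate files.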
  std::string random_number = std::to_string(tensorflow::random::New64());
  std::string ir_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + ".ll");
  std::string ir_path = tensorflow::io::JoinPath(tempdir_name, ir_filename);

  std::string ir_opt_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + "_opt.ll");
  std::string ir_opt_path =
      tensorflow::io::JoinPath(tempdir_name, ir_opt_filename);

  std::string isabin_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + ".o");
  std::string isabin_path =
      tensorflow::io::JoinPath(tempdir_name, isabin_filename);

  std::string hsaco_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + ".hsaco");
  std::string hsaco_path =
      tensorflow::io::JoinPath(tempdir_name, hsaco_filename);

  std::error_code ec;

  // Dump LLVM IR.
  std::unique_ptr<llvm::raw_fd_ostream> ir_fs(
      new llvm::raw_fd_ostream(ir_path, ec, llvm::sys::fs::OF_None));
  module->print(*ir_fs, nullptr);
  ir_fs->flush();

  // Emit GCN ISA binary.
  // The extension is stripped by IrDumpingPassManager, so we need to
  // get creative to add a suffix.
  std::string module_id = module->getModuleIdentifier();
  IrDumpingPassManager codegen_passes(
      ReplaceFilenameExtension(tensorflow::io::Basename(module_id),
                               random_number + "-amdgpu.dummy"),
      "", false);
  codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
      llvm::Triple(module->getTargetTriple())));
  llvm::SmallVector<char, 0> stream;
  llvm::raw_svector_ostream pstream(stream);
  std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
      new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
  module->setDataLayout(target_machine->createDataLayout());
  target_machine->addPassesToEmitFile(codegen_passes, *isabin_fs, nullptr,
                                      llvm::CGFT_ObjectFile);
  codegen_passes.run(*module);
  isabin_fs->flush();

  if (keep_tempfiles) {
    std::unique_ptr<llvm::raw_fd_ostream> ir_fs(
        new llvm::raw_fd_ostream(ir_opt_path, ec, llvm::sys::fs::OF_None));
    module->print(*ir_fs, nullptr);
    ir_fs->flush();
  }
  // Locate lld.
  // TODO([email protected]): change to tensorflow::ROCmRoot() after
  // ROCm-Device-Libs PR.
  std::string lld_path = tensorflow::io::JoinPath("/opt/rocm", "llvm/bin");
  auto lld_program = llvm::sys::findProgramByName("ld.lld", {lld_path});
  if (!lld_program) {
    return xla::InternalError("unable to find ld.lld in PATH: %s",
                              lld_program.getError().message());
  }
  std::vector<llvm::StringRef> lld_args{
      llvm_ir::AsStringRef("ld.lld"),    llvm_ir::AsStringRef("-flavor"),
      llvm_ir::AsStringRef("gnu"),       llvm_ir::AsStringRef("-shared"),
      llvm_ir::AsStringRef(isabin_path), llvm_ir::AsStringRef("-o"),
      llvm_ir::AsStringRef(hsaco_path),
  };

  std::string error_message;
  int lld_result =
      llvm::sys::ExecuteAndWait(*lld_program, llvm_ir::AsArrayRef(lld_args),
                                llvm::None, {}, 0, 0, &error_message);
  if (lld_result) {
    return xla::InternalError("ld.lld execution failed: %s, error code %d",
                              error_message, lld_result);
  }

  // Read the HSACO.
  std::ifstream hsaco_file(hsaco_path, std::ios::binary | std::ios::ate);
  std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();

  std::vector<uint8_t> hsaco(hsaco_file_size);
  hsaco_file.seekg(0, std::ios::beg);
  hsaco_file.read(reinterpret_cast<char*>(&hsaco[0]), hsaco_file_size);
  hsaco_file.close();
  if (!keep_tempfiles) {
    remove(ir_path.c_str());
    remove(isabin_path.c_str());
    remove(hsaco_path.c_str());
  }
  return hsaco;
}

// Links ROCm-Device-Libs into the given module if the module needs it.
Status LinkROCDLIfNecessary(llvm::Module* module, std::string gcn_arch_name,
                            const std::string& rocdl_dir_path) {
  if (!CouldNeedDeviceBitcode(*module)) {
    return OkStatus();
  }

  return LinkWithBitcodeVector(module,
                               GetROCDLPaths(gcn_arch_name, rocdl_dir_path));
}

Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version,
                                const HloModuleConfig& hlo_module_config,
                                const std::string& device_bitcode_dir_path) {
  // Link the input module with ROCDL.

  auto compute_capability =
      std::get_if<se::RocmComputeCapability>(&gpu_version);
  if (!compute_capability) {
    return xla::InternalError("Incompatible compute capability was specified.");
  }

  std::string gcn_arch_name = compute_capability->gcn_arch_name();
  TF_RETURN_IF_ERROR(
      LinkROCDLIfNecessary(module, gcn_arch_name, device_bitcode_dir_path));

  // If ftz is enabled, set it as an attribute on every function in the module.
  if (hlo_module_config.debug_options().xla_gpu_ftz()) {
    for (llvm::Function& fn : *module) {
      fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
    }
  }

  return OkStatus();
}

// The following routine takes a feature token extracted from the
// hipDeviceProp_t::gcnArchName string and maps it to a valid feature_str
// to be used for creating the AMDGPUTarget.
// This mapping is currently in a state of flux because TF XLA uses its
// own copy of LLVM, which is different from the LLVM version used by
// hipcc/runtime in the ROCm install. Ordinarily this is not a problem,
// but right now the LLVM version used by hipcc/runtime has "targetID"
// related changes which have not yet been upstreamed (to the LLVM repo).
// When that upstreaming happens (and the TF LLVM pointer moves past the
// upstream commit), the following mapping will need to change.
std::string MapGCNArchNameTokenToFeatureStr(const std::string& token) {
  if (token == "sramecc+") {
    return "+sramecc";
  } else if (token == "sramecc-") {
    return "-sramecc";
  } else if (token == "xnack+") {
    return "+xnack";
  } else if (token == "xnack-") {
    return "-xnack";
  }
  return "";
}

std::pair<std::string, std::string> GetFeatureStrFromGCNArchName(
    const std::string& gcn_arch_name) {
  std::string feature_str;

  std::string gfx = gcn_arch_name;
  // For ROCm versions 4.0 and greater, we need to specify the correct feature
  // string, based on the underlying GPU hardware, to get maximum performance.
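  // For example, gcn_arch_name "gfx90a:sramecc+:xnack-" yields
  // gfx = "gfx90a" and feature_str = "+sramecc,-xnack".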
  std::vector<std::string> tokens = absl::StrSplit(gcn_arch_name, ':');
  std::vector<std::string> mapped_tokens;
  if (!tokens.empty()) gfx = tokens[0];
  for (auto it = tokens.begin(); it != tokens.end(); it++) {
    // Skip the first token, which is the gfxNNN string.
    // The rest of the tokens are the feature/targetid strings.
    if (it != tokens.begin()) {
      std::string token(*it);
      std::string mapped_token = MapGCNArchNameTokenToFeatureStr(token);
      mapped_tokens.push_back(mapped_token);
    }
  }
  feature_str = absl::StrJoin(mapped_tokens, ",");

  return std::make_pair(gfx, feature_str);
}

std::unique_ptr<llvm::TargetMachine> AMDGPUGetTargetMachine(
    llvm::Triple target_triple, GpuVersion gpu_version,
    const HloModuleConfig& hlo_module_config) {
  auto compute_capability =
      std::get_if<se::RocmComputeCapability>(&gpu_version);

  std::string gcn_arch_name = compute_capability->gcn_arch_name();
  auto arch = GetFeatureStrFromGCNArchName(gcn_arch_name);
  return GetTargetMachine(std::move(target_triple), arch.first,
                          hlo_module_config, arch.second);
}

void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) {
  llvm_ir::InitializeLLVMCommandLineOptions(
      hlo_module_config.debug_options().xla_backend_extra_options());

  // Initialize the AMDGPU target; it's the only target we link with, so call
  // its specific initialization functions instead of the catch-all
  // InitializeAll*.
#if TENSORFLOW_USE_ROCM
  LLVMInitializeAMDGPUTarget();
  LLVMInitializeAMDGPUTargetInfo();
  LLVMInitializeAMDGPUTargetMC();
  LLVMInitializeAMDGPUAsmPrinter();
#endif

  llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
  InitializePasses(registry);
}

}  // namespace

namespace amdgpu {
StatusOr<std::vector<uint8_t>> CompileToHsaco(
    llvm::Module* module, GpuVersion gpu_version,
    const HloModuleConfig& hlo_module_config,
    const std::string& rocdl_dir_path) {
  static absl::once_flag backend_init_flag;
  absl::call_once(backend_init_flag, AMDGPUBackendInit, hlo_module_config);

  std::vector<uint8_t> hsaco;
  std::string str;
  llvm::raw_string_ostream stream(str);
  stream << *module;
  // Delete the first two lines, since they usually vary even when the rest of
  // the code is the same (but verify that they are what we expect).
  if (str.size() >= 13 && str.substr(0, 13) == "; ModuleID = ") {
    auto pos = str.find('\n');
    if (pos != std::string::npos) str = str.substr(pos + 1);
  }
  if (str.size() >= 18 && str.substr(0, 18) == "source_filename = ") {
    auto pos = str.find('\n');
    if (pos != std::string::npos) str = str.substr(pos + 1);
  }
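  // Append the compilation cache key so that modules with identical IR but
  // different compilation options do not collide in the HSACO cache.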
  str += hlo_module_config.compilation_cache_key();
  {
    tensorflow::profiler::TraceMe activity(
        [&] { return absl::StrCat("Compiling IR", module->getName().str()); },
        tensorflow::profiler::TraceMeLevel::kInfo);
    XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());

    auto compute_capability =
        std::get_if<se::RocmComputeCapability>(&gpu_version);
    if (!compute_capability) {
      return xla::InternalError(
          "Incompatible compute capability was specified.");
    }

    std::string gcn_arch_name = compute_capability->gcn_arch_name();

    uint64_t hash;
    if (HsacoCache::Find(str, hash, gcn_arch_name, hsaco)) {
      VLOG(1) << "HSACO cache hit";
      return hsaco;
    }
    VLOG(1) << "HSACO cache miss";
    bool dump_lls = false;
    if (dump_lls) {
      static int hsaco_count = 0;
      std::string name = "/tmp/" + std::to_string(hsaco_count) + ".ll";
      hsaco_count++;
      std::ofstream ofs(name);
      ofs << str;
      ofs.close();
    }

    llvm::Triple default_target_triple("amdgcn--amdhsa-amdgiz");
    // Construct LLVM TargetMachine for AMDGPU.
    std::unique_ptr<llvm::TargetMachine> target_machine =
        AMDGPUGetTargetMachine(default_target_triple, gpu_version,
                               hlo_module_config);

    // Link with ROCm-Device-Libs, and optimize the LLVM module.
    TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
        module, gpu_version, hlo_module_config, rocdl_dir_path,
        AMDGPUTargetModuleLinker, default_target_triple, target_machine.get(),
        kAMDGPUInlineThreshold));

    // Lower optimized LLVM module to HSA code object.
    TF_ASSIGN_OR_RETURN(hsaco, EmitModuleToHsaco(module, target_machine.get()));
    HsacoCache::Add(str, hash, gcn_arch_name, hsaco);
  }
  return hsaco;
}

}  // namespace amdgpu

}  // namespace gpu
}  // namespace xla