#include <torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h>

#include <ATen/DynamicLibrary.h>
#include <ATen/code_template.h>
#include <c10/util/Exception.h>
#include <torch/csrc/jit/codegen/fuser/compiler.h>
#include <torch/csrc/jit/codegen/fuser/cpu/temp_file.h>

#include <array>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <optional>
#include <sstream>
#include <string>
#include <vector>
13
14 namespace torch {
15 namespace jit {
16 namespace fuser {
17 namespace cpu {
18
19 #ifdef _MSC_VER
getTempPath()20 static const std::string getTempPath() {
21 wchar_t lpTempPathBuffer[MAX_PATH];
22
23 DWORD dwRetVal = GetTempPathW(
24 MAX_PATH, // length of the buffer
25 lpTempPathBuffer); // buffer for path
26
27 TORCH_CHECK(dwRetVal < MAX_PATH && dwRetVal != 0, "GetTempPath failed.");
28
29 return std::string(c10::u16u8(lpTempPathBuffer));
30 }
31 static const std::string temp_dir = getTempPath();
32 static const std::string so_template = temp_dir + "pytorch_fuserXXXXXX.dll";
33 static const std::string cpp_template = temp_dir + "pytorch_fuserXXXXXX.cpp";
34 static const std::string check_exists_string = "where ${program} > nul 2> nul";
35 static std::vector<std::wstring> env_list;
36 constexpr int so_suffix_len = 4;
37 constexpr int cpp_suffix_len = 4;
38 #else
// Temp-file name templates for the compiled library and the generated source.
static const std::string so_template = "/tmp/pytorch_fuserXXXXXX.so";
static const std::string cpp_template = "/tmp/pytorch_fuserXXXXXX.cpp";

// Shell command template used to probe whether ${program} can be found.
// Both stdout and stderr are discarded (as the Windows variant of this
// template does) so a missing compiler does not leak `which` diagnostics
// into the process's stderr; only the exit status is consulted.
static const std::string check_exists_string =
    "which ${program} > /dev/null 2>&1";

// Lengths of the ".so" and ".cpp" suffixes that follow the XXXXXX
// placeholder in the templates above.
constexpr int so_suffix_len = 3;
constexpr int cpp_suffix_len = 4;
44 #endif
45
46 intptr_t run(const std::string& cmd);
47
programExists(const std::string & program)48 static bool programExists(const std::string& program) {
49 std::stringstream ss;
50 c10::printQuotedString(ss, program);
51 at::jit::TemplateEnv env;
52 env.s("program", ss.str());
53 std::string cmd = format(check_exists_string, env);
54 #ifdef _MSC_VER
55 return (run(cmd.c_str()) == 0);
56 #else
57 return (system(cmd.c_str()) == 0);
58 #endif
59 }
60
61 #ifdef _MSC_VER
exec(const std::wstring & cmd)62 std::optional<std::wstring> exec(const std::wstring& cmd) {
63 std::array<wchar_t, 128> buffer;
64 std::wstring result;
65 std::unique_ptr<FILE, decltype(&_pclose)> pipe(
66 _wpopen(cmd.c_str(), L"r"), _pclose);
67 if (!pipe) {
68 return std::nullopt;
69 }
70 while (fgetws(buffer.data(), static_cast<int>(buffer.size()), pipe.get()) !=
71 nullptr) {
72 result += buffer.data();
73 }
74 return result;
75 }
76
// Strips, in place, every trailing character of `s` that appears in the
// character set `t` (whitespace by default) and returns a reference to `s`.
// A string made up solely of characters from `t` becomes empty.
inline std::wstring& rtrim(std::wstring& s, const wchar_t* t = L" \t\n\r\f\v") {
  const std::wstring::size_type last_kept = s.find_last_not_of(t);
  if (last_kept == std::wstring::npos) {
    s.clear();
  } else {
    s.resize(last_kept + 1);
  }
  return s;
}
81
activate()82 void activate() {
83 wchar_t* root = nullptr;
84 std::wstring cmd;
85 std::optional<std::wstring> exec_out;
86 std::wstring path;
87 std::wstring vcruntime_plat;
88 std::wstring envvars;
89
90 // Checking whether the environment is already activated
91 if (_wgetenv(L"VSCMD_ARG_TGT_ARCH")) {
92 return;
93 }
94
95 // Getting `ProgramFiles` through environment variable queries
96 root = _wgetenv(L"ProgramFiles(x86)");
97 if (!root) {
98 root = _wgetenv(L"ProgramFiles");
99 }
100 if (!root) {
101 return;
102 }
103
104 // Getting VS 2017 installation path through `vswhere`
105 cmd = L"\"" + std::wstring(root) +
106 L"\\Microsoft Visual Studio\\Installer\\vswhere.exe\""
107 L" -latest -prerelease -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath";
108 exec_out = exec(cmd);
109 if (!exec_out) {
110 return;
111 }
112 path = *exec_out;
113 rtrim(path);
114
115 // Checking whether the activation script `vcvarsall.bat` exists
116 path += L"\\VC\\Auxiliary\\Build";
117 struct _stati64 st;
118 if (_wstati64(path.c_str(), &st) == -1 || !(st.st_mode & _S_IFDIR)) {
119 return;
120 }
121 path += L"\\vcvarsall.bat";
122 if (_waccess(path.c_str(), 0) == -1) {
123 return;
124 }
125
126 // Determining current platform
127 if (sizeof(void*) == 8) {
128 vcruntime_plat = L"x64";
129 } else {
130 vcruntime_plat = L"x86";
131 }
132
133 // Getting environment variables after activating VS development shell
134 cmd = L"\"" + path + L"\" " + vcruntime_plat + L">NUL && set";
135 exec_out = exec(cmd);
136 if (!exec_out) {
137 return;
138 }
139 envvars = *exec_out;
140
141 // Setting environment variables to the current environment
142 std::wistringstream f(envvars);
143 std::wstring envvar;
144 while (getline(f, envvar, L'\n')) {
145 env_list.push_back(envvar);
146 }
147 }
148
run(const std::string & cmd)149 intptr_t run(const std::string& cmd) {
150 // Getting the path of `cmd.exe`
151 wchar_t* comspec = _wgetenv(L"COMSPEC");
152 if (!comspec) {
153 comspec = L"C:\\Windows\\System32\\cmd.exe";
154 }
155 // Constructing the command line
156 auto wCmd = c10::u8u16(cmd);
157 const wchar_t* a[] = {L"/c", wCmd.c_str(), nullptr};
158 // Constructing the env array
159 // If `env_list` is not empty, then add char pointers ending with nullptr.
160 // Otherwise, it will be nullptr, which implies the default env.
161 std::vector<const wchar_t*> e;
162 if (!env_list.empty()) {
163 for (auto& s : env_list) {
164 e.push_back(s.c_str());
165 }
166 e.push_back(nullptr);
167 }
168 // Running the command
169 intptr_t r = _wspawnve(_P_WAIT, comspec, a, e.data());
170 return r;
171 }
172 #endif
173
174 // A single compiler config is accessed through getConfig() (below)
175 // Controls compilation options and may be updated based on the result
176 // of compilation attempts.
177 struct CompilerConfig {
CompilerConfigtorch::jit::fuser::cpu::CompilerConfig178 CompilerConfig() {
179 const char* cxx_env = getenv("CXX");
180 if (cxx_env != nullptr) {
181 cxx = cxx_env;
182 }
183
184 #ifdef _MSC_VER
185 activate();
186 #endif
187
188 if (!programExists(cxx)) {
189 TORCH_WARN("Compiler passed via CXX envvar does not exist!");
190 cxx = "";
191 }
192 }
193
194 ~CompilerConfig() = default;
195
196 #ifdef _MSC_VER
197 std::string cxx = "cl";
198 const std::string openmp_flags = "/openmp";
199 #elif defined(__clang__)
200 std::string cxx = "clang++";
201 const std::string openmp_flags = "-fopenmp";
202 #else
203 std::string cxx = "g++";
204 const std::string openmp_flags = "-fopenmp";
205 #endif
206 // Set openmp to true only if PyTorch is compiled with OpenMP support
207 // OpenMP is typically not available on MacOS platform
208 #if defined(_OPENMP)
209 bool openmp = true;
210 #else
211 bool openmp = false;
212 #endif
213 };
214
getConfig()215 static CompilerConfig& getConfig() {
216 static CompilerConfig config;
217 return config;
218 }
219
// NB: -march=native is not supported by g++ on PPC64. It's a bit annoying
// to do a configure-style test to decide whether the g++ actually
// supports it, so we heuristically use the host compiler to predict
// whether the runtime compiler supports the option we want. This probably
// won't work if you're cross-compiling.
// NB: -march=native is currently disabled because it has caused problems
// where the compiler and the assembler do not agree on which native
// instructions they understand for AVX512. When we need better CPU
// performance, this optimization can be re-enabled by tracking down the
// platforms where this error occurs and selectively disabling it only there.
230 #if (defined(_MSC_VER) && !defined(_M_ARM64))
// According to https://stackoverflow.com/a/29178079, we are able to
// detect which arch level is supported by the vectorizer using the
// variable __isa_available (declared below). Its value is only available
// at runtime.
// The value of __isa_available and the corresponding arch:
//   AVX     4
//   AVX2    5
//   AVX512  6
extern "C" int __isa_available;

// Maps the value of __isa_available to the matching MSVC `/arch:` option:
// 6 and above -> AVX512, 5 -> AVX2, 4 -> AVX, anything lower -> no option.
static std::string getArchFlags() {
  const int isa_level = __isa_available;
  if (isa_level < 4) {
    return "";
  }
  if (isa_level == 4) {
    return "/arch:AVX";
  }
  if (isa_level == 5) {
    return "/arch:AVX2";
  }
  return "/arch:AVX512";
}
250 static const std::string arch_flags = getArchFlags();
251 static const std::string compile_string = "cd /D \"" + temp_dir +
252 "\" && "
253 "${cxx} /nologo /MD /O2 " +
254 arch_flags +
255 " /LD /EHsc "
256 "${fopenmp} \"${cpp_file}\" /link /out:\"${so_file}\"";
257 #else
// Command template for building a fused kernel into a shared library.
// ${cxx}, ${fopenmp}, ${cpp_file} and ${so_file} are filled in by
// runCompiler().
static const std::string compile_string =
    "\"${cxx}\" -O3 -g"
#ifndef __PPC64__
// " -march=native" (currently disabled)
#endif
    " -std=c++17 -fPIC ${fopenmp} -shared"
    " \"${cpp_file}\" -o \"${so_file}\" -lm";
264 #endif
runCompiler(const std::string & cpp_file,const std::string & so_file)265 static void runCompiler(
266 const std::string& cpp_file,
267 const std::string& so_file) {
268 auto& config = getConfig();
269 TORCH_CHECK(
270 !config.cxx.empty(),
271 "Failed to compile a fused CPU kernel: Compiler not found");
272 at::jit::TemplateEnv env;
273 env.s("cxx", config.cxx);
274 env.s("fopenmp", config.openmp ? config.openmp_flags : "");
275 env.s("cpp_file", cpp_file);
276 env.s("so_file", so_file);
277 std::string result = format(compile_string, env);
278 #ifdef _MSC_VER
279 intptr_t r = run(result);
280 #else
281 int r = system(result.c_str());
282 #endif
283 if (config.openmp && r != 0) {
284 std::cerr
285 << "warning: pytorch jit fuser failed to compile with openmp, trying without it...\n";
286 config.openmp = false; // disable for future compiles
287 return runCompiler(cpp_file, so_file);
288 }
289 TORCH_CHECK(r == 0, "Failed to compile a fused CPU kernel");
290 }
291
292 #ifdef _MSC_VER
293 static const std::string disas_string =
294 "dumpbin /DISASM:NOBYTES \"${so_file}\"";
295 #else
296 static const std::string disas_string = "objdump -M intel -d \"${so_file}\"";
297 #endif
disas(const std::string & so_file)298 static void disas(const std::string& so_file) {
299 at::jit::TemplateEnv env;
300 env.s("so_file", so_file);
301 std::string cmd = format(disas_string, env);
302 int r = system(cmd.c_str());
303 AT_ASSERT(r == 0);
304 }
305
FusedKernelCPU(std::string name,std::string code,std::vector<TensorDesc> input_desc,std::vector<TensorDesc> output_desc,std::vector<PartitionDesc> chunk_desc,std::vector<PartitionDesc> concat_desc,bool has_random)306 FusedKernelCPU::FusedKernelCPU(
307 std::string name,
308 std::string code,
309 std::vector<TensorDesc> input_desc,
310 std::vector<TensorDesc> output_desc,
311 std::vector<PartitionDesc> chunk_desc,
312 std::vector<PartitionDesc> concat_desc,
313 bool has_random)
314 : FusedKernel(
315 std::move(name),
316 std::move(code),
317 std::move(input_desc),
318 std::move(output_desc),
319 std::move(chunk_desc),
320 std::move(concat_desc),
321 has_random) {
322 TempFile so_file(so_template, so_suffix_len);
323 TempFile cpp_file(cpp_template, cpp_suffix_len);
324 cpp_file.write(code_);
325 cpp_file.sync();
326 #ifdef _MSC_VER
327 so_file.close();
328 cpp_file.close();
329 #endif
330 runCompiler(cpp_file.name(), so_file.name());
331 if (debugFuser() >= 2)
332 disas(so_file.name());
333 so_lib = std::make_unique<at::DynamicLibrary>(so_file.name().c_str());
334 #pragma GCC diagnostic ignored "-Wpedantic"
335 kernel =
336 reinterpret_cast<void (*)(uint32_t, void**)>(so_lib->sym(name_.c_str()));
337 #pragma GCC diagnostic pop
338 }
339
createFusionKernel(int16_t device,std::string name,std::string code,std::vector<TensorDesc> input_desc,std::vector<TensorDesc> output_desc,std::vector<PartitionDesc> chunk_desc,std::vector<PartitionDesc> concat_desc,bool has_random)340 static std::shared_ptr<FusedKernel> createFusionKernel(
341 int16_t device,
342 std::string name,
343 std::string code,
344 std::vector<TensorDesc> input_desc,
345 std::vector<TensorDesc> output_desc,
346 std::vector<PartitionDesc> chunk_desc,
347 std::vector<PartitionDesc> concat_desc,
348 bool has_random) {
349 return std::make_shared<FusedKernelCPU>(
350 std::move(name),
351 std::move(code),
352 std::move(input_desc),
353 std::move(output_desc),
354 std::move(chunk_desc),
355 std::move(concat_desc),
356 has_random);
357 }
358
359 RegisterFusionBackend reg(DeviceType::CPU, createFusionKernel);
360 } // namespace cpu
361 } // namespace fuser
362 } // namespace jit
363 } // namespace torch
364