1# mypy: allow-untyped-defs 2import functools 3import logging 4import os 5import sys 6import tempfile 7from typing import Any, Dict, Optional 8 9import torch 10from torch._strobelight.compile_time_profiler import StrobelightCompileTimeProfiler 11 12 13log = logging.getLogger(__name__) 14 15if os.environ.get("TORCH_COMPILE_STROBELIGHT", False): 16 import shutil 17 18 if not shutil.which("strobeclient"): 19 log.info( 20 "TORCH_COMPILE_STROBELIGHT is true, but seems like you are not on a FB machine." 21 ) 22 else: 23 log.info("Strobelight profiler is enabled via environment variable") 24 StrobelightCompileTimeProfiler.enable() 25 26# this arbitrary-looking assortment of functionality is provided here 27# to have a central place for overrideable behavior. The motivating 28# use is the FB build environment, where this source file is replaced 29# by an equivalent. 30 31if torch._running_with_deploy(): 32 # __file__ is meaningless in the context of frozen torch used in torch deploy. 33 # setting empty torch_parent should allow below functions to operate without crashing, 34 # but it's unclear if there is a valid use case for them in the context of deploy. 
35 torch_parent = "" 36else: 37 if os.path.basename(os.path.dirname(__file__)) == "shared": 38 torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) 39 else: 40 torch_parent = os.path.dirname(os.path.dirname(__file__)) 41 42 43def get_file_path(*path_components: str) -> str: 44 return os.path.join(torch_parent, *path_components) 45 46 47def get_file_path_2(*path_components: str) -> str: 48 return os.path.join(*path_components) 49 50 51def get_writable_path(path: str) -> str: 52 if os.access(path, os.W_OK): 53 return path 54 return tempfile.mkdtemp(suffix=os.path.basename(path)) 55 56 57def prepare_multiprocessing_environment(path: str) -> None: 58 pass 59 60 61def resolve_library_path(path: str) -> str: 62 return os.path.realpath(path) 63 64 65def throw_abstract_impl_not_imported_error(opname, module, context): 66 if module in sys.modules: 67 raise NotImplementedError( 68 f"{opname}: We could not find the fake impl for this operator. " 69 ) 70 else: 71 raise NotImplementedError( 72 f"{opname}: We could not find the fake impl for this operator. " 73 f"The operator specified that you may need to import the '{module}' " 74 f"Python module to load the fake impl. {context}" 75 ) 76 77 78# NB! This treats "skip" kwarg specially!! 
79def compile_time_strobelight_meta(phase_name): 80 def compile_time_strobelight_meta_inner(function): 81 @functools.wraps(function) 82 def wrapper_function(*args, **kwargs): 83 if "skip" in kwargs: 84 kwargs["skip"] = kwargs["skip"] + 1 85 86 if not StrobelightCompileTimeProfiler.enabled: 87 return function(*args, **kwargs) 88 89 return StrobelightCompileTimeProfiler.profile_compile_time( 90 function, phase_name, *args, **kwargs 91 ) 92 93 return wrapper_function 94 95 return compile_time_strobelight_meta_inner 96 97 98# Meta only, see 99# https://www.internalfb.com/intern/wiki/ML_Workflow_Observability/User_Guides/Adding_instrumentation_to_your_code/ 100# 101# This will cause an event to get logged to Scuba via the signposts API. You 102# can view samples on the API at https://fburl.com/scuba/workflow_signpost/zh9wmpqs 103# we log to subsystem "torch", and the category and name you provide here. 104# Each of the arguments translate into a Scuba column. We're still figuring 105# out local conventions in PyTorch, but category should be something like 106# "dynamo" or "inductor", and name should be a specific string describing what 107# kind of event happened. 
#
# Killswitch is at
# https://www.internalfb.com/intern/justknobs/?name=pytorch%2Fsignpost#event
def signpost_event(category: str, name: str, parameters: Dict[str, Any]):
    """Record a signpost event; in this build it is only logged at INFO level."""
    log.info("%s %s: %r", category, name, parameters)


def log_compilation_event(metrics):
    """Log ``metrics`` at INFO level."""
    log.info("%s", metrics)


def upload_graph(graph):
    """No-op in this build."""


def set_pytorch_distributed_envs_from_justknobs():
    """No-op in this build."""


def log_export_usage(**kwargs):
    """No-op in this build."""


def log_trace_structured_event(*args, **kwargs) -> None:
    """No-op in this build."""


def log_cache_bypass(*args, **kwargs) -> None:
    """No-op in this build."""


def log_torchscript_usage(api: str, **kwargs):
    """No-op in this build; ``api`` is accepted and ignored."""
    _ = api
    return


def check_if_torch_exportable():
    """Always returns ``False`` in this build."""
    return False


def log_torch_jit_trace_exportability(
    api: str,
    type_of_export: str,
    export_outcome: str,
    result: str,
):
    """No-op in this build; all arguments are accepted and ignored."""
    _, _, _, _ = api, type_of_export, export_outcome, result
    return


def capture_pre_autograd_graph_using_training_ir() -> bool:
    """Always returns ``False`` in this build."""
    return False


class JustKnobsConfig:
    """Represents a lazily loaded config

    This is designed to be used to specify a value in a config.

    i.e. foo.bar = JustKnobsConfig(name="//foo:bar", env_name="FORCE_FOO_BAR")

    Call .get() in order to access the value
    i.e. if foo.bar.get():

    Note that the value is fetched once, and then not allowed to change. This
    means fewer surprises, at the downside that you may have to restart a job
    to pick up an update.

    It can also be set explicitly via set - i.e.
    foo.bar = JustKnobsConfig(name="//foo:bar")
    foo.bar.set(True)

    Note that this does allow for no JK name (so that you can use this to replace old configurations).
    In that case, when neither an explicit value nor the environment variable
    is present, ``default`` is the value returned by ``get()``.
    """

    def __init__(
        self, *, name: Optional[str] = None, env_name=None, default: bool = True
    ):
        self.name = name
        self.env_name = env_name
        self.default = default
        # Explicit override installed through set(); None means "not set".
        self.value: Optional[bool] = None
        # Result of the first get(); None means "not evaluated yet".
        self.executed_value = None

    def set(self, value: bool):
        """Install an explicit value; it has no effect once get() has run."""
        self.value = value

    def get(self):
        """Resolve the value on first use via ``justknobs_feature`` and cache it."""
        if self.executed_value is None:
            self.executed_value = justknobs_feature(
                self.name,
                config_value=self.value,
                env_name=self.env_name,
                default=self.default,
            )
        return self.executed_value

    def __str__(self):
        # NOTE: evaluating bool(self) resolves (and therefore freezes) the value.
        v = bool(self)
        return f"JustknobsConfig(name={self.name}, env_name={self.env_name}, default={self.default} - evals_to={v})"

    def __bool__(self):
        return self.get()


def justknobs_feature(
    name: Optional[str], config_value=None, env_name=None, default: bool = True
):
    """Returns whether or not a specific justknob feature is enabled.

    This is a slightly higher level API than justknobs_check, designed to make it "easy" to do the right thing.
    The primary thing it does, is allow configuration to override JK by default, while retaining some features to force this
    the other way during sevs.

    The preference order (i.e. who wins first) in OSS (and FB) is
    - Config if specified
    - Environment Variable if specified
    - JK (FB), or default (OSS)


    Quickstart
    Have a config variable
    Make a JK which is set to your "enabled" value (generally true).
    Use this feature to check it (if you set the JK to be false, change the default).
    If you have an env variable, also use the function to check it.

    Arguments:
        name - This should correspond 1:1 to a JK name internally to FB. May be
            None, in which case there is no JK to consult and `default` is
            returned when neither config nor environment decide.
        env_name - If this is set, we'll try and read the value from environment variables.
            "1"/"TRUE" and "0"/"FALSE" (case-insensitive) are recognised; any
            other value is logged as an error and treated as True.
        config_value - If this is set to anything other than None, we'll use this value by
            default. Note that within FB, there is some functionality to force override these
            configs
        default - This is the value to return in OSS. This avoids having to write weird double
            negatives within justknobs and the config code, if you just want to have the
            killswitch work by having feature return True to turn off features

    Requirements:
        WARNING - Don't use this at import time - Simply pass in the existing config.
        If you want to use this at config time, use JustKnobsConfig
    """
    if config_value is not None:
        return config_value
    if env_name is not None and ((env := os.getenv(env_name)) is not None):
        env = env.upper()
        if env in ("1", "TRUE"):
            return True
        if env in ("0", "FALSE"):
            return False
        log.error(
            "Difficulty parsing env variable %s=%s for feature %s - Assuming env variable means true and returning True",
            env_name,
            env,
            name,
        )
        # We could return default here, but that was confusing to log.
        return True
    if name is None:
        # No JK name to look up: honour the documented fallback instead of a
        # hard-coded True, so name-less configs with default=False stay off.
        return default
    jk_value = justknobs_check(name)
    # With default=False the JK is interpreted with inverted polarity.
    return jk_value if default else not jk_value


def justknobs_check(name: str) -> bool:
    """
    This function can be used to killswitch functionality in FB prod,
    where you can toggle this value to False in JK without having to
    do a code push.  In OSS, we always have everything turned on all
    the time, because downstream users can simply choose to not update
    PyTorch.  (If more fine-grained enable/disable is needed, we could
    potentially have a map we lookup name in to toggle behavior.  But
    the point is that it's all tied to source code in OSS, since there's
    no live server to query.)

    This is the bare minimum functionality I needed to do some killswitches.
    We have a more detailed plan at
    https://docs.google.com/document/d/1Ukerh9_42SeGh89J-tGtecpHBPwGlkQ043pddkKb3PU/edit
    In particular, in some circumstances it may be necessary to read in
    a knob once at process start, and then use it consistently for the
    rest of the process.  Future functionality will codify these patterns
    into a better high level API.

    WARNING: Do NOT call this function at module import time, JK is not
    fork safe and you will break anyone who forks the process and then
    hits JK again.
    """
    return True


def justknobs_getval_int(name: str) -> int:
    """
    Read warning on justknobs_check
    """
    return 0


def is_fb_unit_test() -> bool:
    """Always returns ``False`` in this build."""
    return False


@functools.lru_cache(None)
def max_clock_rate():
    """Return the maximum clock rate used for the current GPU (cached).

    On non-HIP builds this is the first value reported by
    ``triton.testing.nvsmi(["clocks.max.sm"])``.  On HIP builds it is a
    hard-coded number selected by the ``gcnArchName`` of device 0.
    """
    if not torch.version.hip:
        from triton.testing import nvsmi

        return nvsmi(["clocks.max.sm"])[0]

    # Manually set max-clock speeds on ROCm until equivalent nvsmi
    # functionality in triton.testing or via pyamdsmi enablement. Required
    # for test_snode_runtime unit tests.
    gcn_arch = str(torch.cuda.get_device_properties(0).gcnArchName.split(":", 1)[0])
    # Checked in order; the first substring match wins.
    rocm_clock_rates = (
        ("gfx94", 1700),
        ("gfx90a", 1700),
        ("gfx908", 1502),
        ("gfx11", 1700),
        ("gfx103", 1967),
        ("gfx101", 1144),
    )
    for arch_fragment, rate in rocm_clock_rates:
        if arch_fragment in gcn_arch:
            return rate
    # Fallback for architectures not listed above.
    return 1100


TEST_MASTER_ADDR = "127.0.0.1"
TEST_MASTER_PORT = 29500
# USE_GLOBAL_DEPS controls whether __init__.py tries to load
# libtorch_global_deps, see Note [Global dependencies]
USE_GLOBAL_DEPS = True
# USE_RTLD_GLOBAL_WITH_LIBTORCH controls whether __init__.py tries to load
# _C.so with RTLD_GLOBAL during the call to dlopen.
USE_RTLD_GLOBAL_WITH_LIBTORCH = False
# If an op was defined in C++ and extended from Python using
# torch.library.register_fake, this controls whether we require that there be
# a m.set_python_module("mylib.ops") call from C++ that associates
# the C++ op with a python module.
REQUIRES_SET_PYTHON_MODULE = False


def maybe_upload_prof_stats_to_manifold(profile_path: str) -> Optional[str]:
    """Print a notice and return ``None``; nothing is uploaded in this build."""
    print("Uploading profile stats (fb-only otherwise no-op)")
    return None


def log_chromium_event_internal(event, stack, logger_uuid, start_timestamp=None):
    """No-op in this build; all arguments are ignored and ``None`` is returned."""