/aosp_15_r20/external/pytorch/test/inductor/ |
H A D | s429861_repro.py | 10 arg0_1: "f32[][]cuda:0", 11 arg1_1: "f32[50][1]cuda:0", 12 arg2_1: "f32[23][1]cuda:0", 13 arg3_1: "f32[38][1]cuda:0", 14 arg4_1: "f32[5][1]cuda:0", 15 arg5_1: "f32[100][1]cuda:0", 16 arg6_1: "f32[50][1]cuda:0", 17 arg7_1: "f32[77][1]cuda:0", 18 arg8_1: "f32[100][1]cuda:0", 19 arg9_1: "f32[100][1]cuda:0", [all …]
|
H A D | test_torchinductor_codegen_dynamic_shapes.py | 95 ("cpu", "cuda", "xpu"), is_skip=True 98 ("cpu", "cuda", "xpu"), is_skip=True 100 "test_to_device_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu"), is_skip=True), 136 "test_complex_fallback_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), 137 "test_adaptive_avg_pool2d2_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), 138 "test_adaptive_max_pool2d2_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), 139 "test_fractional_max_pool2d2_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), 140 "test_argmax_to_float_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), 141 "test_avg_pool2d7_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), 142 "test_avg_pool2d_backward4_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), [all …]
|
H A D | test_torchinductor_opinfo.py | 185 inductor_skips["cuda"] = { 200 inductor_skips["cuda"]["bfloat16"] = {b8, f16, f32, f64, i32, i64} 204 inductor_skips["cuda"]["logcumsumexp"] = {f32} 205 inductor_skips["cuda"]["special.modified_bessel_i1"] = {f64} 212 }, # half_to_float is only valid for the CUDA implementation 239 inductor_expected_failures_single_sample["cuda"] = { 277 inductor_expected_failures_single_sample["cuda"].update(intentionally_not_handled) 282 inductor_gradient_expected_failures_single_sample["cuda"] = {} 289 inductor_should_fail_with_exception["cuda"] = {} 335 ("cross", "cuda", f16): {"reference_in_float": True}, [all …]
|
H A D | test_foreach.py | 75 torch.rand(10, 10, device="cuda:0"), 76 torch.rand(20, 20, device="cuda:0"), 80 torch.rand(10, 10, device="cuda:0"), 81 torch.rand(20, 20, device="cuda:0"), 82 torch.rand(10, 10, device="cuda:0"), 83 torch.rand(20, 20, device="cuda:0"), 124 torch.rand(10, 10, device="cuda:0"), 125 torch.rand(20, 20, device="cuda:0"), 131 return op([a0, a1], torch.tensor(3.3, device="cuda:0")) 136 torch.rand(10, 10, device="cuda:0"), [all …]
|
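The test_foreach.py excerpt above builds lists of same-device CUDA tensors of differing shapes, plus an occasional 0-dim scalar tensor. That is the input pattern of the torch._foreach_* ops, which apply one operation across a whole tensor list with far fewer kernel launches than a Python loop. A minimal sketch of the pattern (not the test's actual body):

    import torch

    # Lists of CUDA tensors, possibly of different shapes, on one device --
    # the input shape the excerpt above constructs.
    if torch.cuda.is_available():
        xs = [torch.rand(10, 10, device="cuda:0"), torch.rand(20, 20, device="cuda:0")]
        ys = [torch.rand(10, 10, device="cuda:0"), torch.rand(20, 20, device="cuda:0")]

        sums = torch._foreach_add(xs, ys)     # one horizontally fused pass over the list
        scaled = torch._foreach_mul(xs, 3.3)  # scalar variant; the excerpt also passes
                                              # a 0-dim torch.tensor(3.3, device="cuda:0")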
H A D | test_pattern_matcher.py | 76 torch.randn(16, 16, device="cuda"), 77 torch.randn(16, 16, device="cuda"), 78 torch.randn(16, 16, device="cuda"), 79 torch.randn(16, 16, device="cuda"), 82 torch.randn(1, 4, device="cuda"), 83 torch.randn(4, 2, device="cuda"), 84 torch.randn(1, 5, device="cuda"), 85 torch.randn(5, 2, device="cuda"), 95 torch.randn(1, 4, device="cuda"), 96 torch.randn(4, 2, device="cuda"), [all …]
|
H A D | test_combo_kernels.py | 55 torch.rand(10, 10, device="cuda"), 56 torch.rand(20, 20, device="cuda"), 57 torch.rand(10, 10, device="cuda"), 77 torch.rand(10, 10, device="cuda"), 78 torch.rand(20, 20, device="cuda"), 79 torch.rand(10, 10, device="cuda"), 80 torch.rand(30, 8, device="cuda"), 100 torch.rand(10, 10, device="cuda"), 101 torch.rand(20, 20, device="cuda"), 102 torch.rand(10, 10, device="cuda"), [all …]
|
H A D | test_cuda_repro.py | 9 import torch.backends.cuda 63 device = "cuda" 94 torch.ones(shape, dtype=dtype, device="cuda") for (shape, dtype) in inps 105 ).cuda() 106 inp = torch.randn([2, 3, 16, 16]).to(memory_format=torch.channels_last).cuda() 130 rand_strided((12, 3, 512, 64), (64, 196608, 768, 1), torch.float32, "cuda"), 133 mod = make_fx(Repro().to(device="cuda"))(*inps) 138 IS_FBCODE, "RuntimeError: Triton Error [CUDA]: invalid device context" 144 x = torch.randn(4, device="cuda", requires_grad=True) 156 device=torch.device(type="cuda", index=0), [all …]
|
/aosp_15_r20/external/pytorch/test/ |
H A D | test_cuda_multigpu.py | 1 # Owner(s): ["module: cuda"] 17 import torch.cuda.comm as comm 41 torch.cuda.get_allocator_backend() == "cudaMallocAsync" 45 print("CUDA not available, skipping tests", file=sys.stderr) 53 snapshot = torch.cuda.memory_snapshot() 107 stats = torch.cuda.memory_stats(device) 112 torch.cuda.synchronize() 113 torch.cuda.synchronize("cuda") 114 torch.cuda.synchronize("cuda:0") 115 torch.cuda.synchronize(0) [all …]
|
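The test_cuda_multigpu.py lines above exercise the interchangeable device spellings that torch.cuda.synchronize accepts, alongside the allocator introspection calls. A short sketch of those call forms:

    import torch

    # torch.cuda.synchronize accepts no argument (current device), a device
    # string, an explicit "cuda:N" string, or a bare index -- all naming the
    # same device here.
    if torch.cuda.is_available():
        torch.cuda.synchronize()          # current device
        torch.cuda.synchronize("cuda")    # default CUDA device
        torch.cuda.synchronize("cuda:0")  # explicit index in the string
        torch.cuda.synchronize(0)         # bare integer index

        stats = torch.cuda.memory_stats(0)       # per-device allocator counters
        snapshot = torch.cuda.memory_snapshot()  # per-block allocator state
        print(stats.get("allocated_bytes.all.current", 0), len(snapshot))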
H A D | test_cuda.py | 1 # Owner(s): ["module: cuda"] 23 import torch.cuda 25 from torch.cuda._memory_viz import ( 96 torch.cuda.get_allocator_backend() == "cudaMallocAsync" 101 TEST_PYNVML = not torch.cuda._HAS_PYNVML 103 TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9 104 TEST_MEDIUM_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 6e9 105 TEST_BF16 = torch.cuda.is_bf16_supported() 110 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests") 128 torch.cuda.memory._set_allocator_settings( [all …]
|
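test_cuda.py gates its large-tensor and bf16 cases on reported device capabilities rather than skipping unconditionally. A sketch of that gating, using the same thresholds as the excerpt:

    import torch

    # Tests run only if the device reports enough total memory or bf16 support.
    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        test_large_tensor = props.total_memory >= 12e9   # ~12 GB required
        test_medium_tensor = props.total_memory >= 6e9   # ~6 GB required
        test_bf16 = torch.cuda.is_bf16_supported()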
H A D | test_numba_integration.py | 19 import numba.cuda 24 @unittest.skipIf(not TEST_CUDA, "No cuda") 26 """torch.Tensor exposes __cuda_array_interface__ for cuda tensors. 28 An object t is considered a cuda-tensor if: 31 A cuda-tensor provides a tensor description dict: 39 https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html 69 # Sparse CPU/CUDA tensors do not implement the interface 79 sparse_cuda_t = torch.sparse_coo_tensor(indices_t, cput).cuda() 86 # CUDA tensors have the attribute and v2 interface 87 cudat = tp(10).cuda() [all …]
|
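The test_numba_integration.py docstring above describes the __cuda_array_interface__ protocol: a dense CUDA tensor publishes a description dict that libraries such as numba.cuda can wrap without copying, while sparse and CPU tensors deliberately omit the attribute. A small sketch of reading that dict:

    import torch

    if torch.cuda.is_available():
        t = torch.arange(10, dtype=torch.float32, device="cuda")
        desc = t.__cuda_array_interface__
        print(desc["shape"])    # (10,)
        print(desc["typestr"])  # '<f4' for little-endian float32
        print(desc["data"])     # (device_pointer, read_only_flag)
        print(desc["version"])  # interface version (v2 in the excerpt's tests)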
H A D | test_jit_fuser.py | 84 @unittest.skipIf(not RUN_CUDA, "requires CUDA") 86 self._test_fused_abs(device="cuda") 88 @unittest.skipIf(not RUN_CUDA, "requires CUDA") 94 sin = torch.zeros(0, device="cuda") 95 cos = torch.zeros(0, device="cuda") 99 @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") 109 x = torch.randn(4, 4, dtype=torch.float, device='cuda') 110 y = torch.randn(4, 4, dtype=torch.float, device='cuda') 114 @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") 120 torch.randn(4, 4, dtype=torch.float, device='cuda'), [all …]
|
/aosp_15_r20/external/pytorch/test/jit/ |
H A D | test_cuda.py | 27 print("CUDA not available, skipping tests", file=sys.stderr) 32 # If GPU is available, then initialize the cuda context and check 35 torch.ones(1).cuda() # initialize cuda context 36 TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 5e9 48 A suite of tests for the CUDA API in TorchScript. 53 torch.cuda.empty_cache() 63 prev_current_device_index = torch.cuda.current_device() 64 torch.cuda.synchronize() 65 torch.cuda.synchronize("cuda") 66 torch.cuda.synchronize("cuda:0") [all …]
|
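jit/test_cuda.py scripts the same synchronize spellings as the eager tests, routing through the TorchScript CUDA operators. A minimal sketch, assuming the string forms script exactly as this suite's own tests use them:

    import torch

    @torch.jit.script
    def sync_all() -> None:
        torch.cuda.synchronize()
        torch.cuda.synchronize("cuda")
        torch.cuda.synchronize("cuda:0")

    if torch.cuda.is_available():
        torch.ones(1).cuda()  # initialize the CUDA context first, as above
        sync_all()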
/aosp_15_r20/external/pytorch/docs/cpp/source/notes/ |
H A D | tensor_cuda_stream.rst | 1 Tensor CUDA Stream API 4 A `CUDA Stream`_ is a linear sequence of execution that belongs to a specific CUDA device. 5 The PyTorch C++ API supports CUDA streams with the CUDAStream class and useful helper functions to … 6 …hem in `CUDAStream.h`_. This note provides more details on how to use PyTorch C++ CUDA Stream APIs. 8 .. _CUDA Stream: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams 9 .. _CUDAStream.h: https://pytorch.org/cppdocs/api/file_c10_cuda_CUDAStream.h.html#file-c10-cuda-cud… 12 Acquiring a CUDA stream 15 PyTorch's C++ API provides the following ways to acquire a CUDA stream: 17 1. Acquire a new stream from the CUDA stream pool; streams are preallocated from the pool and retur… 26 by setting device index (defaulting to the current CUDA stream's device index). [all …]
|
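tensor_cuda_stream.rst documents the C++ CUDAStream API (at::cuda::getStreamFromPool and friends). For orientation, the Python layer mirrors the same pool-plus-current-stream workflow; the sketch below is that Python analogue, not the C++ calls themselves:

    import torch

    if torch.cuda.is_available():
        s = torch.cuda.Stream()          # a stream on the current device
        with torch.cuda.stream(s):       # make it the current stream for this block
            y = torch.ones(1024, device="cuda") * 2
        # Order the default stream after s before consuming y there.
        torch.cuda.current_stream().wait_stream(s)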
/aosp_15_r20/external/pytorch/aten/src/ATen/test/ |
H A D | cuda_stream_test.cpp | 3 #include <ATen/cuda/CUDAContext.h> 4 #include <ATen/cuda/CUDAEvent.h> 7 #include <c10/cuda/CUDAGuard.h> 8 #include <c10/cuda/impl/CUDAGuardImpl.h> 35 if (!at::cuda::is_available()) return; in TEST() 40 at::cuda::CUDAStream copyStream = at::cuda::getStreamFromPool(); in TEST() 42 auto s = at::cuda::getStreamFromPool(); in TEST() 56 at::cuda::CUDAStream moveStream = at::cuda::getStreamFromPool(); in TEST() 58 auto s = at::cuda::getStreamFromPool(); in TEST() 74 if (!at::cuda::is_available()) return; in TEST() [all …]
|
/aosp_15_r20/external/pytorch/docs/source/notes/ |
H A D | cuda.rst | 2 :description: A guide to torch.cuda, a PyTorch module to run CUDA operations 3 :keywords: memory management, PYTORCH_CUDA_ALLOC_CONF, optimize PyTorch, CUDA 7 CUDA semantics 11 :mod:`torch.cuda` is used to set up and run CUDA operations. It keeps track of 12 the currently selected GPU, and all CUDA tensors you allocate will by default be 14 :any:`torch.cuda.device` context manager. 22 such as :meth:`~torch.Tensor.to` and :meth:`~torch.Tensor.cuda`. 28 cuda = torch.device('cuda') # Default CUDA device 29 cuda0 = torch.device('cuda:0') 30 cuda2 = torch.device('cuda:2') # GPU 2 (these are 0-indexed) [all …]
|
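cuda.rst's point above is that allocation follows the currently selected device unless a device is given explicitly, and torch.cuda.device switches the selection for a block. A sketch of those semantics, in the spirit of the doc's own example:

    import torch

    cuda = torch.device("cuda")    # default CUDA device
    cuda0 = torch.device("cuda:0")

    if torch.cuda.is_available():
        x = torch.tensor([1.0, 2.0], device=cuda0)  # explicit placement
        with torch.cuda.device(0):                  # select GPU 0
            y = torch.tensor([1.0, 2.0]).cuda()     # no arg: lands on cuda:0
        z = x.to(cuda)                              # .to() also moves across devices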
/aosp_15_r20/external/pytorch/cmake/Modules/ |
H A D | FindCUDAToolkit.cmake | 13 This script locates the NVIDIA CUDA toolkit and the associated libraries, but 14 does not require the ``CUDA`` language be enabled for a given project. This 15 module does not search for the NVIDIA CUDA Samples. 23 The CUDA Toolkit search behavior uses the following order: 25 1. If the ``CUDA`` language has been enabled we will use the directory 44 the desired path in the event that multiple CUDA Toolkits are installed. 46 5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is 51 candidate is found, this is used. The default CUDA Toolkit install locations 57 | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | 59 | Other Unix | ``/usr/local/cuda-X.Y`` | [all …]
|
/aosp_15_r20/prebuilts/cmake/linux-x86/share/cmake-3.22/Modules/ |
D | FindCUDAToolkit.cmake | 10 This script locates the NVIDIA CUDA toolkit and the associated libraries, but 11 does not require the ``CUDA`` language be enabled for a given project. This 12 module does not search for the NVIDIA CUDA Samples. 20 The CUDA Toolkit search behavior uses the following order: 22 1. If the ``CUDA`` language has been enabled we will use the directory 41 the desired path in the event that multiple CUDA Toolkits are installed. 43 5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is 48 candidate is found, this is used. The default CUDA Toolkit install locations 54 | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | 56 | Other Unix | ``/usr/local/cuda-X.Y`` | [all …]
|
/aosp_15_r20/external/tensorflow/tensorflow/compiler/xla/stream_executor/gpu/ |
H A D | gpu_driver.h | 16 // CUDA userspace driver library wrapper functionality. 50 // The order of parameters is generally kept symmetric with the underlying CUDA 54 // http://docs.nvidia.com/cuda/cuda-driver-api/ 62 …// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZ… 67 …// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eb… 70 // Creates a new CUDA stream associated with the given context via 73 …// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581… 77 // Destroys a CUDA stream associated with the given context. 80 …// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c… 83 // CUDA events can explicitly disable event TSC retrieval for some presumed [all …]
|
/aosp_15_r20/external/pytorch/test/distributed/_shard/sharded_tensor/ |
H A D | test_sharded_tensor.py | 81 placement="rank:0/cuda:0", 86 placement="rank:1/cuda:1", 91 placement="rank:2/cuda:2", 96 placement="rank:3/cuda:3", 149 @skip_but_pass_in_sandcastle_if(not TEST_CUDA, "CUDA GPU is needed") 159 local_device = torch.device("cuda:0") 177 "rank:0/cuda:0", 178 "rank:1/cuda:1", 179 "rank:2/cuda:2", 180 "rank:3/cuda:3", [all …]
|
/aosp_15_r20/external/pytorch/benchmarks/dynamo/ |
H A D | expected_ci_perf_inductor_torchbench.csv | 2 cuda,BERT_pytorch,16,2.6028,22.2879,41.0046,1.1965 3 cuda,Background_Matting,4,1.1296,112.7632,27.8916,1.0396 4 cuda,LearningToPaint,96,1.0951,11.3205,13.0241,0.9960 5 cuda,Super_SloMo,6,1.2160,65.3294,27.1633,1.2396 6 cuda,alexnet,128,1.1919,8.2399,6.5561,1.0008 7 cuda,attention_is_all_you_need_pytorch,256,1.4975,36.6682,43.0610,1.1824 8 cuda,dcgan,32,0.9276,2.2476,5.7151,1.0064 9 cuda,demucs,4,1.0313,51.7716,12.8195,0.9971 10 cuda,densenet121,4,1.1976,46.0111,64.0118,0.9945 11 cuda,dlrm,1024,1.3421,3.2177,4.9493,1.0009 [all …]
|
/aosp_15_r20/external/pytorch/test/distributed/_shard/sharding_spec/ |
H A D | test_sharding_spec.py | 45 @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "2 CUDA GPUs are needed") 48 DevicePlacementSpec("cuda:0") 50 DevicePlacementSpec(torch.device("cuda:0")) 51 DevicePlacementSpec("rank:0/cuda:0") 57 DevicePlacementSpec("cuda:foo") 61 DevicePlacementSpec("rank:0/cuda:foo") 65 @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "2 CUDA GPUs are needed") 69 ChunkShardingSpec(0, [torch.device("cuda:0"), torch.device("cuda:1")]) 70 ChunkShardingSpec(-1, ["cuda:0", "cuda:1"]) 71 ChunkShardingSpec(0, ["rank:0/cuda:0", "rank:0/cuda:1"]) [all …]
|
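test_sharding_spec.py validates the placement grammar used throughout these sharding tests: a placement is either a bare device ("cuda:0" or a torch.device) or a "rank:<r>/<device>" string, and ChunkShardingSpec shards one tensor dimension across a list of them. A minimal sketch:

    from torch.distributed._shard.sharding_spec import ChunkShardingSpec

    spec = ChunkShardingSpec(
        dim=0,  # shard along dimension 0 (negative dims are accepted too)
        placements=[
            "rank:0/cuda:0",
            "rank:1/cuda:1",
        ],
    )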
/aosp_15_r20/external/tensorflow/third_party/gpus/ |
H A D | cuda_configure.bzl | 1 """Repository rule for CUDA autoconfiguration. 5 * `TF_NEED_CUDA`: Whether to enable building with CUDA. 7 * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler. 14 * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is 15 `/usr/local/cuda,usr/`. 16 * `CUDA_TOOLKIT_PATH` (deprecated): The path to the CUDA toolkit. Default is 17 `/usr/local/cuda`. 18 * `TF_CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then 22 `/usr/local/cuda`. 23 * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is [all …]
|
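cuda_configure.bzl reads the environment variables listed above when the repository rule is evaluated. A hedged sketch of driving a build with them from Python; the bazel target and version value are illustrative, not taken from the excerpt:

    import os
    import subprocess

    env = dict(
        os.environ,
        TF_NEED_CUDA="1",
        TF_CUDA_PATHS="/usr/local/cuda,/usr",  # base paths for CUDA and cuDNN
        TF_CUDA_VERSION="12.2",                # blank means "use whatever is found"
        TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.6",
    )
    subprocess.run(
        ["bazel", "build", "//tensorflow/tools/pip_package:build_pip_package"],
        env=env, check=True,
    )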
/aosp_15_r20/external/pytorch/torch/csrc/cuda/ |
H A D | Module.cpp | 3 #include <ATen/cuda/CUDAConfig.h> 16 #include <ATen/cuda/CUDAContext.h> 17 #include <ATen/cuda/CUDAGeneratorImpl.h> 18 #include <ATen/cuda/CachingHostAllocator.h> 19 #include <ATen/cuda/Sleep.h> 20 #include <ATen/cuda/detail/CUDAHooks.h> 21 #include <ATen/cuda/jiterator.h> 22 #include <ATen/cuda/tunable/Tunable.h> 24 #include <c10/cuda/CUDAAllocatorConfig.h> 25 #include <c10/cuda/CUDACachingAllocator.h> [all …]
|
/aosp_15_r20/external/pytorch/torch/csrc/jit/runtime/ |
H A D | register_cuda_ops.cpp | 1 // This file registers special JIT operators used to implement the PyTorch CUDA 4 #include <torch/csrc/jit/cuda/cuda.h> 21 auto current_device_index = c10::cuda::current_device(); in _device_synchronize() 26 c10::cuda::set_device(device_index); in _device_synchronize() 28 c10::cuda::device_synchronize(); in _device_synchronize() 32 c10::cuda::set_device(current_device_index); in _device_synchronize() 38 "cuda::current_stream.device(Device? device) -> __torch__.torch.classes.cuda.Stream", 43 : c10::cuda::current_device(); in __anon4995d1b40202() 44 auto s = c10::cuda::getCurrentCUDAStream(device_index); in __anon4995d1b40202() 50 "cuda::current_stream.int(int? val) -> __torch__.torch.classes.cuda.Stream", [all …]
|
/aosp_15_r20/external/clang/test/Driver/ |
H A D | cuda-detect.cu | 5 // # Check that we properly detect CUDA installation. 7 // RUN: --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA 9 // RUN: --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s 11 // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 | FileCheck %s 14 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_21 \ 15 // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ 18 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ 19 // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ 22 // Verify that -nocudainc prevents adding include path to CUDA headers. 23 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ [all …]
|
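cuda-detect.cu is a clang driver lit test: it checks that --cuda-path overrides CUDA installation detection and that --cuda-gpu-arch selects the target SM architecture. A sketch of the equivalent manual invocation; the source file name is a placeholder:

    import subprocess

    # -### prints the driver's planned commands without executing them,
    # which is what the lit test's FileCheck patterns inspect.
    subprocess.run([
        "clang", "-###", "-v",
        "--target=i386-unknown-linux",
        "--cuda-gpu-arch=sm_35",
        "--cuda-path=/usr/local/cuda",
        "kernel.cu", "-c",
    ])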