
Searched full:cuda (Results 1 – 25 of 3722) sorted by relevance


/aosp_15_r20/external/pytorch/test/inductor/
s429861_repro.py
10 arg0_1: "f32[][]cuda:0",
11 arg1_1: "f32[50][1]cuda:0",
12 arg2_1: "f32[23][1]cuda:0",
13 arg3_1: "f32[38][1]cuda:0",
14 arg4_1: "f32[5][1]cuda:0",
15 arg5_1: "f32[100][1]cuda:0",
16 arg6_1: "f32[50][1]cuda:0",
17 arg7_1: "f32[77][1]cuda:0",
18 arg8_1: "f32[100][1]cuda:0",
19 arg9_1: "f32[100][1]cuda:0",
[all …]
test_torchinductor_codegen_dynamic_shapes.py
95 ("cpu", "cuda", "xpu"), is_skip=True
98 ("cpu", "cuda", "xpu"), is_skip=True
100 "test_to_device_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu"), is_skip=True),
136 "test_complex_fallback_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
137 "test_adaptive_avg_pool2d2_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
138 "test_adaptive_max_pool2d2_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
139 "test_fractional_max_pool2d2_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
140 "test_argmax_to_float_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
141 "test_avg_pool2d7_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
142 "test_avg_pool2d_backward4_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
[all …]
test_torchinductor_opinfo.py
185 inductor_skips["cuda"] = {
200 inductor_skips["cuda"]["bfloat16"] = {b8, f16, f32, f64, i32, i64}
204 inductor_skips["cuda"]["logcumsumexp"] = {f32}
205 inductor_skips["cuda"]["special.modified_bessel_i1"] = {f64}
212 }, # half_to_float is only valid for the CUDA implementation
239 inductor_expected_failures_single_sample["cuda"] = {
277 inductor_expected_failures_single_sample["cuda"].update(intentionally_not_handled)
282 inductor_gradient_expected_failures_single_sample["cuda"] = {}
289 inductor_should_fail_with_exception["cuda"] = {}
335 ("cross", "cuda", f16): {"reference_in_float": True},
[all …]
test_foreach.py
75 torch.rand(10, 10, device="cuda:0"),
76 torch.rand(20, 20, device="cuda:0"),
80 torch.rand(10, 10, device="cuda:0"),
81 torch.rand(20, 20, device="cuda:0"),
82 torch.rand(10, 10, device="cuda:0"),
83 torch.rand(20, 20, device="cuda:0"),
124 torch.rand(10, 10, device="cuda:0"),
125 torch.rand(20, 20, device="cuda:0"),
131 return op([a0, a1], torch.tensor(3.3, device="cuda:0"))
136 torch.rand(10, 10, device="cuda:0"),
[all …]
test_pattern_matcher.py
76 torch.randn(16, 16, device="cuda"),
77 torch.randn(16, 16, device="cuda"),
78 torch.randn(16, 16, device="cuda"),
79 torch.randn(16, 16, device="cuda"),
82 torch.randn(1, 4, device="cuda"),
83 torch.randn(4, 2, device="cuda"),
84 torch.randn(1, 5, device="cuda"),
85 torch.randn(5, 2, device="cuda"),
95 torch.randn(1, 4, device="cuda"),
96 torch.randn(4, 2, device="cuda"),
[all …]
test_combo_kernels.py
55 torch.rand(10, 10, device="cuda"),
56 torch.rand(20, 20, device="cuda"),
57 torch.rand(10, 10, device="cuda"),
77 torch.rand(10, 10, device="cuda"),
78 torch.rand(20, 20, device="cuda"),
79 torch.rand(10, 10, device="cuda"),
80 torch.rand(30, 8, device="cuda"),
100 torch.rand(10, 10, device="cuda"),
101 torch.rand(20, 20, device="cuda"),
102 torch.rand(10, 10, device="cuda"),
[all …]
test_cuda_repro.py
9 import torch.backends.cuda
63 device = "cuda"
94 torch.ones(shape, dtype=dtype, device="cuda") for (shape, dtype) in inps
105 ).cuda()
106 inp = torch.randn([2, 3, 16, 16]).to(memory_format=torch.channels_last).cuda()
130 rand_strided((12, 3, 512, 64), (64, 196608, 768, 1), torch.float32, "cuda"),
133 mod = make_fx(Repro().to(device="cuda"))(*inps)
138 IS_FBCODE, "RuntimeError: Triton Error [CUDA]: invalid device context"
144 x = torch.randn(4, device="cuda", requires_grad=True)
156 device=torch.device(type="cuda", index=0),
[all …]
/aosp_15_r20/external/pytorch/test/
test_cuda_multigpu.py
1 # Owner(s): ["module: cuda"]
17 import torch.cuda.comm as comm
41 torch.cuda.get_allocator_backend() == "cudaMallocAsync"
45 print("CUDA not available, skipping tests", file=sys.stderr)
53 snapshot = torch.cuda.memory_snapshot()
107 stats = torch.cuda.memory_stats(device)
112 torch.cuda.synchronize()
113 torch.cuda.synchronize("cuda")
114 torch.cuda.synchronize("cuda:0")
115 torch.cuda.synchronize(0)
[all …]
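
The test_cuda_multigpu.py hits above show torch.cuda.synchronize accepting several device spellings, alongside the allocator introspection calls from the same file. A minimal standalone sketch of those public torch.cuda APIs:

    import torch

    if torch.cuda.is_available():
        # With device 0 current, these four calls are equivalent:
        # no argument, a device string, an indexed device string,
        # and a bare integer index all name the same GPU.
        torch.cuda.synchronize()
        torch.cuda.synchronize("cuda")
        torch.cuda.synchronize("cuda:0")
        torch.cuda.synchronize(0)

        stats = torch.cuda.memory_stats(0)       # dict of allocator counters
        snapshot = torch.cuda.memory_snapshot()  # list of allocator segments
        print(stats.get("allocated_bytes.all.current", 0), len(snapshot))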
test_cuda.py
1 # Owner(s): ["module: cuda"]
23 import torch.cuda
25 from torch.cuda._memory_viz import (
96 torch.cuda.get_allocator_backend() == "cudaMallocAsync"
101 TEST_PYNVML = not torch.cuda._HAS_PYNVML
103 TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9
104 TEST_MEDIUM_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 6e9
105 TEST_BF16 = torch.cuda.is_bf16_supported()
110 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
128 torch.cuda.memory._set_allocator_settings(
[all …]
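
The module-level constants in the test_cuda.py excerpt gate tests on device capability. A sketch of that gating pattern follows; the torch.cuda queries are the real ones quoted above, while MySmokeTest and its method are hypothetical stand-ins:

    import unittest

    import torch

    TEST_CUDA = torch.cuda.is_available()
    TEST_LARGE_TENSOR = TEST_CUDA and torch.cuda.get_device_properties(0).total_memory >= 12e9
    TEST_BF16 = TEST_CUDA and torch.cuda.is_bf16_supported()

    @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
    class MySmokeTest(unittest.TestCase):  # hypothetical test class
        @unittest.skipIf(not TEST_LARGE_TENSOR, "needs >= 12 GB of GPU memory")
        def test_big_alloc(self):
            # A billion int8 elements is roughly 1 GB on device.
            x = torch.empty(int(1e9), dtype=torch.int8, device="cuda")
            self.assertEqual(x.numel(), int(1e9))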
test_numba_integration.py
19 import numba.cuda
24 @unittest.skipIf(not TEST_CUDA, "No cuda")
26 """torch.Tensor exposes __cuda_array_interface__ for cuda tensors.
28 An object t is considered a cuda-tensor if:
31 A cuda-tensor provides a tensor description dict:
39 https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html
69 # Sparse CPU/CUDA tensors do not implement the interface
79 sparse_cuda_t = torch.sparse_coo_tensor(indices_t, cput).cuda()
86 # CUDA tensors have the attribute and v2 interface
87 cudat = tp(10).cuda()
[all …]
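
The docstring quoted above describes the __cuda_array_interface__ protocol that lets numba (and other libraries) wrap CUDA tensors without copying. A short probe of the description dict, using only fields named in the linked spec:

    import torch

    if torch.cuda.is_available():
        t = torch.arange(10, dtype=torch.float32, device="cuda")
        desc = t.__cuda_array_interface__
        # Consumers such as numba.cuda.as_cuda_array read these fields
        # to view the tensor's device memory in place.
        print(desc["shape"], desc["typestr"], desc["data"], desc["version"])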
test_jit_fuser.py
84 @unittest.skipIf(not RUN_CUDA, "requires CUDA")
86 self._test_fused_abs(device="cuda")
88 @unittest.skipIf(not RUN_CUDA, "requires CUDA")
94 sin = torch.zeros(0, device="cuda")
95 cos = torch.zeros(0, device="cuda")
99 @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
109 x = torch.randn(4, 4, dtype=torch.float, device='cuda')
110 y = torch.randn(4, 4, dtype=torch.float, device='cuda')
114 @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
120 torch.randn(4, 4, dtype=torch.float, device='cuda'),
[all …]
/aosp_15_r20/external/pytorch/test/jit/
test_cuda.py
27 print("CUDA not available, skipping tests", file=sys.stderr)
32 # If GPU is available, then initialize the cuda context and check
35 torch.ones(1).cuda() # initialize cuda context
36 TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 5e9
48 A suite of tests for the CUDA API in TorchScript.
53 torch.cuda.empty_cache()
63 prev_current_device_index = torch.cuda.current_device()
64 torch.cuda.synchronize()
65 torch.cuda.synchronize("cuda")
66 torch.cuda.synchronize("cuda:0")
[all …]
/aosp_15_r20/external/pytorch/docs/cpp/source/notes/
tensor_cuda_stream.rst
1 Tensor CUDA Stream API
4 A `CUDA Stream`_ is a linear sequence of execution that belongs to a specific CUDA device.
5 The PyTorch C++ API supports CUDA streams with the CUDAStream class and useful helper functions to …
6 …hem in `CUDAStream.h`_. This note provides more details on how to use Pytorch C++ CUDA Stream APIs.
8 .. _CUDA Stream: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams
9 .. _CUDAStream.h: https://pytorch.org/cppdocs/api/file_c10_cuda_CUDAStream.h.html#file-c10-cuda-cud…
12 Acquiring CUDA stream
15 Pytorch's C++ API provides the following ways to acquire CUDA stream:
17 1. Acquire a new stream from the CUDA stream pool, streams are preallocated from the pool and retur…
26 by setting device index (defaulting to the current CUDA stream's device index).
[all …]
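
The note above documents the C++ stream API. As a loose Python-side analogue of the acquire-and-use pattern it describes (torch.cuda.Stream creates a fresh stream rather than drawing from the C++ pool), a sketch:

    import torch

    if torch.cuda.is_available():
        s = torch.cuda.Stream()  # new stream on the current device
        with torch.cuda.stream(s):
            y = torch.randn(4, device="cuda") * 2
        # Make the default stream wait on s before consuming y.
        torch.cuda.current_stream().wait_stream(s)
        print(y.sum().item())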
/aosp_15_r20/external/pytorch/aten/src/ATen/test/
cuda_stream_test.cpp
3 #include <ATen/cuda/CUDAContext.h>
4 #include <ATen/cuda/CUDAEvent.h>
7 #include <c10/cuda/CUDAGuard.h>
8 #include <c10/cuda/impl/CUDAGuardImpl.h>
35 if (!at::cuda::is_available()) return; in TEST()
40 at::cuda::CUDAStream copyStream = at::cuda::getStreamFromPool(); in TEST()
42 auto s = at::cuda::getStreamFromPool(); in TEST()
56 at::cuda::CUDAStream moveStream = at::cuda::getStreamFromPool(); in TEST()
58 auto s = at::cuda::getStreamFromPool(); in TEST()
74 if (!at::cuda::is_available()) return; in TEST()
[all …]
/aosp_15_r20/external/pytorch/docs/source/notes/
cuda.rst
2 :description: A guide to torch.cuda, a PyTorch module to run CUDA operations
3 :keywords: memory management, PYTORCH_CUDA_ALLOC_CONF, optimize PyTorch, CUDA
7 CUDA semantics
11 :mod:`torch.cuda` is used to set up and run CUDA operations. It keeps track of
12 the currently selected GPU, and all CUDA tensors you allocate will by default be
14 :any:`torch.cuda.device` context manager.
22 such as :meth:`~torch.Tensor.to` and :meth:`~torch.Tensor.cuda`.
28 cuda = torch.device('cuda') # Default CUDA device
29 cuda0 = torch.device('cuda:0')
30 cuda2 = torch.device('cuda:2') # GPU 2 (these are 0-indexed)
[all …]
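
The cuda.rst excerpt states that a bare "cuda" device resolves against the currently selected GPU, which the torch.cuda.device context manager can change. A runnable sketch of that rule, assuming at least two GPUs and device 0 current at the start:

    import torch

    if torch.cuda.device_count() > 1:
        x = torch.tensor([1.0], device="cuda")      # default device: cuda:0
        with torch.cuda.device(1):
            y = torch.tensor([2.0], device="cuda")  # now lands on cuda:1
        assert x.device.index == 0 and y.device.index == 1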
/aosp_15_r20/external/pytorch/cmake/Modules/
FindCUDAToolkit.cmake
13 This script locates the NVIDIA CUDA toolkit and the associated libraries, but
14 does not require the ``CUDA`` language be enabled for a given project. This
15 module does not search for the NVIDIA CUDA Samples.
23 The CUDA Toolkit search behavior uses the following order:
25 1. If the ``CUDA`` language has been enabled we will use the directory
44 the desired path in the event that multiple CUDA Toolkits are installed.
46 5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is
51 candidate is found, this is used. The default CUDA Toolkit install locations
57 | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` |
59 | Other Unix | ``/usr/local/cuda-X.Y`` |
[all …]
/aosp_15_r20/prebuilts/cmake/linux-x86/share/cmake-3.22/Modules/
FindCUDAToolkit.cmake
10 This script locates the NVIDIA CUDA toolkit and the associated libraries, but
11 does not require the ``CUDA`` language be enabled for a given project. This
12 module does not search for the NVIDIA CUDA Samples.
20 The CUDA Toolkit search behavior uses the following order:
22 1. If the ``CUDA`` language has been enabled we will use the directory
41 the desired path in the event that multiple CUDA Toolkits are installed.
43 5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is
48 candidate is found, this is used. The default CUDA Toolkit install locations
54 | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` |
56 | Other Unix | ``/usr/local/cuda-X.Y`` |
[all …]
/aosp_15_r20/external/tensorflow/tensorflow/compiler/xla/stream_executor/gpu/
gpu_driver.h
16 // CUDA userspace driver library wrapper functionality.
50 // The order of parameters is generally kept symmetric with the underlying CUDA
54 // http://docs.nvidia.com/cuda/cuda-driver-api/
62 …// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZ…
67 …// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eb…
70 // Creates a new CUDA stream associated with the given context via
73 …// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581…
77 // Destroys a CUDA stream associated with the given context.
80 …// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c…
83 // CUDA events can explicitly disable event TSC retrieval for some presumed
[all …]
/aosp_15_r20/external/pytorch/test/distributed/_shard/sharded_tensor/
test_sharded_tensor.py
81 placement="rank:0/cuda:0",
86 placement="rank:1/cuda:1",
91 placement="rank:2/cuda:2",
96 placement="rank:3/cuda:3",
149 @skip_but_pass_in_sandcastle_if(not TEST_CUDA, "CUDA GPU is needed")
159 local_device = torch.device("cuda:0")
177 "rank:0/cuda:0",
178 "rank:1/cuda:1",
179 "rank:2/cuda:2",
180 "rank:3/cuda:3",
[all …]
/aosp_15_r20/external/pytorch/benchmarks/dynamo/
expected_ci_perf_inductor_torchbench.csv
2 cuda,BERT_pytorch,16,2.6028,22.2879,41.0046,1.1965
3 cuda,Background_Matting,4,1.1296,112.7632,27.8916,1.0396
4 cuda,LearningToPaint,96,1.0951,11.3205,13.0241,0.9960
5 cuda,Super_SloMo,6,1.2160,65.3294,27.1633,1.2396
6 cuda,alexnet,128,1.1919,8.2399,6.5561,1.0008
7 cuda,attention_is_all_you_need_pytorch,256,1.4975,36.6682,43.0610,1.1824
8 cuda,dcgan,32,0.9276,2.2476,5.7151,1.0064
9 cuda,demucs,4,1.0313,51.7716,12.8195,0.9971
10 cuda,densenet121,4,1.1976,46.0111,64.0118,0.9945
11 cuda,dlrm,1024,1.3421,3.2177,4.9493,1.0009
[all …]
/aosp_15_r20/external/pytorch/test/distributed/_shard/sharding_spec/
test_sharding_spec.py
45 @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "2 CUDA GPUs are needed")
48 DevicePlacementSpec("cuda:0")
50 DevicePlacementSpec(torch.device("cuda:0"))
51 DevicePlacementSpec("rank:0/cuda:0")
57 DevicePlacementSpec("cuda:foo")
61 DevicePlacementSpec("rank:0/cuda:foo")
65 @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "2 CUDA GPUs are needed")
69 ChunkShardingSpec(0, [torch.device("cuda:0"), torch.device("cuda:1")])
70 ChunkShardingSpec(-1, ["cuda:0", "cuda:1"])
71 ChunkShardingSpec(0, ["rank:0/cuda:0", "rank:0/cuda:1"])
[all …]
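
The "rank:N/cuda:M" placement strings exercised in both sharding test files pair a process-group rank with the CUDA device that rank owns. A minimal construction sketch (building the spec object itself should not require an initialized process group):

    from torch.distributed._shard.sharding_spec import ChunkShardingSpec

    # Shard dimension 0 across two GPUs; rank N's shard lives on cuda:M.
    spec = ChunkShardingSpec(
        dim=0,
        placements=[
            "rank:0/cuda:0",
            "rank:1/cuda:1",
        ],
    )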
/aosp_15_r20/external/tensorflow/third_party/gpus/
cuda_configure.bzl
1 """Repository rule for CUDA autoconfiguration.
5 * `TF_NEED_CUDA`: Whether to enable building with CUDA.
7 * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler.
14 * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is
15 `/usr/local/cuda,usr/`.
16 * `CUDA_TOOLKIT_PATH` (deprecated): The path to the CUDA toolkit. Default is
17 `/usr/local/cuda`.
18 * `TF_CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then
22 `/usr/local/cuda`.
23 * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is
[all …]
/aosp_15_r20/external/pytorch/torch/csrc/cuda/
Module.cpp
3 #include <ATen/cuda/CUDAConfig.h>
16 #include <ATen/cuda/CUDAContext.h>
17 #include <ATen/cuda/CUDAGeneratorImpl.h>
18 #include <ATen/cuda/CachingHostAllocator.h>
19 #include <ATen/cuda/Sleep.h>
20 #include <ATen/cuda/detail/CUDAHooks.h>
21 #include <ATen/cuda/jiterator.h>
22 #include <ATen/cuda/tunable/Tunable.h>
24 #include <c10/cuda/CUDAAllocatorConfig.h>
25 #include <c10/cuda/CUDACachingAllocator.h>
[all …]
/aosp_15_r20/external/pytorch/torch/csrc/jit/runtime/
register_cuda_ops.cpp
1 // This file registers special JIT operators used to implement the PyTorch CUDA
4 #include <torch/csrc/jit/cuda/cuda.h>
21 auto current_device_index = c10::cuda::current_device(); in _device_synchronize()
26 c10::cuda::set_device(device_index); in _device_synchronize()
28 c10::cuda::device_synchronize(); in _device_synchronize()
32 c10::cuda::set_device(current_device_index); in _device_synchronize()
38 "cuda::current_stream.device(Device? device) -> __torch__.torch.classes.cuda.Stream",
43 : c10::cuda::current_device(); in __anon4995d1b40202()
44 auto s = c10::cuda::getCurrentCUDAStream(device_index); in __anon4995d1b40202()
50 "cuda::current_stream.int(int? val) -> __torch__.torch.classes.cuda.Stream",
[all …]
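
register_cuda_ops.cpp supplies the operators behind the torch.cuda calls that TorchScript understands. Assuming a build with these ops registered, a scripted function can synchronize devices without leaving the JIT; a sketch:

    import torch

    @torch.jit.script
    def sync_current_device() -> None:
        # Should lower to the registered cuda::synchronize operator.
        torch.cuda.synchronize()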
/aosp_15_r20/external/clang/test/Driver/
cuda-detect.cu
5 // # Check that we properly detect CUDA installation.
7 // RUN: --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA
9 // RUN: --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s
11 // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 | FileCheck %s
14 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_21 \
15 // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
18 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
19 // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
22 // Verify that -nocudainc prevents adding include path to CUDA headers.
23 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
[all …]
