Lines Matching full:cuda
1 # Owner(s): ["module: cuda"]
23 import torch.cuda
25 from torch.cuda._memory_viz import (
96 torch.cuda.get_allocator_backend() == "cudaMallocAsync"
101 TEST_PYNVML = not torch.cuda._HAS_PYNVML
103 TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9
104 TEST_MEDIUM_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 6e9
105 TEST_BF16 = torch.cuda.is_bf16_supported()
110 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
128 torch.cuda.memory._set_allocator_settings(
155 torch.cuda.memory._set_allocator_settings(
170 cudart = torch.cuda.cudart()
180 torch.cuda.empty_cache()
185 prev = torch.cuda.memory_allocated()
186 mem = torch.cuda.caching_allocator_alloc(size)
187 self.assertGreater(torch.cuda.memory_allocated(), prev)
190 torch.cuda.caching_allocator_delete(mem)
191 self.assertEqual(torch.cuda.memory_allocated(), prev)
195 torch.cuda.check_error(0)
198 torch.cuda.CudaError, "out of memory|hipErrorOutOfMemory"
200 torch.cuda.check_error(2)
204 current_device = torch.cuda.current_device()
205 current_device_name = torch.cuda.get_device_name(current_device)
206 device_name_None = torch.cuda.get_device_name(None)
210 device_name_no_argument = torch.cuda.get_device_name()
215 current_device = torch.cuda.current_device()
216 current_device_capability = torch.cuda.get_device_capability(current_device)
217 device_capability_None = torch.cuda.get_device_capability(None)
221 device_capability_no_argument = torch.cuda.get_device_capability()
225 tensor = torch.zeros(1024, device="cuda")
233 torch.empty(1024 * 1024 * 1024 * 800000000, dtype=torch.int8, device="cuda")
239 1024 * 1024 * 1024 * 8000000000, dtype=torch.int8, device="cuda"
251 torch.cuda.empty_cache()
252 total_memory = torch.cuda.get_device_properties(0).total_memory
259 a = torch.empty(size, dtype=torch.int8, device="cuda")
261 b = torch.empty(size, dtype=torch.int8, device="cuda")
263 b = torch.empty(size, dtype=torch.int8, device="cuda")
266 torch.cuda.empty_cache()
267 torch.cuda.reset_peak_memory_stats()
273 torch.cuda.set_per_process_memory_fraction(1)
275 torch.cuda.set_per_process_memory_fraction(-0.1)
277 torch.cuda.set_per_process_memory_fraction(2.0)
279 tensor = torch.zeros(1024, device="cuda")
280 torch.cuda.empty_cache()
281 total_memory = torch.cuda.get_device_properties(0).total_memory
282 torch.cuda.set_per_process_memory_fraction(0.5, 0)
285 application = int(total_memory * 0.499) - torch.cuda.max_memory_reserved()
286 tmp_tensor = torch.empty(application, dtype=torch.int8, device="cuda")
288 torch.cuda.empty_cache()
296 torch.empty(application, dtype=torch.int8, device="cuda")
304 uuid = torch.cuda.get_device_properties(0).uuid
310 event = torch.cuda.Event()
317 x = torch.ones(10000000, dtype=torch.uint8).cuda()
322 y = torch.ones(10000000, dtype=torch.uint8).cuda()
332 y = torch.ones(10000000 - 1, dtype=torch.uint8).cuda()
336 a = torch.ones(1, device="cuda")
338 c = torch.empty(1, device="cuda", dtype=torch.long)
339 torch.cuda._sleep(int(100 * get_cycles_per_ms()))
346 stream = torch.cuda.current_stream()
349 torch.cuda.synchronize()
352 torch.cuda._sleep(int(100 * get_cycles_per_ms()))
359 for dst, try_non_blocking in product(("cuda", "cpu"), (True, False)):
363 device="cuda" if dst == "cpu" else "cpu",
364 pin_memory=True if dst == "cuda" else False,
369 src = torch.randn(1000000, device="cuda")
370 torch.cuda.synchronize()
371 torch.cuda._sleep(int(100 * get_cycles_per_ms()))
373 self.assertEqual(torch.cuda.current_stream().query(), True)
378 x = torch.randn(5, 5).cuda()
379 y = torch.IntTensor(2, 5).fill_(0).cuda()
388 self.assertTrue(isinstance(q_copy[0], torch.cuda.FloatTensor))
389 self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor))
390 self.assertTrue(isinstance(q_copy[2], torch.cuda.FloatTensor))
394 self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10))
404 a = torch.randn(7, 7, device="cuda", requires_grad=False)
407 if torch.cuda.get_device_capability() == (9, 0):
412 start = torch.cuda.memory_stats()["active_bytes.all.allocated"]
415 finish = torch.cuda.memory_stats()["active_bytes.all.allocated"]
437 self.assertTrue(torch.backends.cuda.matmul.allow_tf32)
440 orig = torch.backends.cuda.matmul.allow_tf32
442 torch.backends.cuda.matmul.allow_tf32 = not orig
444 torch.backends.cuda.matmul.allow_tf32 = orig
454 self.assertFalse(torch.backends.cuda.matmul.allow_tf32)
457 self.assertTrue(torch.backends.cuda.matmul.allow_tf32)
461 self.assertTrue(torch.backends.cuda.matmul.allow_tf32)
464 self.assertFalse(torch.backends.cuda.matmul.allow_tf32)
468 orig = torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction
472 torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = not orig
476 torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = orig
479 orig = torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction
483 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = not orig
487 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = orig
502 self.assertIsInstance(x.cuda().double(), torch.cuda.DoubleTensor)
503 self.assertIsInstance(x.cuda().float(), torch.cuda.FloatTensor)
504 self.assertIsInstance(x.cuda().float().cpu(), torch.FloatTensor)
505 self.assertIsInstance(x.cuda().float().cpu().int(), torch.IntTensor)
509 self.assertIsInstance(y.cuda().double(), torch.cuda.DoubleStorage)
510 self.assertIsInstance(y.cuda().float(), torch.cuda.FloatStorage)
511 self.assertIsInstance(y.cuda().float().cpu(), torch.FloatStorage)
512 self.assertIsInstance(y.cuda().float().cpu().int(), torch.IntStorage)
516 x = torch.empty(2**30, device="cuda")
537 t = torch.tensor([[False, True], [True, True]], device="cuda")
539 torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]], device="cuda")),
540 torch.tensor([[False, False], [True, True]], device="cuda"),
545 x = torch.zeros(4, 4).float().cuda()
547 self.assertEqual(torch.cuda.initial_seed(), 2)
552 self.assertEqual(torch.cuda.initial_seed(), 2)
556 x = torch.zeros(4, 4).float().cuda()
557 torch.cuda.manual_seed(2)
558 self.assertEqual(torch.cuda.initial_seed(), 2)
561 torch.cuda.manual_seed(2)
566 self.assertEqual(torch.cuda.initial_seed(), 2)
585 from torch.cuda._utils import _get_device_index
590 with self.assertRaisesRegex(ValueError, "Expected a cuda device"):
595 x = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()]
607 default_stream = torch.cuda.current_stream()
608 user_stream = torch.cuda.Stream()
609 self.assertEqual(torch.cuda.current_stream(), default_stream)
613 with torch.cuda.stream(user_stream):
614 self.assertEqual(torch.cuda.current_stream(), user_stream)
617 tensor2 = tensor1.cuda(non_blocking=True) + 1
622 s = torch.cuda.current_stream()
623 self.assertTrue("torch.cuda.Stream" in s.__repr__())
624 e = torch.cuda.Event()
625 self.assertTrue("torch.cuda.Event" in e.__repr__())
627 self.assertTrue("torch.cuda.Event" in e.__repr__())
630 stream = torch.cuda.current_stream()
631 event = torch.cuda.Event(enable_timing=True)
633 start_event = torch.cuda.Event(enable_timing=True)
635 torch.cuda._sleep(int(50 * get_cycles_per_ms()))
643 stream = torch.Stream("cuda")
644 self.assertEqual(stream.device_index, torch.cuda.current_device())
645 cuda_stream = torch.cuda.Stream(
651 self.assertNotEqual(stream.stream_id, torch.cuda.current_stream().stream_id)
653 event1 = torch.Event("cuda", enable_timing=True)
654 event2 = torch.Event("cuda", enable_timing=True)
658 with torch.cuda.stream(cuda_stream):
659 a_cuda = a.to("cuda", non_blocking=True)
660 b_cuda = b.to("cuda", non_blocking=True)
661 self.assertEqual(stream.stream_id, torch.cuda.current_stream().stream_id)
677 result = torch.cuda.FloatTensor(t.size())
678 stream = torch.cuda.Stream()
683 with torch.cuda.stream(stream):
684 tmp = t.cuda(non_blocking=True)
686 torch.cuda.current_stream().wait_stream(stream)
687 tmp.record_stream(torch.cuda.current_stream())
688 torch.cuda._sleep(int(50 * cycles_per_ms)) # delay the copy
692 with torch.cuda.stream(stream):
693 tmp2 = torch.cuda.FloatTensor(t.size())
704 torch.cuda.current_stream().synchronize()
705 with torch.cuda.stream(stream):
706 tmp3 = torch.cuda.FloatTensor(t.size())
715 stream_alloc = torch.cuda.Stream()
716 with torch.cuda.stream(stream_alloc):
717 base = torch.cuda.FloatTensor([10, 10])
723 stream_record = torch.cuda.Stream()
724 with torch.cuda.stream(stream_record):
725 torch.cuda._sleep(int(50 * get_cycles_per_ms()))
736 with torch.cuda.stream(stream_alloc):
737 try_realloc = torch.cuda.FloatTensor([10, 10])
757 gpu_tensor = torch.cuda.FloatTensor([0])
758 torch.cuda._sleep(int(1000 * cycles_per_ms)) # delay the copy by 1s
768 stream = torch.cuda.Stream()
770 with torch.cuda.stream(stream):
771 y = torch.zeros(40 * 1024 * 1024, device="cuda")
774 x = torch.empty(40 * 1024 * 1024, device="cuda")
775 with torch.cuda.stream(stream):
783 torch.cuda.empty_cache()
787 x = torch.ones(512, 8, dtype=torch.float32, device="cuda")
791 x = torch.zeros(10, device="cuda", dtype=torch.float16)
794 x = torch.ones(65504, device="cuda", dtype=torch.float16)
798 x = torch.ones(65536, device="cuda", dtype=torch.float16)
802 x = a.to(device="cuda", dtype=torch.float16)
806 x = a.to(device="cuda", dtype=torch.float16)
810 x = torch.ones(65536, device="cuda", dtype=torch.float16)
813 x = torch.ones(65536, device="cuda", dtype=torch.float16)
818 x = torch.ones(240000, device="cuda", dtype=torch.float32)
823 x = torch.ones(240000, device="cuda", dtype=dtype) * (0 + 1j)
828 freqs = torch.cuda.FloatTensor(
883 torch.cuda.manual_seed(11042)
887 p = torch.zeros(3421, 2, device="cuda", dtype=torch.float)
889 torch.cuda.manual_seed(5214)
894 torch.cuda.manual_seed(33)
895 probs = torch.randn(1000000, device="cuda").clamp(min=0) * 3e-5
913 torch.multinomial(torch.tensor({probs}).to('cuda'), 2, replacement=True)
914 torch.cuda.synchronize()
930 "device-side assert triggered", # CUDA
931 "Assertion", # CUDA
964 x = torch.arange(10, device="cuda")
982 test_method(1), "x[torch.tensor([1)]=tensor([1], device='cuda:0')"
991 src = torch.empty(15000000, 45, device="cuda", dtype=torch.long).random_(
994 idx = torch.randperm(src.shape[0], device="cuda")
1020 assert abs(run(torch.device("cuda")) - run(torch.device("cpu"))) < 10_000
1040 run(func, torch.device("cuda"), dtype)
1048 x = torch.cuda.ByteTensor([0])
1049 y = torch.cuda.ByteTensor([255])
1050 expected = torch.cuda.LongTensor([0])[0]
1060 torch.cuda.nvtx.range_push("foo")
1061 torch.cuda.nvtx.mark("bar")
1062 torch.cuda.nvtx.range_pop()
1063 range_handle = torch.cuda.nvtx.range_start("range_start")
1064 torch.cuda.nvtx.range_end(range_handle)
1067 # ensure CUDA code coverage
1069 w = torch.randn(input_size, dtype=torch.double, device="cuda")
1072 t = torch.randint(50, input_size, dtype=torch.int8, device="cuda")
1078 t = torch.randint(50000, input_size, dtype=torch.int64, device="cuda")
1082 t = torch.zeros([10], dtype=torch.int32, device="cuda")
1090 a = torch.arange(25).cuda().float()
1096 a = torch.ones(65536).cuda().half()
1104 x = torch.randn(3, 1, device="cuda")
1105 y = torch.randn(2, 1, device="cuda")
1113 x = torch.randn(1, 1, 1, 2**30 + 1, dtype=torch.float16, device="cuda")
1116 torch.cuda.synchronize()
1124 x = torch.randn(1, 1, 1, 2**31, dtype=torch.float16, device="cuda")
1130 x = torch.randn(1, 1, 1, 2**31 - 1, dtype=torch.float16, device="cuda")
1133 torch.cuda.synchronize()
1142 ctx.stream = torch.cuda.current_stream()
1147 self.assertEqual(torch.cuda.current_stream(), ctx.stream)
1149 torch.cuda._sleep(1000 * 5000)
1156 default_stream = torch.cuda.current_stream()
1157 stream = torch.cuda.Stream()
1162 … # See "Stream semantics of backward passes" on https://pytorch.org/docs/stable/notes/cuda.html
1163 x = torch.randn(5, 5, device="cuda", requires_grad=True)
1164 with torch.cuda.stream(stream):
1171 self.assertEqual(torch.cuda.current_stream(), default_stream)
1175 bwd_ambient_stream = torch.cuda.Stream()
1176 x = torch.randn(5, 5, device="cuda", requires_grad=True)
1177 with torch.cuda.stream(stream):
1180 with torch.cuda.stream(bwd_ambient_stream):
1187 self.assertEqual(torch.cuda.current_stream(), bwd_ambient_stream)
1197 self.event = torch.cuda.Event()
1198 self.stream0 = torch.cuda.Stream()
1199 self.stream1 = torch.cuda.Stream()
1204 self.stream0.wait_stream(torch.cuda.current_stream())
1205 self.stream1.wait_stream(torch.cuda.current_stream())
1206 with torch.cuda.stream(self.stream0):
1210 self.event.record(stream=torch.cuda.current_stream())
1212 with torch.cuda.stream(self.stream1):
1217 stream = torch.cuda.Stream()
1223 with torch.cuda.stream(stream):
1224 x = torch.randn(5, 5, device="cuda", requires_grad=True)
1225 model = StreamModel().cuda()
1228 torch.cuda.current_stream(),
1240 … # See "Stream semantics of backward passes" on https://pytorch.org/docs/stable/notes/cuda.html
1241 torch.cuda.current_stream().wait_stream(stream)
1251 … # torch.cuda._sleep such that if the race condition exists, the test will almost certainly fail,
1254 fwd_bwd_op_stream = torch.cuda.Stream()
1255 bwd_ambient_stream = torch.cuda.Stream()
1261 a = torch.full((size,), 2.0, device="cuda", requires_grad=True)
1262 b = torch.full((size,), 3.0, device="cuda", requires_grad=True)
1268 torch.cuda.synchronize()
1270 with torch.cuda.stream(fwd_bwd_op_stream):
1273 with torch.cuda.stream(bwd_ambient_stream):
1274 torch.cuda.synchronize()
1276 torch.cuda._sleep(int(50 * get_cycles_per_ms()))
1278 grad = torch.full((size,), float(trial + 1), device="cuda")
1288 torch.cuda.synchronize()
1300 a = torch.full((size,), 1, device="cuda", dtype=torch.float, requires_grad=True)
1301 b = torch.full((size,), 1, device="cuda", dtype=torch.float, requires_grad=True)
1303 s0 = torch.cuda.Stream()
1304 s1 = torch.cuda.Stream()
1305 s2 = torch.cuda.Stream()
1310 s0.wait_stream(torch.cuda.current_stream())
1311 with torch.cuda.stream(s0):
1314 s1.wait_stream(torch.cuda.current_stream())
1315 with torch.cuda.stream(s1):
1332 with torch.cuda.stream(s2):
1347 torch._assert_async(torch.tensor([], device="cuda"))
1352 torch._assert_async(torch.tensor([0, 0], device="cuda"))
1354 torch._assert_async(torch.tensor(1, device="cuda"))
1355 torch._assert_async(torch.tensor(0.1, device="cuda"))
1356 torch._assert_async(torch.tensor(-0.1, device="cuda"))
1357 torch._assert_async(torch.tensor(True, device="cuda"))
1358 torch._assert_async(torch.tensor(0 + 0.1j, device="cuda"))
1361 "torch._assert_async(torch.tensor(0, device='cuda'))",
1362 "torch._assert_async(torch.tensor(0.0, device='cuda'))",
1363 "torch._assert_async(torch.tensor(False, device='cuda'))",
1364 "torch._assert_async(torch.tensor(0 + 0j, device='cuda'))",
1379 torch.cuda.synchronize()
1395 weight = torch.ones((size, size), device="cuda")
1400 my_stream = torch.cuda.Stream()
1404 torch.cuda.synchronize()
1407 with torch.cuda.stream(my_stream):
1419 torch.cuda.synchronize()
1423 results[t] = torch.ones((size, size), device="cuda")
1445 weight = torch.ones((1, 1, 2, 2), device="cuda")
1457 my_stream = torch.cuda.Stream()
1461 torch.cuda.synchronize()
1464 with torch.cuda.stream(my_stream):
1478 torch.cuda.synchronize()
1482 results[t] = torch.ones((1, 1, 2048, 2048), device="cuda")
1507 a = torch.arange(size, device="cuda")
1509 values = torch.ones(size * size, device="cuda")
1517 my_stream = torch.cuda.Stream()
1521 torch.cuda.synchronize()
1524 with torch.cuda.stream(my_stream):
1536 torch.cuda.synchronize()
1540 results[t] = torch.ones((size, size), device="cuda")
1558 x = torch.zeros(2**32, device="cuda", dtype=torch.int8)
1566 self.assertRaises(TypeError, lambda: torch.empty(1, device="cuda").numpy())
1569 self.assertFalse(torch.cuda.is_current_stream_capturing())
1572 s = torch.cuda.Stream()
1573 with torch.cuda.stream(s):
1574 g = torch.cuda.CUDAGraph()
1575 self.assertFalse(torch.cuda.is_current_stream_capturing())
1577 self.assertTrue(torch.cuda.is_current_stream_capturing())
1581 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1584 s = torch.cuda.Stream()
1586 with torch.cuda.stream(s):
1587 a = torch.full((1000,), 1, device="cuda")
1588 g = torch.cuda.CUDAGraph()
1589 torch.cuda.empty_cache()
1595 torch.cuda.current_stream().wait_stream(s)
1602 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1607 """Initializes generator states and registers them with a CUDA graph if provided."""
1608 # Ensure the CUDA generator is initialized
1609 torch.rand(1, device="cuda")
1631 random_values.append(torch.rand(5, device="cuda", generator=generator))
1636 [torch.rand(5, device="cuda", generator=generator) for _ in range(2)]
1648 # Set up and test a new CUDA generator
1649 generator = torch.Generator(device="cuda")
1652 # Set up and test the default CUDA generator with a CUDA Graph
1653 g = torch.cuda.CUDAGraph()
1654 s = torch.cuda.Stream()
1655 default_generator = torch.cuda.default_generators[0]
1659 # Perform random number generation within a CUDA graph
1660 with torch.cuda.stream(s):
1668 torch.cuda.current_stream().wait_stream(s)
1681 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1684 # Function to clear CUDA cache and collect garbage
1687 torch.cuda.empty_cache()
1689 …e graph task which includes capturing and executing a random number generation within a CUDA graph.
1691 s = torch.cuda.Stream()
1692 with torch.cuda.stream(s):
1694 torch.rand(1, device="cuda")
1696 torch.cuda.current_stream().wait_stream(s)
1700 stats = torch.cuda.memory_stats()
1709 # Allocate CUDA graphs
1710 graphs = [torch.cuda.CUDAGraph() for _ in range(num_graphs)]
1713 default_generator = torch.cuda.default_generators[0]
1740 # Cleanup graphs and clear CUDA cache
1758 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1761 s = torch.cuda.Stream()
1763 with torch.cuda.stream(s):
1764 a = torch.full((1000,), 1, device="cuda")
1765 g = torch.cuda.CUDAGraph()
1766 torch.cuda.empty_cache()
1772 torch.cuda.current_stream().wait_stream(s)
1780 with torch.cuda.stream(s):
1786 torch.cuda.current_stream().wait_stream(s)
1795 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1798 torch.cuda.empty_cache()
1799 x = torch.randn(10240000, device="cuda")
1801 g = torch.cuda.CUDAGraph()
1803 s0 = torch.cuda.Stream()
1804 s1 = torch.cuda.Stream()
1805 s0.wait_stream(torch.cuda.current_stream())
1806 with torch.cuda.stream(s0):
1809 with torch.cuda.stream(s1):
1815 torch.cuda.synchronize()
1820 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1824 # puts the cuda context in a bad state
1828 g = torch.cuda.CUDAGraph()
1832 if "CUDA graphs must be captured on a non-default stream." in str(e):
1859 (not TEST_CUDA) or TEST_WITH_ROCM or int(torch.version.cuda.split(".")[0]) < 11,
1860 "CUDA >= 11.0 required for graphs",
1864 g = torch.cuda.CUDAGraph()
1865 s = torch.cuda.Stream()
1866 with torch.cuda.stream(s):
1870 any("The CUDA Graph is empty" in str(w.message) for w in caught)
1874 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1884 with torch.cuda.graph(torch.cuda.CUDAGraph()):
1885 torch.zeros(2**40, device="cuda")
1888 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1893 a = torch.rand((x, y), device="cuda")
1894 b = torch.rand((y, z), device="cuda")
1899 free_bytes_before, total_bytes = torch.cuda.mem_get_info()
1903 torch_graph = torch.cuda.CUDAGraph()
1904 with torch.cuda.graph(torch_graph):
1908 free_bytes_after, _ = torch.cuda.mem_get_info()
1914 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1924 a = torch.randn((size,), device="cuda", dtype=torch.float)
1927 torch.cuda.manual_seed(5)
1933 stream = torch.cuda.Stream()
1934 stream.wait_stream(torch.cuda.current_stream())
1935 with torch.cuda.stream(stream):
1936 torch.cuda.manual_seed(5)
1938 g = torch.cuda.CUDAGraph()
1939 torch.cuda.empty_cache()
1945 torch.cuda.current_stream().wait_stream(stream)
1968 torch.cuda.manual_seed(seed)
1981 torch.cuda.manual_seed(seed)
1996 torch.cuda.synchronize()
2002 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2006 input = torch.rand((size,), device="cuda", dtype=torch.float)
2007 alloc = torch.empty((size,), device="cuda", dtype=torch.float)
2012 # multinomial uses some uncapturable CUDA calls.
2021 ("rand", (size,), {"device": "cuda", "dtype": torch.float}),
2022 ("randint", (0, 3, (size,)), {"device": "cuda", "dtype": torch.float}),
2023 ("randn", (size,), {"device": "cuda", "dtype": torch.float}),
2039 torch.cuda.manual_seed(5)
2054 stream = torch.cuda.Stream()
2055 stream.wait_stream(torch.cuda.current_stream())
2056 with torch.cuda.stream(stream):
2057 torch.cuda.manual_seed(5)
2059 g = torch.cuda.CUDAGraph()
2060 torch.cuda.empty_cache()
2073 torch.cuda.current_stream().wait_stream(stream)
2080 # If we try it with cudaMallocAsync, CUDA won't even consider
2091 torch.cuda.manual_seed(seed)
2103 torch.cuda.manual_seed(seed)
2125 torch.cuda.synchronize()
2135 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2138 torch.cuda.empty_cache()
2148 s = torch.cuda.Stream()
2151 g0 = torch.cuda.CUDAGraph()
2152 g1 = torch.cuda.CUDAGraph()
2154 a = torch.ones((size,), device="cuda")
2156 s.wait_stream(torch.cuda.current_stream())
2157 with torch.cuda.stream(s):
2159 (torch.cuda.graph_pool_handle(),)
2174 torch.cuda.current_stream().wait_stream(s)
2195 - torch.cuda.memory_stats()["reserved_bytes.all.current"],
2199 reserved_no_sharing = torch.cuda.memory_stats()[
2205 torch.cuda.synchronize()
2206 torch.cuda.empty_cache()
2212 torch.version.cuda
2213 and int(torch.version.cuda.split(".")[0]) == 11
2214 and int(torch.version.cuda.split(".")[1]) < 4
2216 "Graph bindings disallow concurrent replay for CUDA < 11.4, see "
2220 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2223 torch.cuda.empty_cache()
2232 s = torch.cuda.Stream()
2235 g0 = torch.cuda.CUDAGraph()
2236 g1 = torch.cuda.CUDAGraph()
2238 s0 = torch.cuda.Stream()
2239 s1 = torch.cuda.Stream()
2241 a = torch.ones((size,), device="cuda")
2243 s.wait_stream(torch.cuda.current_stream())
2244 with torch.cuda.stream(s):
2246 (torch.cuda.graph_pool_handle(),)
2266 torch.cuda.synchronize()
2267 with torch.cuda.stream(s0):
2268 torch.cuda._sleep(1000000)
2271 with torch.cuda.stream(s1):
2273 torch.cuda.current_stream().wait_stream(s0)
2274 torch.cuda.current_stream().wait_stream(s1)
2291 torch.cuda.synchronize()
2292 torch.cuda.empty_cache()
2295 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2298 torch.cuda.empty_cache()
2302 s = torch.cuda.Stream()
2305 a = torch.ones((size,), device="cuda")
2307 g0 = torch.cuda.CUDAGraph()
2308 g1 = torch.cuda.CUDAGraph()
2309 g2 = torch.cuda.CUDAGraph()
2311 s.wait_stream(torch.cuda.current_stream())
2312 with torch.cuda.stream(s):
2314 (torch.cuda.graph_pool_handle(),)
2334 torch.cuda.current_stream().wait_stream(s)
2361 torch.cuda.synchronize()
2362 torch.cuda.empty_cache()
2366 "CUDA >= 11.0 or ROCM >= 5.3 required for graphs",
2405 torch.cuda.empty_cache()
2407 s = torch.cuda.Stream()
2425 g = torch.cuda.CUDAGraph()
2426 s.wait_stream(torch.cuda.current_stream())
2427 with torch.cuda.stream(s):
2431 a = torch.ones((numel,), device="cuda")
2433 precapture_stats = torch.cuda.memory_stats()
2440 torch.cuda.current_stream().wait_stream(s)
2444 postcapture_stats = torch.cuda.memory_stats()
2483 torch.cuda.empty_cache()
2487 torch.cuda.empty_cache()
2488 postdel_stats = torch.cuda.memory_stats()
2531 torch.cuda.synchronize()
2532 torch.cuda.empty_cache()
2535 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2539 …# "Q. Why skip process_events if a capture might be underway?" in c10/cuda/CUDACachingAllocator.cpp
2540 torch.cuda.empty_cache()
2542 potential_problem = torch.zeros((3,), device="cuda")
2543 a = torch.zeros((3,), device="cuda")
2544 s0 = torch.cuda.Stream()
2545 s1 = torch.cuda.Stream()
2546 s2 = torch.cuda.Stream()
2547 g = torch.cuda.CUDAGraph()
2549 torch.cuda.synchronize()
2550 with torch.cuda.stream(s0):
2552 torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES)
2556 with torch.cuda.stream(s1):
2565 with torch.cuda.stream(s2):
2571 torch.cuda.synchronize()
2574 c = torch.zeros((3,), device="cuda")
2578 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2586 …# Tests the interaction of cuda graph capture with DropoutState's syncs in ATen/native/cudnn/RNN.c…
2589 torch.cuda.empty_cache()
2591 model = torch.nn.LSTM(512, 512, 2, dropout=0.5).cuda()
2592 x = torch.ones(100, 192, 512, device="cuda")
2596 g = torch.cuda.CUDAGraph()
2597 s = torch.cuda.Stream()
2598 s.wait_stream(torch.cuda.current_stream())
2599 with torch.cuda.stream(s):
2603 torch.cuda.current_stream().wait_stream(s)
2610 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2631 torch.cuda.manual_seed(5)
2640 ).cuda()
2643 ).cuda()
2654 ).cuda()
2657 ).cuda()
2673 model_section1 = MLP1(D_in, H, H).cuda()
2674 model_section2 = MLP2(H, H, D_out).cuda()
2675 model_section3 = ParameterlessModule().cuda()
2688 x = torch.randn(N, D_in, device="cuda")
2689 h = torch.randn(N, H, device="cuda", requires_grad=True)
2690 h2 = torch.randn(N, D_out, device="cuda", requires_grad=True)
2691 unused_input = torch.randn(N, H, device="cuda", requires_grad=True)
2692 y_pred = torch.randn(N, D_out, device="cuda", requires_grad=True)
2693 y = torch.randn(N, D_out, device="cuda")
2700 device_type="cuda", enabled=with_amp, cache_enabled=cache_enabled
2708 ) = torch.cuda.make_graphed_callables(
2738 torch.cuda.manual_seed(5)
2742 device_type="cuda", enabled=with_amp, cache_enabled=cache_enabled
2761 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2782 torch.cuda.manual_seed(5)
2798 model_section1 = ParameterlessModule().cuda()
2806 x = torch.randn(N, D_in, device="cuda", requires_grad=False)
2807 unused_input = torch.randn(N, H, device="cuda", requires_grad=False)
2808 y_pred = torch.randn(N, D_in, device="cuda", requires_grad=False)
2809 y = torch.randn(N, D_in, device="cuda")
2813 device_type="cuda", enabled=with_amp, cache_enabled=cache_enabled
2815 model_graphed[0] = torch.cuda.make_graphed_callables(
2828 torch.cuda.manual_seed(5)
2831 device_type="cuda", enabled=with_amp, cache_enabled=cache_enabled
2843 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2847 torch.cuda.manual_seed(5)
2856 ).cuda()
2859 mempool = torch.cuda.graph_pool_handle()
2862 x = torch.randn([64, 32], device="cuda")
2864 graphed_model = torch.cuda.make_graphed_callables(
2870 x = torch.randn([64, 32], device="cuda")
2890 params = [torch.randn((i + 5, i + 5), device="cuda") for i in range(2)] + [
2891 torch.randn((), device="cuda")
2920 g = torch.cuda.CUDAGraph()
2921 with torch.cuda.graph(g):
2940 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2960 torch.nn.Parameter(torch.ones(1, device="cuda")) for _ in range(2)
2963 torch.nn.Parameter(torch.ones(1, device="cuda")) for _ in range(2)
2994 g = torch.cuda.CUDAGraph()
2996 with self.assertRaisesRegex(RuntimeError, "Attempting CUDA graph"):
2997 with torch.cuda.graph(g):
3000 with torch.cuda.graph(g):
3011 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
3015 x = torch.zeros([2000], device="cuda")
3024 stream = torch.cuda.Stream()
3026 with torch.cuda.stream(stream):
3027 mem = torch.cuda.caching_allocator_alloc(1024)
3032 torch.cuda.caching_allocator_delete(mem)
3039 graph = torch.cuda.CUDAGraph()
3040 torch.cuda.synchronize()
3041 stream = torch.cuda.Stream()
3042 stream.wait_stream(torch.cuda.current_stream())
3043 with torch.cuda.stream(stream):
3046 torch.cuda.current_stream().wait_stream(stream)
3047 torch.cuda.synchronize()
3049 with torch.cuda.graph(
3058 torch.cuda.caching_allocator_delete(mem)
3070 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
3073 segments = torch.cuda.memory_snapshot()
3075 x = torch.randn(10240000, device="cuda")
3077 g = torch.cuda.CUDAGraph()
3078 s0 = torch.cuda.Stream()
3079 s1 = torch.cuda.Stream()
3080 s0.wait_stream(torch.cuda.current_stream())
3081 with torch.cuda.stream(s0):
3084 with torch.cuda.stream(s1):
3088 with torch.cuda.stream(s0):
3090 segments = torch.cuda.memory_snapshot()
3100 input = torch.randn(1, 3, 3, 3, device="cuda")
3103 mean=torch.ones(2, 3, device="cuda"),
3104 invstd=torch.ones(2, 3, device="cuda"),
3111 self.assertEqual(mean, torch.ones(3, device="cuda"))
3112 self.assertEqual(invstd, torch.ones(3, device="cuda"))
3116 torch.cuda.synchronize()
3117 val = torch.cuda.max_memory_allocated()
3118 torch.cuda.reset_peak_memory_stats()
3121 a = torch.rand(1, 32, 32, device="cuda")
3122 b = torch.rand(24, 32, 1, device="cuda")
3179 cuda = cpu.cuda()
3183 cpu @ cuda
3187 cuda @ cpu
3189 for s, m1, m2 in product((cpu, cuda), repeat=3):
3200 """Validate that no CUDA calls are made during `import torch` call"""
3212 … = f"import os; import torch;os.environ['{VISIBLE_DEVICES}']='32';print(torch.cuda.device_count())"
3228 @unittest.skipIf(not TEST_WITH_ROCM, "not relevant for CUDA testing")
3230 """Validate device_count works with both CUDA/HIP visible devices"""
3234 print(f"{torch.cuda.device_count()}")
3264 r1 = torch.cuda.device_count()
3266 r2 = torch.cuda.device_count()
3267 torch.empty(10, device='cuda')
3277 x = torch.cuda.device_count()
3288 file = torch.cuda.gds._GdsFile(f, os.O_CREAT | os.O_RDWR)
3291 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
3299 torch.cuda.memory.empty_cache()
3300 torch.cuda.memory._record_memory_history("state", stacks="python")
3302 torch.rand(2 * 311, 411, device="cuda")
3303 unused = torch.rand(310, 410, device="cuda")
3304 x = torch.rand(311, 411, device="cuda")
3311 tensors = [torch.rand(128, device="cuda") for _ in range(1000)]
3316 torch.rand(128 * 5, device="cuda")
3318 ss = torch.cuda.memory._snapshot()
3331 torch.cuda.memory._save_segment_usage(f.name)
3336 torch.cuda.empty_cache()
3337 ss = torch.cuda.memory._snapshot()
3344 torch.cuda.memory._record_memory_history(None)
3362 torch.cuda.memory.empty_cache()
3363 torch.cuda.memory._record_memory_history("state", stacks="all")
3364 x = torch.rand(311, 411, device="cuda")
3366 ss = torch.cuda.memory._snapshot()["segments"]
3376 torch.cuda.memory._record_memory_history(None)
3383 x = torch.rand(128, 128, device="cuda")
3413 x = torch.empty(3, 4, device="cuda")
3444 torch.cuda.memory.empty_cache()
3445 torch.cuda.memory._record_memory_history(
3450 x = torch.rand(128, 128, device="cuda")
3456 ss = torch.cuda.memory._snapshot()
3467 torch.cuda.memory._record_memory_history(None)
3476 torch.cuda.memory.empty_cache()
3477 torch.cuda.memory._record_memory_history(context=context)
3482 x = torch.rand(3, 4, device="cuda")
3490 ss = json.dumps(torch.cuda.memory._snapshot())
3494 torch.cuda.memory._record_memory_history(None)
3502 torch.cuda.memory.empty_cache()
3507 x = torch.rand(4, 4, device="cuda")
3511 x = torch.rand(3, 4, device="cuda")
3515 x = torch.rand(4, 4, device="cuda")
3518 torch.cuda.memory._record_memory_history(context="all", stacks="python")
3521 torch.cuda.memory._record_memory_history(context=None)
3524 torch.cuda.memory._record_memory_history(context="all", stacks="python")
3527 ss = json.dumps(torch.cuda.memory._snapshot())
3532 torch.cuda.memory._record_memory_history(None)
3541 torch.cuda.memory.empty_cache()
3542 torch.cuda.memory._record_memory_history(context=context)
3543 x = torch.rand(3, 4, device="cuda")
3545 torch.cuda.memory.empty_cache()
3547 ss = json.dumps(torch.cuda.memory._snapshot())
3550 torch.cuda.memory._record_memory_history(None)
3557 torch.cuda.memory.empty_cache()
3558 torch.cuda.memory._record_memory_history("state", stacks="python")
3562 return torch.rand(311, 411, device="cuda")
3566 ss = torch.cuda.memory._snapshot()["segments"]
3576 torch.cuda.memory._record_memory_history(None)
3579 torch.cuda.memory.empty_cache()
3581 _, all_memory = torch.cuda.memory.mem_get_info()
3585 torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
3588 return torch.ones(n * mb, dtype=torch.int8, device="cuda")
3590 torch.cuda.memory._set_allocator_settings(
3594 torch.cuda.memory._set_allocator_settings(
3598 torch.cuda.memory._set_allocator_settings(
3609 torch.cuda.memory.empty_cache()
3611 _, all_memory = torch.cuda.memory.mem_get_info()
3615 torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
3618 return torch.ones(n * mb, dtype=torch.int8, device="cuda")
3620 torch.cuda.memory._set_allocator_settings(
3624 torch.cuda.memory._set_allocator_settings(
3647 torch.cuda.memory.empty_cache()
3661 start_mem = torch.cuda.memory_stats()[key_allocated]
3662 torch.cuda.memory._set_allocator_settings("")
3663 x = torch.rand(nelems, device="cuda")
3666 reg_mem = torch.cuda.memory_stats()[key_allocated]
3667 start_requested = torch.cuda.memory_stats()[key_requested]
3668 torch.cuda.memory._set_allocator_settings("roundup_power2_divisions:4")
3669 y = torch.rand(nelems, device="cuda")
3671 pow2_div4_mem = torch.cuda.memory_stats()[key_allocated]
3672 current_requested = torch.cuda.memory_stats()[key_requested]
3680 torch.cuda.memory._set_allocator_settings("garbage_collection_threshold:0.5")
3681 torch.cuda.memory._set_allocator_settings(
3686 torch.cuda.memory.empty_cache()
3687 start_mem = torch.cuda.memory_stats()[key_allocated]
3688 z = torch.rand(nelems, device="cuda")
3689 reg_mem = torch.cuda.memory_stats()[key_allocated]
3693 torch.cuda.memory.empty_cache()
3694 torch.cuda.memory._set_allocator_settings(
3697 start_mem = torch.cuda.memory_stats()[key_allocated]
3698 w = torch.rand(nelems, device="cuda")
3700 pow2_div8_mem = torch.cuda.memory_stats()[key_allocated]
3705 torch.cuda.memory.empty_cache()
3706 start_mem = torch.cuda.memory_stats()[key_allocated]
3707 v = torch.rand(nelems_big, device="cuda")
3709 pow2_div2_mem = torch.cuda.memory_stats()[key_allocated]
3714 torch.cuda.memory.empty_cache()
3715 torch.cuda.memory._set_allocator_settings("release_lock_on_cudamalloc:True")
3716 start_mem = torch.cuda.memory_stats()[key_allocated]
3717 w = torch.rand(nelems, device="cuda")
3718 reg_mem = torch.cuda.memory_stats()[key_allocated]
3722 torch.cuda.memory._set_allocator_settings("foo:1,bar:2")
3725 torch.cuda.memory._set_allocator_settings(
3730 torch.cuda.memory._set_allocator_settings("max_split_size_mb:2")
3733 torch.cuda.memory._set_allocator_settings("release_lock_on_cudamalloc:none")
3736 torch.cuda.memory._set_allocator_settings(
3741 torch.cuda.memory._set_allocator_settings(
3746 torch.cuda.memory._set_allocator_settings(
3756 torch.cuda.memory._set_allocator_settings("max_split_size_mb:1024")
3757 torch.cuda.memory.empty_cache()
3758 with self.assertRaises(torch.cuda.OutOfMemoryError):
3759 torch.empty(1024 * 1024 * 1024 * 1024, device="cuda")
3771 #include <torch/csrc/cuda/memory_snapshot.h>
3773 std::string data = torch::cuda::_memory_snapshot_pickled();
3777 torch::cuda::_record_memory_history(e, ctx, 10, ctx, ctx);
3789 return torch.rand(311, 411, device="cuda")
3826 with self.assertRaises(torch.cuda.OutOfMemoryError):
3827 torch.empty(1024 * 1024 * 1024 * 1024, device="cuda")
3843 mem.append((c, torch.full((b,), c, dtype=torch.int32, device="cuda")))
3854 choices = [alloc, free, torch.cuda.memory.empty_cache]
3866 self.assertTrue(torch.cuda._get_pynvml_handler() is not None)
3868 self.assertTrue(torch.cuda._get_amdsmi_handler() is not None)
3872 self.assertTrue(0 <= torch.cuda.temperature() <= 150)
3876 self.assertTrue(torch.cuda.power_draw() >= 0)
3880 self.assertTrue(torch.cuda.clock_rate() >= 0)
3890 segments = torch.cuda.memory_snapshot()
3895 segments = torch.cuda.memory_snapshot()
3901 raise unittest.SkipTest("cuda graph test is skipped")
3903 torch.cuda.synchronize()
3904 stream = torch.cuda.Stream()
3905 stream.wait_stream(torch.cuda.current_stream())
3906 with torch.cuda.stream(stream):
3909 torch.cuda.current_stream().wait_stream(stream)
3910 torch.cuda.synchronize()
3912 graph = torch.cuda.CUDAGraph()
3913 with torch.cuda.graph(graph, stream=stream, pool=pool):
3920 return torch.ones([size], device="cuda", dtype=torch.uint8)
4032 torch.cuda.synchronize()
4034 torch.cuda.empty_cache()
4042 x = torch.zeros([SMALL_SIZE * 8], device="cuda", dtype=torch.uint8)
4146 inp = torch.tensor([1], device="cuda")
4206 m = m.cuda()
4208 inp = torch.rand([1, 3, 255, 255], device="cuda")
4213 return torch.ones([4], device="cuda")
4215 pool = torch.cuda.graph_pool_handle()
4233 return torch.rand([4], device="cuda")
4235 pool = torch.cuda.graph_pool_handle()
4243 Context manager to use cuda graph pool for new allocations. If you use this manager
4247 torch.cuda.synchronize()
4248 stream = torch.cuda.Stream()
4249 stream.wait_stream(torch.cuda.current_stream())
4250 stream_context = torch.cuda.stream(stream)
4298 torch.cuda.synchronize()
4300 torch.cuda.empty_cache()
4306 … script = "import sys; import torch; torch.rand(2, device='cuda'); print('triton' in sys.modules)"
4321 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
4324 pool1 = torch.cuda.graph_pool_handle()
4325 pool2 = torch.cuda.MemPool().id
4330 # each call to torch.cuda.graph_pool_handle() or torch.cuda.MemPool()
4335 pool = torch.cuda.MemPool()
4344 #include <ATen/cuda/Exceptions.h>
4374 allocator = torch.cuda.memory.CUDAPluggableAllocator(
4379 pool = torch.cuda.MemPool(allocator.allocator())
4389 with torch.cuda.use_mem_pool(pool):
4390 out = torch.randn(1, device="cuda")
4397 active_pool = torch.cuda.MemPoolContext.active_pool()
4402 pool = torch.cuda.MemPool()
4403 ctx = torch.cuda.MemPoolContext(pool)
4404 active_pool = torch.cuda.MemPoolContext.active_pool()
4410 active_pool = torch.cuda.MemPoolContext.active_pool()
4420 pool = torch.cuda.MemPool()
4423 ctx = torch.cuda.MemPoolContext(pool)
4424 active_pool = torch.cuda.MemPoolContext.active_pool()
4447 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
4455 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >=5.3 required for graphs"
4508 g = torch.cuda.CUDAGraph()
4509 with torch.cuda.graph(g):
4529 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
4535 if "fused" in optim.supported_impls and "cuda" in optim.supports_fused_on
4568 scaler_for_control = torch.cuda.amp.GradScaler(init_scale=128.0)
4572 scaler_for_graphed = torch.cuda.amp.GradScaler()
4600 g = torch.cuda.CUDAGraph()
4601 with torch.cuda.graph(g):
4703 [optim for optim in optim_db if "cuda" in optim.supports_fused_on],
4709 weight = torch.ones((5, 5), device="cuda", requires_grad=True)
4723 opt.grad_scale = torch.Tensor([3]).cuda()
4731 not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
4738 if "foreach" in optim.supported_impls and "cuda" in optim.supports_fused_on
4743 torch.cuda.empty_cache()
4745 scaler = torch.amp.GradScaler(device="cuda", init_scale=4.0)
4746 g = torch.cuda.CUDAGraph()
4747 s = torch.cuda.Stream()
4749 weight = torch.ones((100,), device="cuda", requires_grad=True)
4755 s = torch.cuda.Stream()
4756 s.wait_stream(torch.cuda.current_stream())
4757 with torch.cuda.stream(s):
4760 torch.cuda.current_stream().wait_stream(s)
4765 with torch.cuda.stream(s):
4790 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
4807 src1 = torch.randn(1024, device="cuda")
4808 src2 = torch.randn(2, 1024, device="cuda")
4809 torch.cuda.gds._gds_register_buffer(src1.untyped_storage())
4810 torch.cuda.gds._gds_register_buffer(src2.untyped_storage())
4811 dest1 = torch.empty(1024, device="cuda")
4812 dest2 = torch.empty(2, 1024, device="cuda")
4814 file = torch.cuda.gds._GdsFile(f, os.O_CREAT | os.O_RDWR)
4821 torch.cuda.gds._gds_deregister_buffer(src1.untyped_storage())
4822 torch.cuda.gds._gds_deregister_buffer(src2.untyped_storage())
4825 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
4829 self.autocast_lists = AutocastTestLists(torch.device("cuda:0"))
4845 op, args, torch.float16, device="cuda", amp_dtype=torch.float16
4859 or torch.cuda.get_device_capability() < (8, 0)
4869 op, args, torch.bfloat16, device="cuda"
4872 if torch.cuda.is_bf16_supported():
4874 op, args, torch.bfloat16, device="cuda"
4881 op, args, torch.bfloat16, device="cuda"
4892 device="cuda",
4901 op, args, torch.float32, device="cuda", amp_dtype=torch.float16
4911 device="cuda",
4924 device="cuda",
4933 if torch.cuda.is_bf16_supported():
4935 op, args, torch.bfloat16, device="cuda", module=torch._C._nn
4942 op, args, torch.bfloat16, device="cuda", module=torch._C._nn
4952 device="cuda",
4965 device="cuda",
4978 device="cuda",
4990 device="cuda",
5002 device="cuda",
5009 with torch.autocast("cuda"):
5015 with torch.autocast("cuda"):
5017 a_ignore = torch.ones((8, 8), dtype=ignore_type, device="cuda:0")
5018 b_ignore = torch.ones((8, 8), dtype=ignore_type, device="cuda:0")
5019 c_16 = torch.ones((8, 8), dtype=torch.float16, device="cuda:0")
5026 with torch.autocast("cuda", enabled=False):
5033 with torch.autocast("cuda", enabled=False):
5038 with torch.autocast("cuda", enabled=False):
5045 with torch.autocast("cuda", enabled=False):
5052 @torch.amp.custom_fwd(device_type="cuda")
5061 @torch.amp.custom_bwd(device_type="cuda")
5071 x = torch.randn((8, 8), device="cuda", dtype=torch.float32, requires_grad=True)
5072 y = torch.randn((8, 8), device="cuda", dtype=torch.float32, requires_grad=True)
5076 with torch.cuda.amp.autocast(dtype=dtype):
5085 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
5095 @torch.amp.custom_bwd(device_type="cuda")
5103 x = torch.randn((8, 8), device="cuda", dtype=torch.float16, requires_grad=True)
5111 (8, 8), device="cuda", dtype=torch.float16, requires_grad=False
5116 with torch.autocast("cuda"):
5133 @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
5140 @torch.cuda.amp.custom_bwd
5147 str(w[0].message), r"`torch.cuda.amp.custom_fwd\(args...\)` is deprecated."
5150 str(w[1].message), r"`torch.cuda.amp.custom_bwd\(args...\)` is deprecated."
5156 with torch.amp.autocast("cuda"):
5177 with torch.autocast("cuda", enabled=True):
5216 x = torch.randn((T, B, F), device="cuda", dtype=input_dtype)
5219 x = torch.randn((B, T, F), device="cuda", dtype=input_dtype)
5223 torch.randn((T, B, F), device="cuda", dtype=input_dtype),
5237 .cuda()
5248 device="cuda",
5254 device="cuda",
5259 with torch.autocast("cuda"):
5311 linear = torch.nn.Linear(10, 10).to("cuda")
5312 data = torch.randn(1, 10, device="cuda")
5314 with torch.autocast("cuda"):
5317 first_iter_mem = torch.cuda.memory_allocated()
5320 self.assertTrue(first_iter_mem == torch.cuda.memory_allocated())
5325 ).cuda()
5327 (8, 8), device="cuda", dtype=torch.float16, requires_grad=True
5330 with torch.autocast("cuda"):
5339 …r"`torch.cuda.amp.autocast\(args...\)` is deprecated. Please use `torch.amp.autocast\('cuda', args…
5341 with torch.cuda.amp.autocast():