test_cuda.py - OpenGrok cross reference for /aosp_15_r20/external/pytorch/test/test

Lines Matching full:cuda
1 # Owner(s): ["module: cuda"]
23 import torch.cuda
25 from torch.cuda._memory_viz import (
96     torch.cuda.get_allocator_backend() == "cudaMallocAsync"
101 TEST_PYNVML = not torch.cuda._HAS_PYNVML
103     TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9
104     TEST_MEDIUM_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 6e9
105     TEST_BF16 = torch.cuda.is_bf16_supported()
110 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
128         torch.cuda.memory._set_allocator_settings(
155             torch.cuda.memory._set_allocator_settings(
170         cudart = torch.cuda.cudart()
180         torch.cuda.empty_cache()
185             prev = torch.cuda.memory_allocated()
186             mem = torch.cuda.caching_allocator_alloc(size)
187             self.assertGreater(torch.cuda.memory_allocated(), prev)
190                 torch.cuda.caching_allocator_delete(mem)
191                 self.assertEqual(torch.cuda.memory_allocated(), prev)
195         torch.cuda.check_error(0)
198             torch.cuda.CudaError, "out of memory|hipErrorOutOfMemory"
200             torch.cuda.check_error(2)
204         current_device = torch.cuda.current_device()
205         current_device_name = torch.cuda.get_device_name(current_device)
206         device_name_None = torch.cuda.get_device_name(None)
210         device_name_no_argument = torch.cuda.get_device_name()
215         current_device = torch.cuda.current_device()
216         current_device_capability = torch.cuda.get_device_capability(current_device)
217         device_capability_None = torch.cuda.get_device_capability(None)
221         device_capability_no_argument = torch.cuda.get_device_capability()
225         tensor = torch.zeros(1024, device="cuda")
233             torch.empty(1024 * 1024 * 1024 * 800000000, dtype=torch.int8, device="cuda")
239                 1024 * 1024 * 1024 * 8000000000, dtype=torch.int8, device="cuda"
251         torch.cuda.empty_cache()
252         total_memory = torch.cuda.get_device_properties(0).total_memory
259         a = torch.empty(size, dtype=torch.int8, device="cuda")
261             b = torch.empty(size, dtype=torch.int8, device="cuda")
263         b = torch.empty(size, dtype=torch.int8, device="cuda")
266         torch.cuda.empty_cache()
267         torch.cuda.reset_peak_memory_stats()
273             torch.cuda.set_per_process_memory_fraction(1)
275             torch.cuda.set_per_process_memory_fraction(-0.1)
277             torch.cuda.set_per_process_memory_fraction(2.0)
279         tensor = torch.zeros(1024, device="cuda")
280         torch.cuda.empty_cache()
281         total_memory = torch.cuda.get_device_properties(0).total_memory
282         torch.cuda.set_per_process_memory_fraction(0.5, 0)
285         application = int(total_memory * 0.499) - torch.cuda.max_memory_reserved()
286         tmp_tensor = torch.empty(application, dtype=torch.int8, device="cuda")
288         torch.cuda.empty_cache()
296             torch.empty(application, dtype=torch.int8, device="cuda")
304         uuid = torch.cuda.get_device_properties(0).uuid
310             event = torch.cuda.Event()
317         x = torch.ones(10000000, dtype=torch.uint8).cuda()
322         y = torch.ones(10000000, dtype=torch.uint8).cuda()
332         y = torch.ones(10000000 - 1, dtype=torch.uint8).cuda()
336         a = torch.ones(1, device="cuda")
338         c = torch.empty(1, device="cuda", dtype=torch.long)
339         torch.cuda._sleep(int(100 * get_cycles_per_ms()))
346         stream = torch.cuda.current_stream()
349             torch.cuda.synchronize()
352             torch.cuda._sleep(int(100 * get_cycles_per_ms()))
359         for dst, try_non_blocking in product(("cuda", "cpu"), (True, False)):
363                 device="cuda" if dst == "cpu" else "cpu",
364                 pin_memory=True if dst == "cuda" else False,
369         src = torch.randn(1000000, device="cuda")
370         torch.cuda.synchronize()
371         torch.cuda._sleep(int(100 * get_cycles_per_ms()))
373         self.assertEqual(torch.cuda.current_stream().query(), True)
378         x = torch.randn(5, 5).cuda()
379         y = torch.IntTensor(2, 5).fill_(0).cuda()
388         self.assertTrue(isinstance(q_copy[0], torch.cuda.FloatTensor))
389         self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor))
390         self.assertTrue(isinstance(q_copy[2], torch.cuda.FloatTensor))
394         self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10))
404         a = torch.randn(7, 7, device="cuda", requires_grad=False)
407         if torch.cuda.get_device_capability() == (9, 0):
412             start = torch.cuda.memory_stats()["active_bytes.all.allocated"]
415             finish = torch.cuda.memory_stats()["active_bytes.all.allocated"]
437             self.assertTrue(torch.backends.cuda.matmul.allow_tf32)
440         orig = torch.backends.cuda.matmul.allow_tf32
442         torch.backends.cuda.matmul.allow_tf32 = not orig
444         torch.backends.cuda.matmul.allow_tf32 = orig
454             self.assertFalse(torch.backends.cuda.matmul.allow_tf32)
457             self.assertTrue(torch.backends.cuda.matmul.allow_tf32)
461             self.assertTrue(torch.backends.cuda.matmul.allow_tf32)
464         self.assertFalse(torch.backends.cuda.matmul.allow_tf32)
468         orig = torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction
472         torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = not orig
476         torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = orig
479         orig = torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction
483         torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = not orig
487         torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = orig
502         self.assertIsInstance(x.cuda().double(), torch.cuda.DoubleTensor)
503         self.assertIsInstance(x.cuda().float(), torch.cuda.FloatTensor)
504         self.assertIsInstance(x.cuda().float().cpu(), torch.FloatTensor)
505         self.assertIsInstance(x.cuda().float().cpu().int(), torch.IntTensor)
509         self.assertIsInstance(y.cuda().double(), torch.cuda.DoubleStorage)
510         self.assertIsInstance(y.cuda().float(), torch.cuda.FloatStorage)
511         self.assertIsInstance(y.cuda().float().cpu(), torch.FloatStorage)
512         self.assertIsInstance(y.cuda().float().cpu().int(), torch.IntStorage)
516         x = torch.empty(2**30, device="cuda")
537         t = torch.tensor([[False, True], [True, True]], device="cuda")
539             torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]], device="cuda")),
540             torch.tensor([[False, False], [True, True]], device="cuda"),
545             x = torch.zeros(4, 4).float().cuda()
547             self.assertEqual(torch.cuda.initial_seed(), 2)
552             self.assertEqual(torch.cuda.initial_seed(), 2)
556             x = torch.zeros(4, 4).float().cuda()
557             torch.cuda.manual_seed(2)
558             self.assertEqual(torch.cuda.initial_seed(), 2)
561             torch.cuda.manual_seed(2)
566             self.assertEqual(torch.cuda.initial_seed(), 2)
585         from torch.cuda._utils import _get_device_index
590         with self.assertRaisesRegex(ValueError, "Expected a cuda device"):
595         x = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()]
607         default_stream = torch.cuda.current_stream()
608         user_stream = torch.cuda.Stream()
609         self.assertEqual(torch.cuda.current_stream(), default_stream)
613         with torch.cuda.stream(user_stream):
614             self.assertEqual(torch.cuda.current_stream(), user_stream)
617         tensor2 = tensor1.cuda(non_blocking=True) + 1
622         s = torch.cuda.current_stream()
623         self.assertTrue("torch.cuda.Stream" in s.__repr__())
624         e = torch.cuda.Event()
625         self.assertTrue("torch.cuda.Event" in e.__repr__())
627         self.assertTrue("torch.cuda.Event" in e.__repr__())
630         stream = torch.cuda.current_stream()
631         event = torch.cuda.Event(enable_timing=True)
633         start_event = torch.cuda.Event(enable_timing=True)
635         torch.cuda._sleep(int(50 * get_cycles_per_ms()))
643         stream = torch.Stream("cuda")
644         self.assertEqual(stream.device_index, torch.cuda.current_device())
645         cuda_stream = torch.cuda.Stream(
651         self.assertNotEqual(stream.stream_id, torch.cuda.current_stream().stream_id)
653         event1 = torch.Event("cuda", enable_timing=True)
654         event2 = torch.Event("cuda", enable_timing=True)
658         with torch.cuda.stream(cuda_stream):
659             a_cuda = a.to("cuda", non_blocking=True)
660             b_cuda = b.to("cuda", non_blocking=True)
661             self.assertEqual(stream.stream_id, torch.cuda.current_stream().stream_id)
677         result = torch.cuda.FloatTensor(t.size())
678         stream = torch.cuda.Stream()
683             with torch.cuda.stream(stream):
684                 tmp = t.cuda(non_blocking=True)
686             torch.cuda.current_stream().wait_stream(stream)
687             tmp.record_stream(torch.cuda.current_stream())
688             torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
692         with torch.cuda.stream(stream):
693             tmp2 = torch.cuda.FloatTensor(t.size())
704             torch.cuda.current_stream().synchronize()
705             with torch.cuda.stream(stream):
706                 tmp3 = torch.cuda.FloatTensor(t.size())
715         stream_alloc = torch.cuda.Stream()
716         with torch.cuda.stream(stream_alloc):
717             base = torch.cuda.FloatTensor([10, 10])
723         stream_record = torch.cuda.Stream()
724         with torch.cuda.stream(stream_record):
725             torch.cuda._sleep(int(50 * get_cycles_per_ms()))
736         with torch.cuda.stream(stream_alloc):
737             try_realloc = torch.cuda.FloatTensor([10, 10])
757         gpu_tensor = torch.cuda.FloatTensor([0])
758         torch.cuda._sleep(int(1000 * cycles_per_ms))  # delay the copy by 1s
768         stream = torch.cuda.Stream()
770         with torch.cuda.stream(stream):
771             y = torch.zeros(40 * 1024 * 1024, device="cuda")
774             x = torch.empty(40 * 1024 * 1024, device="cuda")
775             with torch.cuda.stream(stream):
783         torch.cuda.empty_cache()
787         x = torch.ones(512, 8, dtype=torch.float32, device="cuda")
791         x = torch.zeros(10, device="cuda", dtype=torch.float16)
794         x = torch.ones(65504, device="cuda", dtype=torch.float16)
798         x = torch.ones(65536, device="cuda", dtype=torch.float16)
802         x = a.to(device="cuda", dtype=torch.float16)
806         x = a.to(device="cuda", dtype=torch.float16)
810         x = torch.ones(65536, device="cuda", dtype=torch.float16)
813         x = torch.ones(65536, device="cuda", dtype=torch.float16)
818         x = torch.ones(240000, device="cuda", dtype=torch.float32)
823             x = torch.ones(240000, device="cuda", dtype=dtype) * (0 + 1j)
828         freqs = torch.cuda.FloatTensor(
883         torch.cuda.manual_seed(11042)
887         p = torch.zeros(3421, 2, device="cuda", dtype=torch.float)
889         torch.cuda.manual_seed(5214)
894         torch.cuda.manual_seed(33)
895         probs = torch.randn(1000000, device="cuda").clamp(min=0) * 3e-5
913         torch.multinomial(torch.tensor({probs}).to('cuda'), 2, replacement=True)
914         torch.cuda.synchronize()
930             "device-side assert triggered",  # CUDA
931             "Assertion",  # CUDA
964         x = torch.arange(10, device="cuda")
982             test_method(1), "x[torch.tensor([1)]=tensor([1], device='cuda:0')"
991         src = torch.empty(15000000, 45, device="cuda", dtype=torch.long).random_(
994         idx = torch.randperm(src.shape[0], device="cuda")
1020         assert abs(run(torch.device("cuda")) - run(torch.device("cpu"))) < 10_000
1040                 run(func, torch.device("cuda"), dtype)
1048         x = torch.cuda.ByteTensor([0])
1049         y = torch.cuda.ByteTensor([255])
1050         expected = torch.cuda.LongTensor([0])[0]
1060         torch.cuda.nvtx.range_push("foo")
1061         torch.cuda.nvtx.mark("bar")
1062         torch.cuda.nvtx.range_pop()
1063         range_handle = torch.cuda.nvtx.range_start("range_start")
1064         torch.cuda.nvtx.range_end(range_handle)
1067         # ensure CUDA code coverage
1069         w = torch.randn(input_size, dtype=torch.double, device="cuda")
1072         t = torch.randint(50, input_size, dtype=torch.int8, device="cuda")
1078         t = torch.randint(50000, input_size, dtype=torch.int64, device="cuda")
1082         t = torch.zeros([10], dtype=torch.int32, device="cuda")
1090         a = torch.arange(25).cuda().float()
1096         a = torch.ones(65536).cuda().half()
1104                 x = torch.randn(3, 1, device="cuda")
1105                 y = torch.randn(2, 1, device="cuda")
1113         x = torch.randn(1, 1, 1, 2**30 + 1, dtype=torch.float16, device="cuda")
1116         torch.cuda.synchronize()
1124         x = torch.randn(1, 1, 1, 2**31, dtype=torch.float16, device="cuda")
1130         x = torch.randn(1, 1, 1, 2**31 - 1, dtype=torch.float16, device="cuda")
1133         torch.cuda.synchronize()
1142                 ctx.stream = torch.cuda.current_stream()
1147                 self.assertEqual(torch.cuda.current_stream(), ctx.stream)
1149                 torch.cuda._sleep(1000 * 5000)
1156         default_stream = torch.cuda.current_stream()
1157         stream = torch.cuda.Stream()
1162 …     # See "Stream semantics of backward passes" on https://pytorch.org/docs/stable/notes/cuda.html
1163         x = torch.randn(5, 5, device="cuda", requires_grad=True)
1164         with torch.cuda.stream(stream):
1171         self.assertEqual(torch.cuda.current_stream(), default_stream)
1175         bwd_ambient_stream = torch.cuda.Stream()
1176         x = torch.randn(5, 5, device="cuda", requires_grad=True)
1177         with torch.cuda.stream(stream):
1180         with torch.cuda.stream(bwd_ambient_stream):
1187             self.assertEqual(torch.cuda.current_stream(), bwd_ambient_stream)
1197                 self.event = torch.cuda.Event()
1198                 self.stream0 = torch.cuda.Stream()
1199                 self.stream1 = torch.cuda.Stream()
1204                 self.stream0.wait_stream(torch.cuda.current_stream())
1205                 self.stream1.wait_stream(torch.cuda.current_stream())
1206                 with torch.cuda.stream(self.stream0):
1210                     self.event.record(stream=torch.cuda.current_stream())
1212                 with torch.cuda.stream(self.stream1):
1217         stream = torch.cuda.Stream()
1223                 with torch.cuda.stream(stream):
1224                     x = torch.randn(5, 5, device="cuda", requires_grad=True)
1225                     model = StreamModel().cuda()
1228                             torch.cuda.current_stream(),
1240 …     # See "Stream semantics of backward passes" on https://pytorch.org/docs/stable/notes/cuda.html
1241                 torch.cuda.current_stream().wait_stream(stream)
1251 …   # torch.cuda._sleep such that if the race condition exists, the test will almost certainly fail,
1254         fwd_bwd_op_stream = torch.cuda.Stream()
1255         bwd_ambient_stream = torch.cuda.Stream()
1261         a = torch.full((size,), 2.0, device="cuda", requires_grad=True)
1262         b = torch.full((size,), 3.0, device="cuda", requires_grad=True)
1268             torch.cuda.synchronize()
1270             with torch.cuda.stream(fwd_bwd_op_stream):
1273             with torch.cuda.stream(bwd_ambient_stream):
1274                 torch.cuda.synchronize()
1276                 torch.cuda._sleep(int(50 * get_cycles_per_ms()))
1278                 grad = torch.full((size,), float(trial + 1), device="cuda")
1288                 torch.cuda.synchronize()
1300         a = torch.full((size,), 1, device="cuda", dtype=torch.float, requires_grad=True)
1301         b = torch.full((size,), 1, device="cuda", dtype=torch.float, requires_grad=True)
1303         s0 = torch.cuda.Stream()
1304         s1 = torch.cuda.Stream()
1305         s2 = torch.cuda.Stream()
1310         s0.wait_stream(torch.cuda.current_stream())
1311         with torch.cuda.stream(s0):
1314         s1.wait_stream(torch.cuda.current_stream())
1315         with torch.cuda.stream(s1):
1332         with torch.cuda.stream(s2):
1347             torch._assert_async(torch.tensor([], device="cuda"))
1352             torch._assert_async(torch.tensor([0, 0], device="cuda"))
1354         torch._assert_async(torch.tensor(1, device="cuda"))
1355         torch._assert_async(torch.tensor(0.1, device="cuda"))
1356         torch._assert_async(torch.tensor(-0.1, device="cuda"))
1357         torch._assert_async(torch.tensor(True, device="cuda"))
1358         torch._assert_async(torch.tensor(0 + 0.1j, device="cuda"))
1361             "torch._assert_async(torch.tensor(0, device='cuda'))",
1362             "torch._assert_async(torch.tensor(0.0, device='cuda'))",
1363             "torch._assert_async(torch.tensor(False, device='cuda'))",
1364             "torch._assert_async(torch.tensor(0 + 0j, device='cuda'))",
1379 torch.cuda.synchronize()
1395         weight = torch.ones((size, size), device="cuda")
1400             my_stream = torch.cuda.Stream()
1404             torch.cuda.synchronize()
1407             with torch.cuda.stream(my_stream):
1419             torch.cuda.synchronize()
1423                 results[t] = torch.ones((size, size), device="cuda")
1445         weight = torch.ones((1, 1, 2, 2), device="cuda")
1457                 my_stream = torch.cuda.Stream()
1461                 torch.cuda.synchronize()
1464                 with torch.cuda.stream(my_stream):
1478                 torch.cuda.synchronize()
1482                     results[t] = torch.ones((1, 1, 2048, 2048), device="cuda")
1507             a = torch.arange(size, device="cuda")
1509             values = torch.ones(size * size, device="cuda")
1517             my_stream = torch.cuda.Stream()
1521             torch.cuda.synchronize()
1524             with torch.cuda.stream(my_stream):
1536             torch.cuda.synchronize()
1540                 results[t] = torch.ones((size, size), device="cuda")
1558         x = torch.zeros(2**32, device="cuda", dtype=torch.int8)
1566         self.assertRaises(TypeError, lambda: torch.empty(1, device="cuda").numpy())
1569         self.assertFalse(torch.cuda.is_current_stream_capturing())
1572             s = torch.cuda.Stream()
1573             with torch.cuda.stream(s):
1574                 g = torch.cuda.CUDAGraph()
1575                 self.assertFalse(torch.cuda.is_current_stream_capturing())
1577                 self.assertTrue(torch.cuda.is_current_stream_capturing())
1581         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1584         s = torch.cuda.Stream()
1586         with torch.cuda.stream(s):
1587             a = torch.full((1000,), 1, device="cuda")
1588             g = torch.cuda.CUDAGraph()
1589             torch.cuda.empty_cache()
1595         torch.cuda.current_stream().wait_stream(s)
1602         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1607             """Initializes generator states and registers them with a CUDA graph if provided."""
1608             # Ensure the CUDA generator is initialized
1609             torch.rand(1, device="cuda")
1631             random_values.append(torch.rand(5, device="cuda", generator=generator))
1636                 [torch.rand(5, device="cuda", generator=generator) for _ in range(2)]
1648         # Set up and test a new CUDA generator
1649         generator = torch.Generator(device="cuda")
1652         # Set up and test the default CUDA generator with a CUDA Graph
1653         g = torch.cuda.CUDAGraph()
1654         s = torch.cuda.Stream()
1655         default_generator = torch.cuda.default_generators[0]
1659         # Perform random number generation within a CUDA graph
1660         with torch.cuda.stream(s):
1668         torch.cuda.current_stream().wait_stream(s)
1681         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1684         # Function to clear CUDA cache and collect garbage
1687             torch.cuda.empty_cache()
1689 …e graph task which includes capturing and executing a random number generation within a CUDA graph.
1691             s = torch.cuda.Stream()
1692             with torch.cuda.stream(s):
1694                 torch.rand(1, device="cuda")
1696             torch.cuda.current_stream().wait_stream(s)
1700             stats = torch.cuda.memory_stats()
1709             # Allocate CUDA graphs
1710             graphs = [torch.cuda.CUDAGraph() for _ in range(num_graphs)]
1713             default_generator = torch.cuda.default_generators[0]
1740             # Cleanup graphs and clear CUDA cache
1758         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1761         s = torch.cuda.Stream()
1763         with torch.cuda.stream(s):
1764             a = torch.full((1000,), 1, device="cuda")
1765             g = torch.cuda.CUDAGraph()
1766             torch.cuda.empty_cache()
1772         torch.cuda.current_stream().wait_stream(s)
1780         with torch.cuda.stream(s):
1786         torch.cuda.current_stream().wait_stream(s)
1795         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1798         torch.cuda.empty_cache()
1799         x = torch.randn(10240000, device="cuda")
1801         g = torch.cuda.CUDAGraph()
1803         s0 = torch.cuda.Stream()
1804         s1 = torch.cuda.Stream()
1805         s0.wait_stream(torch.cuda.current_stream())
1806         with torch.cuda.stream(s0):
1809             with torch.cuda.stream(s1):
1815         torch.cuda.synchronize()
1820         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1824         # puts the cuda context in a bad state
1828 g = torch.cuda.CUDAGraph()
1832     if "CUDA graphs must be captured on a non-default stream." in str(e):
1859         (not TEST_CUDA) or TEST_WITH_ROCM or int(torch.version.cuda.split(".")[0]) < 11,
1860         "CUDA >= 11.0 required for graphs",
1864             g = torch.cuda.CUDAGraph()
1865             s = torch.cuda.Stream()
1866             with torch.cuda.stream(s):
1870             any("The CUDA Graph is empty" in str(w.message) for w in caught)
1874         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1884             with torch.cuda.graph(torch.cuda.CUDAGraph()):
1885                 torch.zeros(2**40, device="cuda")
1888         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1893         a = torch.rand((x, y), device="cuda")
1894         b = torch.rand((y, z), device="cuda")
1899         free_bytes_before, total_bytes = torch.cuda.mem_get_info()
1903             torch_graph = torch.cuda.CUDAGraph()
1904             with torch.cuda.graph(torch_graph):
1908         free_bytes_after, _ = torch.cuda.mem_get_info()
1914         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1924             a = torch.randn((size,), device="cuda", dtype=torch.float)
1927             torch.cuda.manual_seed(5)
1933             stream = torch.cuda.Stream()
1934             stream.wait_stream(torch.cuda.current_stream())
1935             with torch.cuda.stream(stream):
1936                 torch.cuda.manual_seed(5)
1938                 g = torch.cuda.CUDAGraph()
1939                 torch.cuda.empty_cache()
1945             torch.cuda.current_stream().wait_stream(stream)
1968                 torch.cuda.manual_seed(seed)
1981                 torch.cuda.manual_seed(seed)
1996             torch.cuda.synchronize()
2002         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2006         input = torch.rand((size,), device="cuda", dtype=torch.float)
2007         alloc = torch.empty((size,), device="cuda", dtype=torch.float)
2012             # multinomial uses some uncapturable CUDA calls.
2021             ("rand", (size,), {"device": "cuda", "dtype": torch.float}),
2022             ("randint", (0, 3, (size,)), {"device": "cuda", "dtype": torch.float}),
2023             ("randn", (size,), {"device": "cuda", "dtype": torch.float}),
2039             torch.cuda.manual_seed(5)
2054             stream = torch.cuda.Stream()
2055             stream.wait_stream(torch.cuda.current_stream())
2056             with torch.cuda.stream(stream):
2057                 torch.cuda.manual_seed(5)
2059                 g = torch.cuda.CUDAGraph()
2060                 torch.cuda.empty_cache()
2073             torch.cuda.current_stream().wait_stream(stream)
2080                 # If we try it with cudaMallocAsync, CUDA won't even consider
2091                 torch.cuda.manual_seed(seed)
2103                 torch.cuda.manual_seed(seed)
2125             torch.cuda.synchronize()
2135         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2138         torch.cuda.empty_cache()
2148         s = torch.cuda.Stream()
2151             g0 = torch.cuda.CUDAGraph()
2152             g1 = torch.cuda.CUDAGraph()
2154             a = torch.ones((size,), device="cuda")
2156             s.wait_stream(torch.cuda.current_stream())
2157             with torch.cuda.stream(s):
2159                     (torch.cuda.graph_pool_handle(),)
2174             torch.cuda.current_stream().wait_stream(s)
2195                         - torch.cuda.memory_stats()["reserved_bytes.all.current"],
2199                     reserved_no_sharing = torch.cuda.memory_stats()[
2205             torch.cuda.synchronize()
2206             torch.cuda.empty_cache()
2212             torch.version.cuda
2213             and int(torch.version.cuda.split(".")[0]) == 11
2214             and int(torch.version.cuda.split(".")[1]) < 4
2216         "Graph bindings disallow concurrent replay for CUDA < 11.4, see "
2220         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2223         torch.cuda.empty_cache()
2232         s = torch.cuda.Stream()
2235             g0 = torch.cuda.CUDAGraph()
2236             g1 = torch.cuda.CUDAGraph()
2238             s0 = torch.cuda.Stream()
2239             s1 = torch.cuda.Stream()
2241             a = torch.ones((size,), device="cuda")
2243             s.wait_stream(torch.cuda.current_stream())
2244             with torch.cuda.stream(s):
2246                     (torch.cuda.graph_pool_handle(),)
2266             torch.cuda.synchronize()
2267             with torch.cuda.stream(s0):
2268                 torch.cuda._sleep(1000000)
2271             with torch.cuda.stream(s1):
2273             torch.cuda.current_stream().wait_stream(s0)
2274             torch.cuda.current_stream().wait_stream(s1)
2291             torch.cuda.synchronize()
2292             torch.cuda.empty_cache()
2295         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2298         torch.cuda.empty_cache()
2302         s = torch.cuda.Stream()
2305             a = torch.ones((size,), device="cuda")
2307             g0 = torch.cuda.CUDAGraph()
2308             g1 = torch.cuda.CUDAGraph()
2309             g2 = torch.cuda.CUDAGraph()
2311             s.wait_stream(torch.cuda.current_stream())
2312             with torch.cuda.stream(s):
2314                     (torch.cuda.graph_pool_handle(),)
2334             torch.cuda.current_stream().wait_stream(s)
2361             torch.cuda.synchronize()
2362             torch.cuda.empty_cache()
2366         "CUDA >= 11.0 or ROCM >= 5.3 required for graphs",
2405         torch.cuda.empty_cache()
2407         s = torch.cuda.Stream()
2425             g = torch.cuda.CUDAGraph()
2426             s.wait_stream(torch.cuda.current_stream())
2427             with torch.cuda.stream(s):
2431                 a = torch.ones((numel,), device="cuda")
2433                 precapture_stats = torch.cuda.memory_stats()
2440             torch.cuda.current_stream().wait_stream(s)
2444             postcapture_stats = torch.cuda.memory_stats()
2483                     torch.cuda.empty_cache()
2487             torch.cuda.empty_cache()
2488             postdel_stats = torch.cuda.memory_stats()
2531             torch.cuda.synchronize()
2532             torch.cuda.empty_cache()
2535         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2539 …# "Q. Why skip process_events if a capture might be underway?" in c10/cuda/CUDACachingAllocator.cpp
2540         torch.cuda.empty_cache()
2542         potential_problem = torch.zeros((3,), device="cuda")
2543         a = torch.zeros((3,), device="cuda")
2544         s0 = torch.cuda.Stream()
2545         s1 = torch.cuda.Stream()
2546         s2 = torch.cuda.Stream()
2547         g = torch.cuda.CUDAGraph()
2549         torch.cuda.synchronize()
2550         with torch.cuda.stream(s0):
2552             torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES)
2556         with torch.cuda.stream(s1):
2565             with torch.cuda.stream(s2):
2571         torch.cuda.synchronize()
2574         c = torch.zeros((3,), device="cuda")
2578         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2586 …# Tests the interaction of cuda graph capture with DropoutState's syncs in ATen/native/cudnn/RNN.c…
2589         torch.cuda.empty_cache()
2591         model = torch.nn.LSTM(512, 512, 2, dropout=0.5).cuda()
2592         x = torch.ones(100, 192, 512, device="cuda")
2596         g = torch.cuda.CUDAGraph()
2597         s = torch.cuda.Stream()
2598         s.wait_stream(torch.cuda.current_stream())
2599         with torch.cuda.stream(s):
2603         torch.cuda.current_stream().wait_stream(s)
2610         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2631         torch.cuda.manual_seed(5)
2640                 ).cuda()
2643                 ).cuda()
2654                 ).cuda()
2657                 ).cuda()
2673             model_section1 = MLP1(D_in, H, H).cuda()
2674             model_section2 = MLP2(H, H, D_out).cuda()
2675             model_section3 = ParameterlessModule().cuda()
2688         x = torch.randn(N, D_in, device="cuda")
2689         h = torch.randn(N, H, device="cuda", requires_grad=True)
2690         h2 = torch.randn(N, D_out, device="cuda", requires_grad=True)
2691         unused_input = torch.randn(N, H, device="cuda", requires_grad=True)
2692         y_pred = torch.randn(N, D_out, device="cuda", requires_grad=True)
2693         y = torch.randn(N, D_out, device="cuda")
2700             device_type="cuda", enabled=with_amp, cache_enabled=cache_enabled
2708             ) = torch.cuda.make_graphed_callables(
2738             torch.cuda.manual_seed(5)
2742                     device_type="cuda", enabled=with_amp, cache_enabled=cache_enabled
2761         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2782         torch.cuda.manual_seed(5)
2798             model_section1 = ParameterlessModule().cuda()
2806         x = torch.randn(N, D_in, device="cuda", requires_grad=False)
2807         unused_input = torch.randn(N, H, device="cuda", requires_grad=False)
2808         y_pred = torch.randn(N, D_in, device="cuda", requires_grad=False)
2809         y = torch.randn(N, D_in, device="cuda")
2813             device_type="cuda", enabled=with_amp, cache_enabled=cache_enabled
2815             model_graphed[0] = torch.cuda.make_graphed_callables(
2828             torch.cuda.manual_seed(5)
2831                     device_type="cuda", enabled=with_amp, cache_enabled=cache_enabled
2843         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2847         torch.cuda.manual_seed(5)
2856                 ).cuda()
2859         mempool = torch.cuda.graph_pool_handle()
2862             x = torch.randn([64, 32], device="cuda")
2864             graphed_model = torch.cuda.make_graphed_callables(
2870             x = torch.randn([64, 32], device="cuda")
2890             params = [torch.randn((i + 5, i + 5), device="cuda") for i in range(2)] + [
2891                 torch.randn((), device="cuda")
2920                 g = torch.cuda.CUDAGraph()
2921                 with torch.cuda.graph(g):
2940         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2960                 torch.nn.Parameter(torch.ones(1, device="cuda")) for _ in range(2)
2963                 torch.nn.Parameter(torch.ones(1, device="cuda")) for _ in range(2)
2994             g = torch.cuda.CUDAGraph()
2996                 with self.assertRaisesRegex(RuntimeError, "Attempting CUDA graph"):
2997                     with torch.cuda.graph(g):
3000                 with torch.cuda.graph(g):
3011         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
3015             x = torch.zeros([2000], device="cuda")
3024             stream = torch.cuda.Stream()
3026                 with torch.cuda.stream(stream):
3027                     mem = torch.cuda.caching_allocator_alloc(1024)
3032                 torch.cuda.caching_allocator_delete(mem)
3039             graph = torch.cuda.CUDAGraph()
3040             torch.cuda.synchronize()
3041             stream = torch.cuda.Stream()
3042             stream.wait_stream(torch.cuda.current_stream())
3043             with torch.cuda.stream(stream):
3046             torch.cuda.current_stream().wait_stream(stream)
3047             torch.cuda.synchronize()
3049                 with torch.cuda.graph(
3058                     torch.cuda.caching_allocator_delete(mem)
3070         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
3073         segments = torch.cuda.memory_snapshot()
3075         x = torch.randn(10240000, device="cuda")
3077         g = torch.cuda.CUDAGraph()
3078         s0 = torch.cuda.Stream()
3079         s1 = torch.cuda.Stream()
3080         s0.wait_stream(torch.cuda.current_stream())
3081         with torch.cuda.stream(s0):
3084         with torch.cuda.stream(s1):
3088         with torch.cuda.stream(s0):
3090         segments = torch.cuda.memory_snapshot()
3100         input = torch.randn(1, 3, 3, 3, device="cuda")
3103             mean=torch.ones(2, 3, device="cuda"),
3104             invstd=torch.ones(2, 3, device="cuda"),
3111         self.assertEqual(mean, torch.ones(3, device="cuda"))
3112         self.assertEqual(invstd, torch.ones(3, device="cuda"))
3116             torch.cuda.synchronize()
3117             val = torch.cuda.max_memory_allocated()
3118             torch.cuda.reset_peak_memory_stats()
3121         a = torch.rand(1, 32, 32, device="cuda")
3122         b = torch.rand(24, 32, 1, device="cuda")
3179         cuda = cpu.cuda()
3183             cpu @ cuda
3187             cuda @ cpu
3189         for s, m1, m2 in product((cpu, cuda), repeat=3):
3200         """Validate that no CUDA calls are made during `import torch` call"""
3212 … = f"import os; import torch;os.environ['{VISIBLE_DEVICES}']='32';print(torch.cuda.device_count())"
3228     @unittest.skipIf(not TEST_WITH_ROCM, "not relevant for CUDA testing")
3230         """Validate device_count works with both CUDA/HIP visible devices"""
3234 print(f"{torch.cuda.device_count()}")
3264 r1 = torch.cuda.device_count()
3266 r2 = torch.cuda.device_count()
3267 torch.empty(10, device='cuda')
3277         x = torch.cuda.device_count()
3288                 file = torch.cuda.gds._GdsFile(f, os.O_CREAT | os.O_RDWR)
3291 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
3299             torch.cuda.memory.empty_cache()
3300             torch.cuda.memory._record_memory_history("state", stacks="python")
3302             torch.rand(2 * 311, 411, device="cuda")
3303             unused = torch.rand(310, 410, device="cuda")
3304             x = torch.rand(311, 411, device="cuda")
3311             tensors = [torch.rand(128, device="cuda") for _ in range(1000)]
3316             torch.rand(128 * 5, device="cuda")
3318             ss = torch.cuda.memory._snapshot()
3331                     torch.cuda.memory._save_segment_usage(f.name)
3336             torch.cuda.empty_cache()
3337             ss = torch.cuda.memory._snapshot()
3344             torch.cuda.memory._record_memory_history(None)
3362             torch.cuda.memory.empty_cache()
3363             torch.cuda.memory._record_memory_history("state", stacks="all")
3364             x = torch.rand(311, 411, device="cuda")
3366             ss = torch.cuda.memory._snapshot()["segments"]
3376             torch.cuda.memory._record_memory_history(None)
3383             x = torch.rand(128, 128, device="cuda")
3413                 x = torch.empty(3, 4, device="cuda")
3444                 torch.cuda.memory.empty_cache()
3445                 torch.cuda.memory._record_memory_history(
3450                     x = torch.rand(128, 128, device="cuda")
3456                 ss = torch.cuda.memory._snapshot()
3467                 torch.cuda.memory._record_memory_history(None)
3476                 torch.cuda.memory.empty_cache()
3477                 torch.cuda.memory._record_memory_history(context=context)
3482                     x = torch.rand(3, 4, device="cuda")
3490                 ss = json.dumps(torch.cuda.memory._snapshot())
3494                 torch.cuda.memory._record_memory_history(None)
3502             torch.cuda.memory.empty_cache()
3507                 x = torch.rand(4, 4, device="cuda")
3511                 x = torch.rand(3, 4, device="cuda")
3515                 x = torch.rand(4, 4, device="cuda")
3518             torch.cuda.memory._record_memory_history(context="all", stacks="python")
3521             torch.cuda.memory._record_memory_history(context=None)
3524             torch.cuda.memory._record_memory_history(context="all", stacks="python")
3527             ss = json.dumps(torch.cuda.memory._snapshot())
3532             torch.cuda.memory._record_memory_history(None)
3541                 torch.cuda.memory.empty_cache()
3542                 torch.cuda.memory._record_memory_history(context=context)
3543                 x = torch.rand(3, 4, device="cuda")
3545                 torch.cuda.memory.empty_cache()
3547                 ss = json.dumps(torch.cuda.memory._snapshot())
3550                 torch.cuda.memory._record_memory_history(None)
3557             torch.cuda.memory.empty_cache()
3558             torch.cuda.memory._record_memory_history("state", stacks="python")
3562                 return torch.rand(311, 411, device="cuda")
3566             ss = torch.cuda.memory._snapshot()["segments"]
3576             torch.cuda.memory._record_memory_history(None)
3579         torch.cuda.memory.empty_cache()
3581         _, all_memory = torch.cuda.memory.mem_get_info()
3585         torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
3588             return torch.ones(n * mb, dtype=torch.int8, device="cuda")
3590         torch.cuda.memory._set_allocator_settings(
3594         torch.cuda.memory._set_allocator_settings(
3598         torch.cuda.memory._set_allocator_settings(
3609         torch.cuda.memory.empty_cache()
3611         _, all_memory = torch.cuda.memory.mem_get_info()
3615         torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
3618             return torch.ones(n * mb, dtype=torch.int8, device="cuda")
3620         torch.cuda.memory._set_allocator_settings(
3624         torch.cuda.memory._set_allocator_settings(
3647         torch.cuda.memory.empty_cache()
3661         start_mem = torch.cuda.memory_stats()[key_allocated]
3662         torch.cuda.memory._set_allocator_settings("")
3663         x = torch.rand(nelems, device="cuda")
3666         reg_mem = torch.cuda.memory_stats()[key_allocated]
3667         start_requested = torch.cuda.memory_stats()[key_requested]
3668         torch.cuda.memory._set_allocator_settings("roundup_power2_divisions:4")
3669         y = torch.rand(nelems, device="cuda")
3671         pow2_div4_mem = torch.cuda.memory_stats()[key_allocated]
3672         current_requested = torch.cuda.memory_stats()[key_requested]
3680         torch.cuda.memory._set_allocator_settings("garbage_collection_threshold:0.5")
3681         torch.cuda.memory._set_allocator_settings(
3686         torch.cuda.memory.empty_cache()
3687         start_mem = torch.cuda.memory_stats()[key_allocated]
3688         z = torch.rand(nelems, device="cuda")
3689         reg_mem = torch.cuda.memory_stats()[key_allocated]
3693         torch.cuda.memory.empty_cache()
3694         torch.cuda.memory._set_allocator_settings(
3697         start_mem = torch.cuda.memory_stats()[key_allocated]
3698         w = torch.rand(nelems, device="cuda")
3700         pow2_div8_mem = torch.cuda.memory_stats()[key_allocated]
3705         torch.cuda.memory.empty_cache()
3706         start_mem = torch.cuda.memory_stats()[key_allocated]
3707         v = torch.rand(nelems_big, device="cuda")
3709         pow2_div2_mem = torch.cuda.memory_stats()[key_allocated]
3714         torch.cuda.memory.empty_cache()
3715         torch.cuda.memory._set_allocator_settings("release_lock_on_cudamalloc:True")
3716         start_mem = torch.cuda.memory_stats()[key_allocated]
3717         w = torch.rand(nelems, device="cuda")
3718         reg_mem = torch.cuda.memory_stats()[key_allocated]
3722             torch.cuda.memory._set_allocator_settings("foo:1,bar:2")
3725             torch.cuda.memory._set_allocator_settings(
3730             torch.cuda.memory._set_allocator_settings("max_split_size_mb:2")
3733             torch.cuda.memory._set_allocator_settings("release_lock_on_cudamalloc:none")
3736             torch.cuda.memory._set_allocator_settings(
3741             torch.cuda.memory._set_allocator_settings(
3746             torch.cuda.memory._set_allocator_settings(
3756             torch.cuda.memory._set_allocator_settings("max_split_size_mb:1024")
3757             torch.cuda.memory.empty_cache()
3758         with self.assertRaises(torch.cuda.OutOfMemoryError):
3759             torch.empty(1024 * 1024 * 1024 * 1024, device="cuda")
3771         #include <torch/csrc/cuda/memory_snapshot.h>
3773             std::string data = torch::cuda::_memory_snapshot_pickled();
3777             torch::cuda::_record_memory_history(e, ctx, 10, ctx, ctx);
3789                     return torch.rand(311, 411, device="cuda")
3826         with self.assertRaises(torch.cuda.OutOfMemoryError):
3827             torch.empty(1024 * 1024 * 1024 * 1024, device="cuda")
3843                 mem.append((c, torch.full((b,), c, dtype=torch.int32, device="cuda")))
3854             choices = [alloc, free, torch.cuda.memory.empty_cache]
3866             self.assertTrue(torch.cuda._get_pynvml_handler() is not None)
3868             self.assertTrue(torch.cuda._get_amdsmi_handler() is not None)
3872         self.assertTrue(0 <= torch.cuda.temperature() <= 150)
3876         self.assertTrue(torch.cuda.power_draw() >= 0)
3880         self.assertTrue(torch.cuda.clock_rate() >= 0)
3890     segments = torch.cuda.memory_snapshot()
3895     segments = torch.cuda.memory_snapshot()
3901         raise unittest.SkipTest("cuda graph test is skipped")
3903     torch.cuda.synchronize()
3904     stream = torch.cuda.Stream()
3905     stream.wait_stream(torch.cuda.current_stream())
3906     with torch.cuda.stream(stream):
3909     torch.cuda.current_stream().wait_stream(stream)
3910     torch.cuda.synchronize()
3912     graph = torch.cuda.CUDAGraph()
3913     with torch.cuda.graph(graph, stream=stream, pool=pool):
3920     return torch.ones([size], device="cuda", dtype=torch.uint8)
4032         torch.cuda.synchronize()
4034         torch.cuda.empty_cache()
4042             x = torch.zeros([SMALL_SIZE * 8], device="cuda", dtype=torch.uint8)
4146         inp = torch.tensor([1], device="cuda")
4206         m = m.cuda()
4208         inp = torch.rand([1, 3, 255, 255], device="cuda")
4213             return torch.ones([4], device="cuda")
4215         pool = torch.cuda.graph_pool_handle()
4233             return torch.rand([4], device="cuda")
4235         pool = torch.cuda.graph_pool_handle()
4243             Context manager to use cuda graph pool for new allocations. If you use this manager
4247             torch.cuda.synchronize()
4248             stream = torch.cuda.Stream()
4249             stream.wait_stream(torch.cuda.current_stream())
4250             stream_context = torch.cuda.stream(stream)
4298         torch.cuda.synchronize()
4300         torch.cuda.empty_cache()
4306 …  script = "import sys; import torch; torch.rand(2, device='cuda'); print('triton' in sys.modules)"
4321 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
4324         pool1 = torch.cuda.graph_pool_handle()
4325         pool2 = torch.cuda.MemPool().id
4330         # each call to torch.cuda.graph_pool_handle() or torch.cuda.MemPool()
4335         pool = torch.cuda.MemPool()
4344         #include <ATen/cuda/Exceptions.h>
4374         allocator = torch.cuda.memory.CUDAPluggableAllocator(
4379         pool = torch.cuda.MemPool(allocator.allocator())
4389         with torch.cuda.use_mem_pool(pool):
4390             out = torch.randn(1, device="cuda")
4397         active_pool = torch.cuda.MemPoolContext.active_pool()
4402         pool = torch.cuda.MemPool()
4403         ctx = torch.cuda.MemPoolContext(pool)
4404         active_pool = torch.cuda.MemPoolContext.active_pool()
4410         active_pool = torch.cuda.MemPoolContext.active_pool()
4420             pool = torch.cuda.MemPool()
4423             ctx = torch.cuda.MemPoolContext(pool)
4424             active_pool = torch.cuda.MemPoolContext.active_pool()
4447 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
4455         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >=5.3 required for graphs"
4508                     g = torch.cuda.CUDAGraph()
4509                     with torch.cuda.graph(g):
4529         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
4535             if "fused" in optim.supported_impls and "cuda" in optim.supports_fused_on
4568                 scaler_for_control = torch.cuda.amp.GradScaler(init_scale=128.0)
4572                 scaler_for_graphed = torch.cuda.amp.GradScaler()
4600                     g = torch.cuda.CUDAGraph()
4601                     with torch.cuda.graph(g):
4703         [optim for optim in optim_db if "cuda" in optim.supports_fused_on],
4709         weight = torch.ones((5, 5), device="cuda", requires_grad=True)
4723         opt.grad_scale = torch.Tensor([3]).cuda()
4731         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
4738             if "foreach" in optim.supported_impls and "cuda" in optim.supports_fused_on
4743         torch.cuda.empty_cache()
4745         scaler = torch.amp.GradScaler(device="cuda", init_scale=4.0)
4746         g = torch.cuda.CUDAGraph()
4747         s = torch.cuda.Stream()
4749         weight = torch.ones((100,), device="cuda", requires_grad=True)
4755         s = torch.cuda.Stream()
4756         s.wait_stream(torch.cuda.current_stream())
4757         with torch.cuda.stream(s):
4760         torch.cuda.current_stream().wait_stream(s)
4765         with torch.cuda.stream(s):
4790 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
4807         src1 = torch.randn(1024, device="cuda")
4808         src2 = torch.randn(2, 1024, device="cuda")
4809         torch.cuda.gds._gds_register_buffer(src1.untyped_storage())
4810         torch.cuda.gds._gds_register_buffer(src2.untyped_storage())
4811         dest1 = torch.empty(1024, device="cuda")
4812         dest2 = torch.empty(2, 1024, device="cuda")
4814             file = torch.cuda.gds._GdsFile(f, os.O_CREAT | os.O_RDWR)
4821         torch.cuda.gds._gds_deregister_buffer(src1.untyped_storage())
4822         torch.cuda.gds._gds_deregister_buffer(src2.untyped_storage())
4825 @unittest.skipIf(not TEST_CUDA, "CUDA not available, skipping tests")
4829         self.autocast_lists = AutocastTestLists(torch.device("cuda:0"))
4845                         op, args, torch.float16, device="cuda", amp_dtype=torch.float16
4859                     or torch.cuda.get_device_capability() < (8, 0)
4869                                 op, args, torch.bfloat16, device="cuda"
4872                         if torch.cuda.is_bf16_supported():
4874                                 op, args, torch.bfloat16, device="cuda"
4881                                     op, args, torch.bfloat16, device="cuda"
4892                 device="cuda",
4901                 op, args, torch.float32, device="cuda", amp_dtype=torch.float16
4911                 device="cuda",
4924                     device="cuda",
4933                 if torch.cuda.is_bf16_supported():
4935                         op, args, torch.bfloat16, device="cuda", module=torch._C._nn
4942                             op, args, torch.bfloat16, device="cuda", module=torch._C._nn
4952                 device="cuda",
4965                     device="cuda",
4978                     device="cuda",
4990                 device="cuda",
5002                 device="cuda",
5009         with torch.autocast("cuda"):
5015         with torch.autocast("cuda"):
5017                 a_ignore = torch.ones((8, 8), dtype=ignore_type, device="cuda:0")
5018                 b_ignore = torch.ones((8, 8), dtype=ignore_type, device="cuda:0")
5019                 c_16 = torch.ones((8, 8), dtype=torch.float16, device="cuda:0")
5026                     with torch.autocast("cuda", enabled=False):
5033                 with torch.autocast("cuda", enabled=False):
5038                 with torch.autocast("cuda", enabled=False):
5045                     with torch.autocast("cuda", enabled=False):
5052             @torch.amp.custom_fwd(device_type="cuda")
5061             @torch.amp.custom_bwd(device_type="cuda")
5071         x = torch.randn((8, 8), device="cuda", dtype=torch.float32, requires_grad=True)
5072         y = torch.randn((8, 8), device="cuda", dtype=torch.float32, requires_grad=True)
5076             with torch.cuda.amp.autocast(dtype=dtype):
5085             @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
5095             @torch.amp.custom_bwd(device_type="cuda")
5103         x = torch.randn((8, 8), device="cuda", dtype=torch.float16, requires_grad=True)
5111                     (8, 8), device="cuda", dtype=torch.float16, requires_grad=False
5116         with torch.autocast("cuda"):
5133                 @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
5140                 @torch.cuda.amp.custom_bwd
5147             str(w[0].message), r"`torch.cuda.amp.custom_fwd\(args...\)` is deprecated."
5150             str(w[1].message), r"`torch.cuda.amp.custom_bwd\(args...\)` is deprecated."
5156         with torch.amp.autocast("cuda"):
5177         with torch.autocast("cuda", enabled=True):
5216                     x = torch.randn((T, B, F), device="cuda", dtype=input_dtype)
5219                     x = torch.randn((B, T, F), device="cuda", dtype=input_dtype)
5223                         torch.randn((T, B, F), device="cuda", dtype=input_dtype),
5237                     .cuda()
5248                     device="cuda",
5254                         device="cuda",
5259                 with torch.autocast("cuda"):
5311         linear = torch.nn.Linear(10, 10).to("cuda")
5312         data = torch.randn(1, 10, device="cuda")
5314         with torch.autocast("cuda"):
5317                 first_iter_mem = torch.cuda.memory_allocated()
5320                 self.assertTrue(first_iter_mem == torch.cuda.memory_allocated())
5325         ).cuda()
5327             (8, 8), device="cuda", dtype=torch.float16, requires_grad=True
5330             with torch.autocast("cuda"):
5339 …r"`torch.cuda.amp.autocast\(args...\)` is deprecated. Please use `torch.amp.autocast\('cuda', args…
5341             with torch.cuda.amp.autocast():