xref: /aosp_15_r20/external/pytorch/aten/src/ATen/test/cuda_generator_test.cu (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #include <gtest/gtest.h>
2 
3 #include <ATen/ATen.h>
4 #include <ATen/cuda/CUDAContext.h>
5 #include <ATen/cuda/CUDAGeneratorImpl.h>
6 #include <c10/cuda/CUDAException.h>
7 #include <c10/cuda/CUDAFunctions.h>
8 #include <ATen/core/PhiloxRNGEngine.h>
9 #include <cuda.h>
10 #include <cuda_fp16.h>
11 #include <cuda_runtime.h>
12 
13 #include <assert.h>
14 #include <thread>
15 
16 using namespace at;
17 
18 /*
19 * Philox Engine Tests
20 */
21 
testEngineReproducibility()22 __global__ void testEngineReproducibility(){
23   int idx = blockIdx.x * blockDim.x + threadIdx.x;
24   at::Philox4_32 engine1(0, idx, 4);
25   at::Philox4_32 engine2(0, idx, 4);
26   assert(engine1() == engine2());
27 }
28 
test_engine_reproducibility()29 void test_engine_reproducibility(){
30   testEngineReproducibility<<<1, 1>>>();
31   C10_CUDA_KERNEL_LAUNCH_CHECK();
32 }
33 
TEST(CUDAGeneratorImpl,TestPhiloxEngineReproducibility)34 TEST(CUDAGeneratorImpl, TestPhiloxEngineReproducibility) {
35   // Test Description:
36   //   Tests if same inputs give same results.
37   //   launch one thread and create two engines.
38   //   Given same seed, idx and offset, assert that the engines
39   //   should be aligned and have the same sequence.
40   if (!at::cuda::is_available()) return;
41   test_engine_reproducibility();
42   cudaError_t err = cudaDeviceSynchronize();
43   bool isEQ = err == cudaSuccess;
44   ASSERT_TRUE(isEQ);
45 }
46 
testEngineOffset1()47 __global__ void testEngineOffset1(){
48   at::Philox4_32 engine1(123, 1, 0);
49   // Note: offset is a multiple of 4.
50   // So if you want to skip 8 values, offset would
51   // be 2, since 2*4=8.
52   at::Philox4_32 engine2(123, 1, 2);
53   for(int i = 0; i < 8; i++){
54     // Note: instead of using the engine() call 8 times
55     // we could have achieved the same functionality by
56     // calling the incr() function twice.
57     engine1();
58   }
59   assert(engine1() == engine2());
60 }
61 
test_engine_offset1()62 void test_engine_offset1(){
63   testEngineOffset1<<<1, 1>>>();
64   C10_CUDA_KERNEL_LAUNCH_CHECK();
65 }
66 
TEST(CUDAGeneratorImpl,TestPhiloxEngineOffset1)67 TEST(CUDAGeneratorImpl, TestPhiloxEngineOffset1) {
68   // Test Description:
69   //   Tests offsetting in same thread.
70   //   launch one thread and create two engines.
71   //   make one engine skip the first 8 values and
72   //   make another engine increment to until the
73   //   first 8 values. Assert that the first call
74   //   of engine2 and the 9th call of engine1 are equal.
75   if (!at::cuda::is_available()) return;
76   test_engine_offset1();
77   cudaError_t err = cudaDeviceSynchronize();
78   bool isEQ = err == cudaSuccess;
79   ASSERT_TRUE(isEQ);
80 }
81 
testEngineOffset2()82 __global__ void testEngineOffset2(){
83   unsigned long long increment_val = ::ldexp(1.0, 64);
84   at::Philox4_32 engine1(123, 0, increment_val);
85   at::Philox4_32 engine2(123, increment_val, increment_val);
86 
87   engine2.incr_n(increment_val);
88   engine2.incr();
89   assert(engine1() == engine2());
90 }
91 
test_engine_offset2()92 void test_engine_offset2(){
93   testEngineOffset2<<<1, 1>>>();
94   C10_CUDA_KERNEL_LAUNCH_CHECK();
95 }
96 
TEST(CUDAGeneratorImpl,TestPhiloxEngineOffset2)97 TEST(CUDAGeneratorImpl, TestPhiloxEngineOffset2) {
98   // Test Description:
99   //   Tests edge case at the end of the 2^190th value of the generator.
100   //   launch one thread and create two engines
101   //   make engine1 skip to the 2^64th 128 bit while being at thread 0
102   //   make engine2 skip to the 2^64th 128 bit while being at 2^64th thread
103   //   Assert that engine2 should be increment_val+1 steps behind engine1.
104   if (!at::cuda::is_available()) return;
105   test_engine_offset2();
106   cudaDeviceSynchronize();
107   bool isEQ = cudaGetLastError() == cudaSuccess;
108   ASSERT_TRUE(isEQ);
109 }
110 
testEngineOffset3()111 __global__ void testEngineOffset3(){
112   unsigned long long increment_val = ::ldexp(1.0, 64);
113   at::Philox4_32 engine1(123, 0, increment_val);
114   at::Philox4_32 engine2(123, 1, 0);
115   engine1.incr();
116   assert(engine1() == engine2());
117 }
118 
test_engine_offset3()119 void test_engine_offset3(){
120   testEngineOffset2<<<1, 1>>>();
121   C10_CUDA_KERNEL_LAUNCH_CHECK();
122 }
123 
TEST(CUDAGeneratorImpl,TestPhiloxEngineOffset3)124 TEST(CUDAGeneratorImpl, TestPhiloxEngineOffset3) {
125   // Test Description:
126   //   Tests edge case in between threads.
127   //   launch one thread and create two engines
128   //   make engine1 skip to the 2^64th 128 bit while being at thread 0
129   //   start engine2 at thread 1, with offset 0
130   //   Assert that engine1 is 1 step behind engine2.
131   if (!at::cuda::is_available()) return;
132   test_engine_offset3();
133   cudaDeviceSynchronize();
134   bool isEQ = cudaGetLastError() == cudaSuccess;
135   ASSERT_TRUE(isEQ);
136 }
137 
testEngineThreadIndex()138 __global__ void testEngineThreadIndex(){
139   at::Philox4_32 engine1(123456, 0, 4);
140   at::Philox4_32 engine2(123456, 1, 4);
141   assert(engine1() != engine2());
142 }
143 
test_engine_thread_index()144 void test_engine_thread_index(){
145   testEngineThreadIndex<<<1, 1>>>();
146   C10_CUDA_KERNEL_LAUNCH_CHECK();
147 }
148 
TEST(CUDAGeneratorImpl,TestPhiloxEngineIndex)149 TEST(CUDAGeneratorImpl, TestPhiloxEngineIndex) {
150   // Test Description:
151   //   Tests if thread indexing is working properly.
152   //   launch one thread and create two engines
153   //   with different thread index but same offset.
154   //   Assert that the engines have different sequences.
155   if (!at::cuda::is_available()) return;
156   test_engine_thread_index();
157   cudaDeviceSynchronize();
158   bool isEQ = cudaGetLastError() == cudaSuccess;
159   ASSERT_TRUE(isEQ);
160 }
161 
162 /*
163 * CUDA Generator Tests
164 */
165 
TEST(CUDAGeneratorImpl,TestGeneratorDynamicCast)166 TEST(CUDAGeneratorImpl, TestGeneratorDynamicCast) {
167   //  Test Description: Check dynamic cast for CUDA
168   if (!at::cuda::is_available()) return;
169   auto foo = at::cuda::detail::createCUDAGenerator();
170   auto result = foo.get<CUDAGeneratorImpl>();
171   ASSERT_EQ(typeid(at::CUDAGeneratorImpl*).hash_code(), typeid(result).hash_code());
172 }
173 
TEST(CUDAGeneratorImpl,TestDefaultGenerator)174 TEST(CUDAGeneratorImpl, TestDefaultGenerator) {
175   // Test Description:
176   // Check if default generator state is created only once
177   // address of generator should be same in all calls
178   if (!at::cuda::is_available()) return;
179   auto foo = at::cuda::detail::getDefaultCUDAGenerator();
180   auto bar = at::cuda::detail::getDefaultCUDAGenerator();
181   ASSERT_EQ(foo, bar);
182 
183   if (c10::cuda::device_count() >= 2) {
184     foo = at::cuda::detail::getDefaultCUDAGenerator(1);
185     bar = at::cuda::detail::getDefaultCUDAGenerator(1);
186     ASSERT_EQ(foo, bar);
187 
188     foo = at::cuda::detail::getDefaultCUDAGenerator(0);
189     bar = at::cuda::detail::getDefaultCUDAGenerator(1);
190     ASSERT_NE(foo, bar);
191   }
192 }
193 
TEST(CUDAGeneratorImpl,TestCloning)194 TEST(CUDAGeneratorImpl, TestCloning) {
195   // Test Description:
196   // Check cloning of new generators.
197   // Note that we don't allow cloning of other
198   // generator states into default generators.
199   if (!at::cuda::is_available()) return;
200   auto gen1 = at::cuda::detail::createCUDAGenerator();
201   gen1.set_current_seed(123); // modify gen1 state
202   auto cuda_gen1 = check_generator<CUDAGeneratorImpl>(gen1);
203   cuda_gen1->set_philox_offset_per_thread(4);
204   auto gen2 = at::cuda::detail::createCUDAGenerator();
205   gen2 = gen1.clone();
206   auto cuda_gen2 = check_generator<CUDAGeneratorImpl>(gen2);
207   ASSERT_EQ(gen1.current_seed(), gen2.current_seed());
208   ASSERT_EQ(
209     cuda_gen1->philox_offset_per_thread(),
210     cuda_gen2->philox_offset_per_thread()
211   );
212 }
213 
thread_func_get_set_current_seed(Generator generator)214 void thread_func_get_set_current_seed(Generator generator) {
215   std::lock_guard<std::mutex> lock(generator.mutex());
216   auto current_seed = generator.current_seed();
217   current_seed++;
218   generator.set_current_seed(current_seed);
219 }
220 
TEST(CUDAGeneratorImpl,TestMultithreadingGetSetCurrentSeed)221 TEST(CUDAGeneratorImpl, TestMultithreadingGetSetCurrentSeed) {
222   // Test Description:
223   // Test current seed getter and setter are thread safe
224   // See Note [Acquire lock when using random generators]
225   if (!at::cuda::is_available()) return;
226   auto gen1 = at::cuda::detail::getDefaultCUDAGenerator();
227   auto initial_seed = gen1.current_seed();
228   std::thread t0{thread_func_get_set_current_seed, gen1};
229   std::thread t1{thread_func_get_set_current_seed, gen1};
230   std::thread t2{thread_func_get_set_current_seed, gen1};
231   t0.join();
232   t1.join();
233   t2.join();
234   ASSERT_EQ(gen1.current_seed(), initial_seed+3);
235 }
236 
TEST(CUDAGeneratorImpl,TestRNGForking)237 TEST(CUDAGeneratorImpl, TestRNGForking) {
238   // Test Description:
239   // Test that state of a generator can be frozen and
240   // restored
241   // See Note [Acquire lock when using random generators]
242   if (!at::cuda::is_available()) return;
243   auto default_gen = at::cuda::detail::getDefaultCUDAGenerator();
244   auto current_gen = at::cuda::detail::createCUDAGenerator();
245   {
246     std::lock_guard<std::mutex> lock(default_gen.mutex());
247     current_gen = default_gen.clone(); // capture the current state of default generator
248   }
249   auto target_value = at::randn({1000}, at::kCUDA);
250   // Dramatically alter the internal state of the main generator
251   auto x = at::randn({100000}, at::kCUDA);
252   auto forked_value = at::randn({1000}, current_gen, at::kCUDA);
253   ASSERT_EQ(target_value.sum().item<double>(), forked_value.sum().item<double>());
254 }
255 
makeRandomNumber()256 void makeRandomNumber() {
257   cudaSetDevice(std::rand() % 2);
258   auto x = at::randn({1000});
259 }
260 
testCudaRNGMultithread()261 void testCudaRNGMultithread() {
262   auto threads = std::vector<std::thread>();
263   for (auto i = 0; i < 1000; i++) {
264     threads.emplace_back(makeRandomNumber);
265   }
266   for (auto& t : threads) {
267     t.join();
268   }
269 };
270 
TEST(CUDAGeneratorImpl,TestMultithreadRNG)271 TEST(CUDAGeneratorImpl, TestMultithreadRNG) {
272   if (!at::cuda::is_available()) return;
273   testCudaRNGMultithread();
274 }
275