1 #include <gtest/gtest.h>
2
3 #include <ATen/ATen.h>
4 #include <ATen/cuda/CUDAContext.h>
5 #include <ATen/cuda/CUDAGeneratorImpl.h>
6 #include <c10/cuda/CUDAException.h>
7 #include <c10/cuda/CUDAFunctions.h>
8 #include <ATen/core/PhiloxRNGEngine.h>
9 #include <cuda.h>
10 #include <cuda_fp16.h>
11 #include <cuda_runtime.h>
12
13 #include <assert.h>
14 #include <thread>
15
16 using namespace at;
17
18 /*
19 * Philox Engine Tests
20 */
21
testEngineReproducibility()22 __global__ void testEngineReproducibility(){
23 int idx = blockIdx.x * blockDim.x + threadIdx.x;
24 at::Philox4_32 engine1(0, idx, 4);
25 at::Philox4_32 engine2(0, idx, 4);
26 assert(engine1() == engine2());
27 }
28
test_engine_reproducibility()29 void test_engine_reproducibility(){
30 testEngineReproducibility<<<1, 1>>>();
31 C10_CUDA_KERNEL_LAUNCH_CHECK();
32 }
33
TEST(CUDAGeneratorImpl,TestPhiloxEngineReproducibility)34 TEST(CUDAGeneratorImpl, TestPhiloxEngineReproducibility) {
35 // Test Description:
36 // Tests if same inputs give same results.
37 // launch one thread and create two engines.
38 // Given same seed, idx and offset, assert that the engines
39 // should be aligned and have the same sequence.
40 if (!at::cuda::is_available()) return;
41 test_engine_reproducibility();
42 cudaError_t err = cudaDeviceSynchronize();
43 bool isEQ = err == cudaSuccess;
44 ASSERT_TRUE(isEQ);
45 }
46
testEngineOffset1()47 __global__ void testEngineOffset1(){
48 at::Philox4_32 engine1(123, 1, 0);
49 // Note: offset is a multiple of 4.
50 // So if you want to skip 8 values, offset would
51 // be 2, since 2*4=8.
52 at::Philox4_32 engine2(123, 1, 2);
53 for(int i = 0; i < 8; i++){
54 // Note: instead of using the engine() call 8 times
55 // we could have achieved the same functionality by
56 // calling the incr() function twice.
57 engine1();
58 }
59 assert(engine1() == engine2());
60 }
61
test_engine_offset1()62 void test_engine_offset1(){
63 testEngineOffset1<<<1, 1>>>();
64 C10_CUDA_KERNEL_LAUNCH_CHECK();
65 }
66
TEST(CUDAGeneratorImpl,TestPhiloxEngineOffset1)67 TEST(CUDAGeneratorImpl, TestPhiloxEngineOffset1) {
68 // Test Description:
69 // Tests offsetting in same thread.
70 // launch one thread and create two engines.
71 // make one engine skip the first 8 values and
72 // make another engine increment to until the
73 // first 8 values. Assert that the first call
74 // of engine2 and the 9th call of engine1 are equal.
75 if (!at::cuda::is_available()) return;
76 test_engine_offset1();
77 cudaError_t err = cudaDeviceSynchronize();
78 bool isEQ = err == cudaSuccess;
79 ASSERT_TRUE(isEQ);
80 }
81
testEngineOffset2()82 __global__ void testEngineOffset2(){
83 unsigned long long increment_val = ::ldexp(1.0, 64);
84 at::Philox4_32 engine1(123, 0, increment_val);
85 at::Philox4_32 engine2(123, increment_val, increment_val);
86
87 engine2.incr_n(increment_val);
88 engine2.incr();
89 assert(engine1() == engine2());
90 }
91
test_engine_offset2()92 void test_engine_offset2(){
93 testEngineOffset2<<<1, 1>>>();
94 C10_CUDA_KERNEL_LAUNCH_CHECK();
95 }
96
TEST(CUDAGeneratorImpl,TestPhiloxEngineOffset2)97 TEST(CUDAGeneratorImpl, TestPhiloxEngineOffset2) {
98 // Test Description:
99 // Tests edge case at the end of the 2^190th value of the generator.
100 // launch one thread and create two engines
101 // make engine1 skip to the 2^64th 128 bit while being at thread 0
102 // make engine2 skip to the 2^64th 128 bit while being at 2^64th thread
103 // Assert that engine2 should be increment_val+1 steps behind engine1.
104 if (!at::cuda::is_available()) return;
105 test_engine_offset2();
106 cudaDeviceSynchronize();
107 bool isEQ = cudaGetLastError() == cudaSuccess;
108 ASSERT_TRUE(isEQ);
109 }
110
testEngineOffset3()111 __global__ void testEngineOffset3(){
112 unsigned long long increment_val = ::ldexp(1.0, 64);
113 at::Philox4_32 engine1(123, 0, increment_val);
114 at::Philox4_32 engine2(123, 1, 0);
115 engine1.incr();
116 assert(engine1() == engine2());
117 }
118
test_engine_offset3()119 void test_engine_offset3(){
120 testEngineOffset2<<<1, 1>>>();
121 C10_CUDA_KERNEL_LAUNCH_CHECK();
122 }
123
TEST(CUDAGeneratorImpl,TestPhiloxEngineOffset3)124 TEST(CUDAGeneratorImpl, TestPhiloxEngineOffset3) {
125 // Test Description:
126 // Tests edge case in between threads.
127 // launch one thread and create two engines
128 // make engine1 skip to the 2^64th 128 bit while being at thread 0
129 // start engine2 at thread 1, with offset 0
130 // Assert that engine1 is 1 step behind engine2.
131 if (!at::cuda::is_available()) return;
132 test_engine_offset3();
133 cudaDeviceSynchronize();
134 bool isEQ = cudaGetLastError() == cudaSuccess;
135 ASSERT_TRUE(isEQ);
136 }
137
testEngineThreadIndex()138 __global__ void testEngineThreadIndex(){
139 at::Philox4_32 engine1(123456, 0, 4);
140 at::Philox4_32 engine2(123456, 1, 4);
141 assert(engine1() != engine2());
142 }
143
test_engine_thread_index()144 void test_engine_thread_index(){
145 testEngineThreadIndex<<<1, 1>>>();
146 C10_CUDA_KERNEL_LAUNCH_CHECK();
147 }
148
TEST(CUDAGeneratorImpl,TestPhiloxEngineIndex)149 TEST(CUDAGeneratorImpl, TestPhiloxEngineIndex) {
150 // Test Description:
151 // Tests if thread indexing is working properly.
152 // launch one thread and create two engines
153 // with different thread index but same offset.
154 // Assert that the engines have different sequences.
155 if (!at::cuda::is_available()) return;
156 test_engine_thread_index();
157 cudaDeviceSynchronize();
158 bool isEQ = cudaGetLastError() == cudaSuccess;
159 ASSERT_TRUE(isEQ);
160 }
161
162 /*
163 * CUDA Generator Tests
164 */
165
TEST(CUDAGeneratorImpl,TestGeneratorDynamicCast)166 TEST(CUDAGeneratorImpl, TestGeneratorDynamicCast) {
167 // Test Description: Check dynamic cast for CUDA
168 if (!at::cuda::is_available()) return;
169 auto foo = at::cuda::detail::createCUDAGenerator();
170 auto result = foo.get<CUDAGeneratorImpl>();
171 ASSERT_EQ(typeid(at::CUDAGeneratorImpl*).hash_code(), typeid(result).hash_code());
172 }
173
TEST(CUDAGeneratorImpl,TestDefaultGenerator)174 TEST(CUDAGeneratorImpl, TestDefaultGenerator) {
175 // Test Description:
176 // Check if default generator state is created only once
177 // address of generator should be same in all calls
178 if (!at::cuda::is_available()) return;
179 auto foo = at::cuda::detail::getDefaultCUDAGenerator();
180 auto bar = at::cuda::detail::getDefaultCUDAGenerator();
181 ASSERT_EQ(foo, bar);
182
183 if (c10::cuda::device_count() >= 2) {
184 foo = at::cuda::detail::getDefaultCUDAGenerator(1);
185 bar = at::cuda::detail::getDefaultCUDAGenerator(1);
186 ASSERT_EQ(foo, bar);
187
188 foo = at::cuda::detail::getDefaultCUDAGenerator(0);
189 bar = at::cuda::detail::getDefaultCUDAGenerator(1);
190 ASSERT_NE(foo, bar);
191 }
192 }
193
TEST(CUDAGeneratorImpl,TestCloning)194 TEST(CUDAGeneratorImpl, TestCloning) {
195 // Test Description:
196 // Check cloning of new generators.
197 // Note that we don't allow cloning of other
198 // generator states into default generators.
199 if (!at::cuda::is_available()) return;
200 auto gen1 = at::cuda::detail::createCUDAGenerator();
201 gen1.set_current_seed(123); // modify gen1 state
202 auto cuda_gen1 = check_generator<CUDAGeneratorImpl>(gen1);
203 cuda_gen1->set_philox_offset_per_thread(4);
204 auto gen2 = at::cuda::detail::createCUDAGenerator();
205 gen2 = gen1.clone();
206 auto cuda_gen2 = check_generator<CUDAGeneratorImpl>(gen2);
207 ASSERT_EQ(gen1.current_seed(), gen2.current_seed());
208 ASSERT_EQ(
209 cuda_gen1->philox_offset_per_thread(),
210 cuda_gen2->philox_offset_per_thread()
211 );
212 }
213
thread_func_get_set_current_seed(Generator generator)214 void thread_func_get_set_current_seed(Generator generator) {
215 std::lock_guard<std::mutex> lock(generator.mutex());
216 auto current_seed = generator.current_seed();
217 current_seed++;
218 generator.set_current_seed(current_seed);
219 }
220
TEST(CUDAGeneratorImpl,TestMultithreadingGetSetCurrentSeed)221 TEST(CUDAGeneratorImpl, TestMultithreadingGetSetCurrentSeed) {
222 // Test Description:
223 // Test current seed getter and setter are thread safe
224 // See Note [Acquire lock when using random generators]
225 if (!at::cuda::is_available()) return;
226 auto gen1 = at::cuda::detail::getDefaultCUDAGenerator();
227 auto initial_seed = gen1.current_seed();
228 std::thread t0{thread_func_get_set_current_seed, gen1};
229 std::thread t1{thread_func_get_set_current_seed, gen1};
230 std::thread t2{thread_func_get_set_current_seed, gen1};
231 t0.join();
232 t1.join();
233 t2.join();
234 ASSERT_EQ(gen1.current_seed(), initial_seed+3);
235 }
236
TEST(CUDAGeneratorImpl,TestRNGForking)237 TEST(CUDAGeneratorImpl, TestRNGForking) {
238 // Test Description:
239 // Test that state of a generator can be frozen and
240 // restored
241 // See Note [Acquire lock when using random generators]
242 if (!at::cuda::is_available()) return;
243 auto default_gen = at::cuda::detail::getDefaultCUDAGenerator();
244 auto current_gen = at::cuda::detail::createCUDAGenerator();
245 {
246 std::lock_guard<std::mutex> lock(default_gen.mutex());
247 current_gen = default_gen.clone(); // capture the current state of default generator
248 }
249 auto target_value = at::randn({1000}, at::kCUDA);
250 // Dramatically alter the internal state of the main generator
251 auto x = at::randn({100000}, at::kCUDA);
252 auto forked_value = at::randn({1000}, current_gen, at::kCUDA);
253 ASSERT_EQ(target_value.sum().item<double>(), forked_value.sum().item<double>());
254 }
255
makeRandomNumber()256 void makeRandomNumber() {
257 cudaSetDevice(std::rand() % 2);
258 auto x = at::randn({1000});
259 }
260
testCudaRNGMultithread()261 void testCudaRNGMultithread() {
262 auto threads = std::vector<std::thread>();
263 for (auto i = 0; i < 1000; i++) {
264 threads.emplace_back(makeRandomNumber);
265 }
266 for (auto& t : threads) {
267 t.join();
268 }
269 };
270
TEST(CUDAGeneratorImpl,TestMultithreadRNG)271 TEST(CUDAGeneratorImpl, TestMultithreadRNG) {
272 if (!at::cuda::is_available()) return;
273 testCudaRNGMultithread();
274 }
275