xref: /aosp_15_r20/external/pytorch/caffe2/utils/threadpool/pthreadpool.cc (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 /* Standard C headers */
2 #include <stdint.h>
3 #include <stdbool.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <assert.h>
7 #include <limits>
8 
9 #ifdef _MSC_VER
10 #include <cstdio>
11 #undef min
12 #else
13 /* POSIX headers */
14 #include <unistd.h>
15 #endif
16 
17 /* Library header */
18 #include "caffe2/utils/fixed_divisor.h"
19 #include "caffe2/utils/threadpool/pthreadpool.h"
20 
21 #include <c10/util/Logging.h>
22 
divide_round_up(size_t dividend,size_t divisor)23 static inline size_t divide_round_up(size_t dividend, size_t divisor) {
24   if (dividend % divisor == 0) {
25     return dividend / divisor;
26   } else {
27     return dividend / divisor + 1;
28   }
29 }
30 
min(size_t a,size_t b)31 static inline size_t min(size_t a, size_t b) {
32   return a < b ? a : b;
33 }
34 
35 struct compute_1d_tiled_context {
36   legacy_pthreadpool_function_1d_tiled_t function;
37   void* argument;
38   size_t range;
39   size_t tile;
40 };
41 
compute_1d_tiled(void * context_,size_t linear_index)42 static void compute_1d_tiled(void* context_, size_t linear_index) {
43   const struct compute_1d_tiled_context* context = (compute_1d_tiled_context*) context_;
44   const size_t tile_index = linear_index;
45   const size_t index = tile_index * context->tile;
46   const size_t tile = min(context->tile, context->range - index);
47   context->function(context->argument, index, tile);
48 }
49 
legacy_pthreadpool_compute_1d_tiled(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_1d_tiled_t function,void * argument,size_t range,size_t tile)50 void legacy_pthreadpool_compute_1d_tiled(
51   legacy_pthreadpool_t threadpool,
52   legacy_pthreadpool_function_1d_tiled_t function,
53   void* argument,
54   size_t range,
55   size_t tile)
56 {
57   if (threadpool == nullptr) {
58     /* No thread pool provided: execute function sequentially on the calling thread */
59     for (size_t i = 0; i < range; i += tile) {
60       function(argument, i, min(range - i, tile));
61     }
62   } else {
63     /* Execute in parallel on the thread pool using linearized index */
64     const size_t tile_range = divide_round_up(range, tile);
65     struct compute_1d_tiled_context context = {/*.function = */ function,
66                                                /*.argument = */ argument,
67                                                /*.range = */ range,
68                                                /*.tile = */ tile};
69     legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
70   }
71 }
72 
73 struct compute_2d_context {
74   legacy_pthreadpool_function_2d_t function;
75   void* argument;
76   caffe2::FixedDivisor<int32_t> range_j;
77 };
78 
compute_2d(void * context_,size_t linear_index)79 static void compute_2d(void* context_, size_t linear_index) {
80   TORCH_DCHECK_LE(linear_index, std::numeric_limits<int32_t>::max());
81 
82   const struct compute_2d_context* context = static_cast<compute_2d_context*>(context_);
83   int32_t q;
84   int32_t r;
85   context->range_j.DivMod(static_cast<int32_t>(linear_index), &q, &r);
86   context->function(context->argument, q, r);
87 }
88 
legacy_pthreadpool_compute_2d(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_2d_t function,void * argument,size_t range_i,size_t range_j)89 void legacy_pthreadpool_compute_2d(
90   legacy_pthreadpool_t threadpool,
91   legacy_pthreadpool_function_2d_t function,
92   void* argument,
93   size_t range_i,
94   size_t range_j)
95 {
96   if (threadpool == nullptr) {
97     /* No thread pool provided: execute function sequentially on the calling thread */
98     for (size_t i = 0; i < range_i; i++) {
99       for (size_t j = 0; j < range_j; j++) {
100         function(argument, i, j);
101       }
102     }
103   } else {
104     TORCH_DCHECK_LE(range_i * range_j, (size_t)std::numeric_limits<int32_t>::max());
105     /* Execute in parallel on the thread pool using linearized index */
106     struct compute_2d_context context = {
107         /*.function = */ function,
108         /*.argument = */ argument,
109         /*.range_j = */ caffe2::FixedDivisor<int32_t>(range_j)};
110     legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
111   }
112 }
113 
114 struct compute_2d_tiled_context {
115   legacy_pthreadpool_function_2d_tiled_t function;
116   void* argument;
117   caffe2::FixedDivisor<int32_t> tile_range_j;
118   size_t range_i;
119   size_t range_j;
120   size_t tile_i;
121   size_t tile_j;
122 };
123 
compute_2d_tiled(void * context_,size_t linear_index)124 static void compute_2d_tiled(void* context_, size_t linear_index) {
125   int32_t q;
126   int32_t r;
127 
128   const struct compute_2d_tiled_context* context = static_cast<compute_2d_tiled_context*>(context_);
129   context->tile_range_j.DivMod(linear_index, &q, &r);
130   const size_t max_tile_i = context->tile_i;
131   const size_t max_tile_j = context->tile_j;
132   const size_t index_i = q * max_tile_i;
133   const size_t index_j = r * max_tile_j;
134   const size_t tile_i = min(max_tile_i, context->range_i - index_i);
135   const size_t tile_j = min(max_tile_j, context->range_j - index_j);
136   context->function(context->argument, index_i, index_j, tile_i, tile_j);
137 }
138 
legacy_pthreadpool_compute_2d_tiled(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_2d_tiled_t function,void * argument,size_t range_i,size_t range_j,size_t tile_i,size_t tile_j)139 void legacy_pthreadpool_compute_2d_tiled(
140   legacy_pthreadpool_t threadpool,
141   legacy_pthreadpool_function_2d_tiled_t function,
142   void* argument,
143   size_t range_i,
144   size_t range_j,
145   size_t tile_i,
146   size_t tile_j)
147 {
148   if (threadpool == nullptr) {
149     /* No thread pool provided: execute function sequentially on the calling thread */
150     for (size_t i = 0; i < range_i; i += tile_i) {
151       for (size_t j = 0; j < range_j; j += tile_j) {
152         function(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
153       }
154     }
155   } else {
156     /* Execute in parallel on the thread pool using linearized index */
157     const size_t tile_range_i = divide_round_up(range_i, tile_i);
158     const size_t tile_range_j = divide_round_up(range_j, tile_j);
159     TORCH_DCHECK_LE(
160         tile_range_i * tile_range_j,
161         (size_t)std::numeric_limits<int32_t>::max());
162     struct compute_2d_tiled_context context = {
163         /*.function = */ function,
164         /*.argument = */ argument,
165         /*.tile_range_j = */ caffe2::FixedDivisor<int32_t>(tile_range_j),
166         /*.range_i = */ range_i,
167         /*.range_j = */ range_j,
168         /*.tile_i = */ tile_i,
169         /*.tile_j = */ tile_j};
170     legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
171   }
172 }
173 
174 struct compute_3d_tiled_context {
175   legacy_pthreadpool_function_3d_tiled_t function;
176   void* argument;
177   caffe2::FixedDivisor<int32_t> tile_range_j;
178   caffe2::FixedDivisor<int32_t> tile_range_k;
179   size_t range_i;
180   size_t range_j;
181   size_t range_k;
182   size_t tile_i;
183   size_t tile_j;
184   size_t tile_k;
185 };
186 
compute_3d_tiled(void * context_,size_t linear_index)187 static void compute_3d_tiled(
188     void* context_,
189     size_t linear_index) {
190   int32_t tile_index_ij, tile_index_k;
191   const struct compute_3d_tiled_context* context = static_cast<compute_3d_tiled_context*>(context_);
192   context->tile_range_k.DivMod(
193       static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_k);
194   int32_t tile_index_i, tile_index_j;
195   context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
196   const size_t max_tile_i = context->tile_i;
197   const size_t max_tile_j = context->tile_j;
198   const size_t max_tile_k = context->tile_k;
199   const size_t index_i = static_cast<uint32_t>(tile_index_i) * max_tile_i;
200   const size_t index_j = static_cast<uint32_t>(tile_index_j) * max_tile_j;
201   const size_t index_k = static_cast<uint32_t>(tile_index_k) * max_tile_k;
202   const size_t tile_i = min(max_tile_i, context->range_i - index_i);
203   const size_t tile_j = min(max_tile_j, context->range_j - index_j);
204   const size_t tile_k = min(max_tile_k, context->range_k - index_k);
205   context->function(
206       context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
207 }
208 
legacy_pthreadpool_compute_3d_tiled(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_3d_tiled_t function,void * argument,size_t range_i,size_t range_j,size_t range_k,size_t tile_i,size_t tile_j,size_t tile_k)209 void legacy_pthreadpool_compute_3d_tiled(
210     legacy_pthreadpool_t threadpool,
211     legacy_pthreadpool_function_3d_tiled_t function,
212     void* argument,
213     size_t range_i,
214     size_t range_j,
215     size_t range_k,
216     size_t tile_i,
217     size_t tile_j,
218     size_t tile_k) {
219   if (threadpool == nullptr) {
220     /* No thread pool provided: execute function sequentially on the calling
221      * thread */
222     for (size_t i = 0; i < range_i; i += tile_i) {
223       for (size_t j = 0; j < range_j; j += tile_j) {
224         for (size_t k = 0; k < range_k; k += tile_k) {
225           function(
226               argument,
227               i,
228               j,
229               k,
230               min(range_i - i, tile_i),
231               min(range_j - j, tile_j),
232               min(range_k - k, tile_k));
233         }
234       }
235     }
236   } else {
237     /* Execute in parallel on the thread pool using linearized index */
238     const size_t tile_range_i = divide_round_up(range_i, tile_i);
239     const size_t tile_range_j = divide_round_up(range_j, tile_j);
240     const size_t tile_range_k = divide_round_up(range_k, tile_k);
241     TORCH_DCHECK_LE(
242         tile_range_i * tile_range_j * tile_range_k,
243         (size_t)std::numeric_limits<int>::max());
244     struct compute_3d_tiled_context context = {
245         /*.function = */ function,
246         /*.argument = */ argument,
247         /*.tile_range_j = */ caffe2::FixedDivisor<int>(tile_range_j),
248         /*.tile_range_k = */ caffe2::FixedDivisor<int>(tile_range_k),
249         /*.range_i = */ range_i,
250         /*.range_j = */ range_j,
251         /*.range_k = */ range_k,
252         /*.tile_i = */ tile_i,
253         /*.tile_j = */ tile_j,
254         /*.tile_k = */ tile_k};
255     legacy_pthreadpool_compute_1d(
256         threadpool,
257         (legacy_pthreadpool_function_1d_t)compute_3d_tiled,
258         &context,
259         tile_range_i * tile_range_j * tile_range_k);
260   }
261 }
262 
263 struct compute_4d_tiled_context {
264   legacy_pthreadpool_function_4d_tiled_t function;
265   void* argument;
266   caffe2::FixedDivisor<int32_t> tile_range_kl;
267   caffe2::FixedDivisor<int32_t> tile_range_j;
268   caffe2::FixedDivisor<int32_t> tile_range_l;
269   size_t range_i;
270   size_t range_j;
271   size_t range_k;
272   size_t range_l;
273   size_t tile_i;
274   size_t tile_j;
275   size_t tile_k;
276   size_t tile_l;
277 };
278 
compute_4d_tiled(void * context_,size_t linear_index)279 static void compute_4d_tiled(
280     void* context_,
281     size_t linear_index) {
282   int32_t tile_index_ij, tile_index_kl;
283   const struct compute_4d_tiled_context* context = static_cast<compute_4d_tiled_context*>(context_);
284   context->tile_range_kl.DivMod(
285       static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_kl);
286   int32_t tile_index_i, tile_index_j;
287   context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
288   int32_t tile_index_k, tile_index_l;
289   context->tile_range_l.DivMod(tile_index_kl, &tile_index_k, &tile_index_l);
290   const size_t max_tile_i = context->tile_i;
291   const size_t max_tile_j = context->tile_j;
292   const size_t max_tile_k = context->tile_k;
293   const size_t max_tile_l = context->tile_l;
294   const size_t index_i = static_cast<uint32_t>(tile_index_i) * max_tile_i;
295   const size_t index_j = static_cast<uint32_t>(tile_index_j) * max_tile_j;
296   const size_t index_k = static_cast<uint32_t>(tile_index_k) * max_tile_k;
297   const size_t index_l = static_cast<uint32_t>(tile_index_l) * max_tile_l;
298   const size_t tile_i = min(max_tile_i, context->range_i - index_i);
299   const size_t tile_j = min(max_tile_j, context->range_j - index_j);
300   const size_t tile_k = min(max_tile_k, context->range_k - index_k);
301   const size_t tile_l = min(max_tile_l, context->range_l - index_l);
302   context->function(
303       context->argument,
304       index_i,
305       index_j,
306       index_k,
307       index_l,
308       tile_i,
309       tile_j,
310       tile_k,
311       tile_l);
312 }
313 
legacy_pthreadpool_compute_4d_tiled(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_4d_tiled_t function,void * argument,size_t range_i,size_t range_j,size_t range_k,size_t range_l,size_t tile_i,size_t tile_j,size_t tile_k,size_t tile_l)314 void legacy_pthreadpool_compute_4d_tiled(
315     legacy_pthreadpool_t threadpool,
316     legacy_pthreadpool_function_4d_tiled_t function,
317     void* argument,
318     size_t range_i,
319     size_t range_j,
320     size_t range_k,
321     size_t range_l,
322     size_t tile_i,
323     size_t tile_j,
324     size_t tile_k,
325     size_t tile_l) {
326   if (threadpool == nullptr) {
327     /* No thread pool provided: execute function sequentially on the calling
328      * thread */
329     for (size_t i = 0; i < range_i; i += tile_i) {
330       for (size_t j = 0; j < range_j; j += tile_j) {
331         for (size_t k = 0; k < range_k; k += tile_k) {
332           for (size_t l = 0; l < range_l; l += tile_l) {
333             function(
334                 argument,
335                 i,
336                 j,
337                 k,
338                 l,
339                 min(range_i - i, tile_i),
340                 min(range_j - j, tile_j),
341                 min(range_k - k, tile_k),
342                 min(range_l - l, tile_l));
343           }
344         }
345       }
346     }
347   } else {
348     /* Execute in parallel on the thread pool using linearized index */
349     const size_t tile_range_i = divide_round_up(range_i, tile_i);
350     const size_t tile_range_j = divide_round_up(range_j, tile_j);
351     const size_t tile_range_k = divide_round_up(range_k, tile_k);
352     const size_t tile_range_l = divide_round_up(range_l, tile_l);
353     TORCH_DCHECK_LE(
354         tile_range_i * tile_range_j * tile_range_k * tile_range_l,
355         (size_t)std::numeric_limits<int>::max());
356     struct compute_4d_tiled_context context = {
357         /*.function = */ function,
358         /*.argument = */ argument,
359         /*.tile_range_kl = */
360         caffe2::FixedDivisor<int>(tile_range_k * tile_range_l),
361         /*.tile_range_j = */ caffe2::FixedDivisor<int>(tile_range_j),
362         /*.tile_range_l = */ caffe2::FixedDivisor<int>(tile_range_l),
363         /*.range_i = */ range_i,
364         /*.range_j = */ range_j,
365         /*.range_k = */ range_k,
366         /*.range_l = */ range_l,
367         /*.tile_i = */ tile_i,
368         /*.tile_j = */ tile_j,
369         /*.tile_k = */ tile_k,
370         /*.tile_l = */ tile_l};
371     legacy_pthreadpool_compute_1d(
372         threadpool,
373         (legacy_pthreadpool_function_1d_t)compute_4d_tiled,
374         &context,
375         tile_range_i * tile_range_j * tile_range_k * tile_range_l);
376   }
377 }
378