1 /* Standard C headers */
2 #include <stdint.h>
3 #include <stdbool.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <assert.h>
7 #include <limits>
8
9 #ifdef _MSC_VER
10 #include <cstdio>
11 #undef min
12 #else
13 /* POSIX headers */
14 #include <unistd.h>
15 #endif
16
17 /* Library header */
18 #include "caffe2/utils/fixed_divisor.h"
19 #include "caffe2/utils/threadpool/pthreadpool.h"
20
21 #include <c10/util/Logging.h>
22
divide_round_up(size_t dividend,size_t divisor)23 static inline size_t divide_round_up(size_t dividend, size_t divisor) {
24 if (dividend % divisor == 0) {
25 return dividend / divisor;
26 } else {
27 return dividend / divisor + 1;
28 }
29 }
30
min(size_t a,size_t b)31 static inline size_t min(size_t a, size_t b) {
32 return a < b ? a : b;
33 }
34
35 struct compute_1d_tiled_context {
36 legacy_pthreadpool_function_1d_tiled_t function;
37 void* argument;
38 size_t range;
39 size_t tile;
40 };
41
compute_1d_tiled(void * context_,size_t linear_index)42 static void compute_1d_tiled(void* context_, size_t linear_index) {
43 const struct compute_1d_tiled_context* context = (compute_1d_tiled_context*) context_;
44 const size_t tile_index = linear_index;
45 const size_t index = tile_index * context->tile;
46 const size_t tile = min(context->tile, context->range - index);
47 context->function(context->argument, index, tile);
48 }
49
legacy_pthreadpool_compute_1d_tiled(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_1d_tiled_t function,void * argument,size_t range,size_t tile)50 void legacy_pthreadpool_compute_1d_tiled(
51 legacy_pthreadpool_t threadpool,
52 legacy_pthreadpool_function_1d_tiled_t function,
53 void* argument,
54 size_t range,
55 size_t tile)
56 {
57 if (threadpool == nullptr) {
58 /* No thread pool provided: execute function sequentially on the calling thread */
59 for (size_t i = 0; i < range; i += tile) {
60 function(argument, i, min(range - i, tile));
61 }
62 } else {
63 /* Execute in parallel on the thread pool using linearized index */
64 const size_t tile_range = divide_round_up(range, tile);
65 struct compute_1d_tiled_context context = {/*.function = */ function,
66 /*.argument = */ argument,
67 /*.range = */ range,
68 /*.tile = */ tile};
69 legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
70 }
71 }
72
73 struct compute_2d_context {
74 legacy_pthreadpool_function_2d_t function;
75 void* argument;
76 caffe2::FixedDivisor<int32_t> range_j;
77 };
78
compute_2d(void * context_,size_t linear_index)79 static void compute_2d(void* context_, size_t linear_index) {
80 TORCH_DCHECK_LE(linear_index, std::numeric_limits<int32_t>::max());
81
82 const struct compute_2d_context* context = static_cast<compute_2d_context*>(context_);
83 int32_t q;
84 int32_t r;
85 context->range_j.DivMod(static_cast<int32_t>(linear_index), &q, &r);
86 context->function(context->argument, q, r);
87 }
88
legacy_pthreadpool_compute_2d(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_2d_t function,void * argument,size_t range_i,size_t range_j)89 void legacy_pthreadpool_compute_2d(
90 legacy_pthreadpool_t threadpool,
91 legacy_pthreadpool_function_2d_t function,
92 void* argument,
93 size_t range_i,
94 size_t range_j)
95 {
96 if (threadpool == nullptr) {
97 /* No thread pool provided: execute function sequentially on the calling thread */
98 for (size_t i = 0; i < range_i; i++) {
99 for (size_t j = 0; j < range_j; j++) {
100 function(argument, i, j);
101 }
102 }
103 } else {
104 TORCH_DCHECK_LE(range_i * range_j, (size_t)std::numeric_limits<int32_t>::max());
105 /* Execute in parallel on the thread pool using linearized index */
106 struct compute_2d_context context = {
107 /*.function = */ function,
108 /*.argument = */ argument,
109 /*.range_j = */ caffe2::FixedDivisor<int32_t>(range_j)};
110 legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
111 }
112 }
113
114 struct compute_2d_tiled_context {
115 legacy_pthreadpool_function_2d_tiled_t function;
116 void* argument;
117 caffe2::FixedDivisor<int32_t> tile_range_j;
118 size_t range_i;
119 size_t range_j;
120 size_t tile_i;
121 size_t tile_j;
122 };
123
compute_2d_tiled(void * context_,size_t linear_index)124 static void compute_2d_tiled(void* context_, size_t linear_index) {
125 int32_t q;
126 int32_t r;
127
128 const struct compute_2d_tiled_context* context = static_cast<compute_2d_tiled_context*>(context_);
129 context->tile_range_j.DivMod(linear_index, &q, &r);
130 const size_t max_tile_i = context->tile_i;
131 const size_t max_tile_j = context->tile_j;
132 const size_t index_i = q * max_tile_i;
133 const size_t index_j = r * max_tile_j;
134 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
135 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
136 context->function(context->argument, index_i, index_j, tile_i, tile_j);
137 }
138
legacy_pthreadpool_compute_2d_tiled(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_2d_tiled_t function,void * argument,size_t range_i,size_t range_j,size_t tile_i,size_t tile_j)139 void legacy_pthreadpool_compute_2d_tiled(
140 legacy_pthreadpool_t threadpool,
141 legacy_pthreadpool_function_2d_tiled_t function,
142 void* argument,
143 size_t range_i,
144 size_t range_j,
145 size_t tile_i,
146 size_t tile_j)
147 {
148 if (threadpool == nullptr) {
149 /* No thread pool provided: execute function sequentially on the calling thread */
150 for (size_t i = 0; i < range_i; i += tile_i) {
151 for (size_t j = 0; j < range_j; j += tile_j) {
152 function(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
153 }
154 }
155 } else {
156 /* Execute in parallel on the thread pool using linearized index */
157 const size_t tile_range_i = divide_round_up(range_i, tile_i);
158 const size_t tile_range_j = divide_round_up(range_j, tile_j);
159 TORCH_DCHECK_LE(
160 tile_range_i * tile_range_j,
161 (size_t)std::numeric_limits<int32_t>::max());
162 struct compute_2d_tiled_context context = {
163 /*.function = */ function,
164 /*.argument = */ argument,
165 /*.tile_range_j = */ caffe2::FixedDivisor<int32_t>(tile_range_j),
166 /*.range_i = */ range_i,
167 /*.range_j = */ range_j,
168 /*.tile_i = */ tile_i,
169 /*.tile_j = */ tile_j};
170 legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
171 }
172 }
173
174 struct compute_3d_tiled_context {
175 legacy_pthreadpool_function_3d_tiled_t function;
176 void* argument;
177 caffe2::FixedDivisor<int32_t> tile_range_j;
178 caffe2::FixedDivisor<int32_t> tile_range_k;
179 size_t range_i;
180 size_t range_j;
181 size_t range_k;
182 size_t tile_i;
183 size_t tile_j;
184 size_t tile_k;
185 };
186
compute_3d_tiled(void * context_,size_t linear_index)187 static void compute_3d_tiled(
188 void* context_,
189 size_t linear_index) {
190 int32_t tile_index_ij, tile_index_k;
191 const struct compute_3d_tiled_context* context = static_cast<compute_3d_tiled_context*>(context_);
192 context->tile_range_k.DivMod(
193 static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_k);
194 int32_t tile_index_i, tile_index_j;
195 context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
196 const size_t max_tile_i = context->tile_i;
197 const size_t max_tile_j = context->tile_j;
198 const size_t max_tile_k = context->tile_k;
199 const size_t index_i = static_cast<uint32_t>(tile_index_i) * max_tile_i;
200 const size_t index_j = static_cast<uint32_t>(tile_index_j) * max_tile_j;
201 const size_t index_k = static_cast<uint32_t>(tile_index_k) * max_tile_k;
202 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
203 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
204 const size_t tile_k = min(max_tile_k, context->range_k - index_k);
205 context->function(
206 context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
207 }
208
legacy_pthreadpool_compute_3d_tiled(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_3d_tiled_t function,void * argument,size_t range_i,size_t range_j,size_t range_k,size_t tile_i,size_t tile_j,size_t tile_k)209 void legacy_pthreadpool_compute_3d_tiled(
210 legacy_pthreadpool_t threadpool,
211 legacy_pthreadpool_function_3d_tiled_t function,
212 void* argument,
213 size_t range_i,
214 size_t range_j,
215 size_t range_k,
216 size_t tile_i,
217 size_t tile_j,
218 size_t tile_k) {
219 if (threadpool == nullptr) {
220 /* No thread pool provided: execute function sequentially on the calling
221 * thread */
222 for (size_t i = 0; i < range_i; i += tile_i) {
223 for (size_t j = 0; j < range_j; j += tile_j) {
224 for (size_t k = 0; k < range_k; k += tile_k) {
225 function(
226 argument,
227 i,
228 j,
229 k,
230 min(range_i - i, tile_i),
231 min(range_j - j, tile_j),
232 min(range_k - k, tile_k));
233 }
234 }
235 }
236 } else {
237 /* Execute in parallel on the thread pool using linearized index */
238 const size_t tile_range_i = divide_round_up(range_i, tile_i);
239 const size_t tile_range_j = divide_round_up(range_j, tile_j);
240 const size_t tile_range_k = divide_round_up(range_k, tile_k);
241 TORCH_DCHECK_LE(
242 tile_range_i * tile_range_j * tile_range_k,
243 (size_t)std::numeric_limits<int>::max());
244 struct compute_3d_tiled_context context = {
245 /*.function = */ function,
246 /*.argument = */ argument,
247 /*.tile_range_j = */ caffe2::FixedDivisor<int>(tile_range_j),
248 /*.tile_range_k = */ caffe2::FixedDivisor<int>(tile_range_k),
249 /*.range_i = */ range_i,
250 /*.range_j = */ range_j,
251 /*.range_k = */ range_k,
252 /*.tile_i = */ tile_i,
253 /*.tile_j = */ tile_j,
254 /*.tile_k = */ tile_k};
255 legacy_pthreadpool_compute_1d(
256 threadpool,
257 (legacy_pthreadpool_function_1d_t)compute_3d_tiled,
258 &context,
259 tile_range_i * tile_range_j * tile_range_k);
260 }
261 }
262
263 struct compute_4d_tiled_context {
264 legacy_pthreadpool_function_4d_tiled_t function;
265 void* argument;
266 caffe2::FixedDivisor<int32_t> tile_range_kl;
267 caffe2::FixedDivisor<int32_t> tile_range_j;
268 caffe2::FixedDivisor<int32_t> tile_range_l;
269 size_t range_i;
270 size_t range_j;
271 size_t range_k;
272 size_t range_l;
273 size_t tile_i;
274 size_t tile_j;
275 size_t tile_k;
276 size_t tile_l;
277 };
278
compute_4d_tiled(void * context_,size_t linear_index)279 static void compute_4d_tiled(
280 void* context_,
281 size_t linear_index) {
282 int32_t tile_index_ij, tile_index_kl;
283 const struct compute_4d_tiled_context* context = static_cast<compute_4d_tiled_context*>(context_);
284 context->tile_range_kl.DivMod(
285 static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_kl);
286 int32_t tile_index_i, tile_index_j;
287 context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
288 int32_t tile_index_k, tile_index_l;
289 context->tile_range_l.DivMod(tile_index_kl, &tile_index_k, &tile_index_l);
290 const size_t max_tile_i = context->tile_i;
291 const size_t max_tile_j = context->tile_j;
292 const size_t max_tile_k = context->tile_k;
293 const size_t max_tile_l = context->tile_l;
294 const size_t index_i = static_cast<uint32_t>(tile_index_i) * max_tile_i;
295 const size_t index_j = static_cast<uint32_t>(tile_index_j) * max_tile_j;
296 const size_t index_k = static_cast<uint32_t>(tile_index_k) * max_tile_k;
297 const size_t index_l = static_cast<uint32_t>(tile_index_l) * max_tile_l;
298 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
299 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
300 const size_t tile_k = min(max_tile_k, context->range_k - index_k);
301 const size_t tile_l = min(max_tile_l, context->range_l - index_l);
302 context->function(
303 context->argument,
304 index_i,
305 index_j,
306 index_k,
307 index_l,
308 tile_i,
309 tile_j,
310 tile_k,
311 tile_l);
312 }
313
legacy_pthreadpool_compute_4d_tiled(legacy_pthreadpool_t threadpool,legacy_pthreadpool_function_4d_tiled_t function,void * argument,size_t range_i,size_t range_j,size_t range_k,size_t range_l,size_t tile_i,size_t tile_j,size_t tile_k,size_t tile_l)314 void legacy_pthreadpool_compute_4d_tiled(
315 legacy_pthreadpool_t threadpool,
316 legacy_pthreadpool_function_4d_tiled_t function,
317 void* argument,
318 size_t range_i,
319 size_t range_j,
320 size_t range_k,
321 size_t range_l,
322 size_t tile_i,
323 size_t tile_j,
324 size_t tile_k,
325 size_t tile_l) {
326 if (threadpool == nullptr) {
327 /* No thread pool provided: execute function sequentially on the calling
328 * thread */
329 for (size_t i = 0; i < range_i; i += tile_i) {
330 for (size_t j = 0; j < range_j; j += tile_j) {
331 for (size_t k = 0; k < range_k; k += tile_k) {
332 for (size_t l = 0; l < range_l; l += tile_l) {
333 function(
334 argument,
335 i,
336 j,
337 k,
338 l,
339 min(range_i - i, tile_i),
340 min(range_j - j, tile_j),
341 min(range_k - k, tile_k),
342 min(range_l - l, tile_l));
343 }
344 }
345 }
346 }
347 } else {
348 /* Execute in parallel on the thread pool using linearized index */
349 const size_t tile_range_i = divide_round_up(range_i, tile_i);
350 const size_t tile_range_j = divide_round_up(range_j, tile_j);
351 const size_t tile_range_k = divide_round_up(range_k, tile_k);
352 const size_t tile_range_l = divide_round_up(range_l, tile_l);
353 TORCH_DCHECK_LE(
354 tile_range_i * tile_range_j * tile_range_k * tile_range_l,
355 (size_t)std::numeric_limits<int>::max());
356 struct compute_4d_tiled_context context = {
357 /*.function = */ function,
358 /*.argument = */ argument,
359 /*.tile_range_kl = */
360 caffe2::FixedDivisor<int>(tile_range_k * tile_range_l),
361 /*.tile_range_j = */ caffe2::FixedDivisor<int>(tile_range_j),
362 /*.tile_range_l = */ caffe2::FixedDivisor<int>(tile_range_l),
363 /*.range_i = */ range_i,
364 /*.range_j = */ range_j,
365 /*.range_k = */ range_k,
366 /*.range_l = */ range_l,
367 /*.tile_i = */ tile_i,
368 /*.tile_j = */ tile_j,
369 /*.tile_k = */ tile_k,
370 /*.tile_l = */ tile_l};
371 legacy_pthreadpool_compute_1d(
372 threadpool,
373 (legacy_pthreadpool_function_1d_t)compute_4d_tiled,
374 &context,
375 tile_range_i * tile_range_j * tile_range_k * tile_range_l);
376 }
377 }
378