/* Standard C headers */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#if PTHREADPOOL_USE_CPUINFO
	#include <cpuinfo.h>
#endif

/* Dependencies */
#include <fxdiv.h>

/* Public library header */
#include <pthreadpool.h>

/* Internal library headers */
#include "threadpool-atomics.h"
#include "threadpool-object.h"
#include "threadpool-utils.h"

size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) {
	if (threadpool == NULL) {
		return 1;
	}

	return threadpool->threads_count.value;
}

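/*
 * All thread_parallelize_* functions below share the same two-phase,
 * work-stealing structure:
 *   1. Drain the thread's own [range_start, range_end) slice by atomically
 *      decrementing range_length and walking forward from range_start.
 *   2. Scan the other threads in reverse (modulo) order and steal work from
 *      the tail of their slices by atomically decrementing range_end.
 * They are installed as thread functions by the corresponding public
 * pthreadpool_parallelize_* entry points. Illustrative call of the 1D API,
 * where process_item, context, and item_count are placeholder names and the
 * final 0 is the flags word:
 *
 *   pthreadpool_parallelize_1d(threadpool, process_item, context, item_count, 0);
 */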
static void thread_parallelize_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, range_start++);
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

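/*
 * The *_with_uarch variants additionally pass a microarchitecture index to
 * the task. When built with cpuinfo (PTHREADPOOL_USE_CPUINFO), the index of
 * the uarch the current thread runs on is queried and falls back to the
 * caller-supplied default if it exceeds max_uarch_index; without cpuinfo the
 * default index is used unchanged.
 */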
static void thread_parallelize_1d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, range_start++);
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, uarch_index, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

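/*
 * Tiled variants hand out tiles rather than single indices: work unit t
 * covers indices [t * tile, t * tile + min(range - t * tile, tile)), so the
 * task receives the tile start together with a length that is clamped for
 * the possibly-partial last tile.
 */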
static void thread_parallelize_1d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile;
	size_t tile_start = range_start * tile;

	const size_t range = threadpool->params.parallelize_1d_tile_1d.range;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, tile_start, min(range - tile_start, tile));
		tile_start += tile;
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const size_t tile_start = tile_index * tile;
			task(argument, tile_start, min(range - tile_start, tile));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

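/*
 * Multi-dimensional variants linearize the iteration space and recover the
 * per-dimension indices with fxdiv, which replaces the division with a
 * precomputed magic-constant multiplication. For the 2D case,
 * i = linear_index / range_j and j = linear_index % range_j; e.g. with
 * range_j = 3, linear index 7 maps to (i, j) = (2, 1).
 */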
static void thread_parallelize_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j);
		if (++j == range_j.value) {
			j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_2d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, start_j, min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

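/*
 * 2D tiling over both dimensions: the linear work index is decomposed into a
 * pair of tile indices, each is scaled by its tile size to get the tile
 * origin, and both extents passed to the task are clamped so edge tiles may
 * be smaller than tile_i x tile_j.
 */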
static void thread_parallelize_2d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
	size_t start_i = tile_index_i_j.quotient * tile_i;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_2d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
	size_t start_i = index.quotient * tile_i;
	size_t start_j = index.remainder * tile_j;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

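/*
 * The 3D decomposition nests two fxdiv divisions: dividing the linear index
 * by range_k yields (ij, k), and dividing ij by range_j yields (i, j). The
 * tiled and uarch-aware 3D variants below use the same scheme with tile
 * counts substituted for the tiled dimensions.
 */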
static void thread_parallelize_3d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;

	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k);
		if (++k == range_k.value) {
			k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_3d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
	const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, start_k, min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_3d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_3d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

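/*
 * For 4D, the two innermost dimensions are pre-multiplied into a single
 * range_kl divisor: the linear index splits into (ij, kl), ij splits into
 * (i, j) via range_j, and kl splits into (k, l) via range_l, keeping index
 * recovery at three divisions per stolen item.
 */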
static void thread_parallelize_4d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;

	const size_t range_k = threadpool->params.parallelize_4d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, l);
		if (++l == range_l.value) {
			l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_4d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = tile_index_k_l.quotient;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
	const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_4d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

static void thread_parallelize_4d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
		task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	const size_t threads_count = threadpool->threads_count.value;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}

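/*
 * The 5D case folds the two innermost dimensions into range_lm: the linear
 * index splits into (ijk, lm), ijk splits via range_k and then range_j, and
 * lm splits into (l, m) via range_m, i.e. four divisions per recovered index.
 */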
thread_parallelize_5d(struct pthreadpool * threadpool,struct thread_info * thread)803*b095b053SXin Li static void thread_parallelize_5d(struct pthreadpool* threadpool, struct thread_info* thread) {
804*b095b053SXin Li 	assert(threadpool != NULL);
805*b095b053SXin Li 	assert(thread != NULL);
806*b095b053SXin Li 
807*b095b053SXin Li 	const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
808*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
809*b095b053SXin Li 
810*b095b053SXin Li 	/* Process thread's own range of items */
811*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
812*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm;
813*b095b053SXin Li 	const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm);
814*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k;
815*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
816*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m;
817*b095b053SXin Li 	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
818*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j;
819*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
820*b095b053SXin Li 	size_t i = index_i_j.quotient;
821*b095b053SXin Li 	size_t j = index_i_j.remainder;
822*b095b053SXin Li 	size_t k = index_ij_k.remainder;
823*b095b053SXin Li 	size_t l = index_l_m.quotient;
824*b095b053SXin Li 	size_t m = index_l_m.remainder;
825*b095b053SXin Li 
826*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_5d.range_l;
827*b095b053SXin Li 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
828*b095b053SXin Li 		task(argument, i, j, k, l, m);
829*b095b053SXin Li 		if (++m == range_m.value) {
830*b095b053SXin Li 			m = 0;
831*b095b053SXin Li 			if (++l == range_l) {
832*b095b053SXin Li 				l = 0;
833*b095b053SXin Li 				if (++k == range_k.value) {
834*b095b053SXin Li 					k = 0;
835*b095b053SXin Li 					if (++j == range_j.value) {
836*b095b053SXin Li 						j = 0;
837*b095b053SXin Li 						i += 1;
838*b095b053SXin Li 					}
839*b095b053SXin Li 				}
840*b095b053SXin Li 			}
841*b095b053SXin Li 		}
842*b095b053SXin Li 	}
843*b095b053SXin Li 
844*b095b053SXin Li 	/* There still may be other threads with work */
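	/*
	 * Work stealing: victims are scanned in a ring starting from the thread
	 * just before this one (modulo threads_count). The owner consumes its
	 * range from the front, so thieves take items from the back by
	 * decrementing range_end; the atomic try-decrement of range_length is
	 * what actually claims an item, so no index is processed twice.
	 */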
845*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
846*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
847*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
848*b095b053SXin Li 		tid != thread_number;
849*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
850*b095b053SXin Li 	{
851*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
852*b095b053SXin Li 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
853*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
854*b095b053SXin Li 			const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm);
855*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
856*b095b053SXin Li 			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
857*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
858*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
859*b095b053SXin Li 		}
860*b095b053SXin Li 	}
861*b095b053SXin Li 
862*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
863*b095b053SXin Li 	pthreadpool_fence_release();
864*b095b053SXin Li }
865*b095b053SXin Li 
866*b095b053SXin Li static void thread_parallelize_5d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
867*b095b053SXin Li 	assert(threadpool != NULL);
868*b095b053SXin Li 	assert(thread != NULL);
869*b095b053SXin Li 
870*b095b053SXin Li 	const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
871*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
872*b095b053SXin Li 
873*b095b053SXin Li 	/* Process thread's own range of items */
874*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
875*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m;
876*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m);
877*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl;
878*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
879*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j;
880*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
881*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l;
882*b095b053SXin Li 	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
883*b095b053SXin Li 	const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
884*b095b053SXin Li 	size_t i = index_i_j.quotient;
885*b095b053SXin Li 	size_t j = index_i_j.remainder;
886*b095b053SXin Li 	size_t k = index_k_l.quotient;
887*b095b053SXin Li 	size_t l = index_k_l.remainder;
888*b095b053SXin Li 	size_t start_m = tile_index_ijkl_m.remainder * tile_m;
889*b095b053SXin Li 
890*b095b053SXin Li 	const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
891*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
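	/*
	 * The innermost m dimension is traversed in tiles of tile_m items; the
	 * last tile in a row may be partial, hence min(range_m - start_m, tile_m)
	 * as the tile size passed to the task.
	 */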
892*b095b053SXin Li 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
893*b095b053SXin Li 		task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
894*b095b053SXin Li 		start_m += tile_m;
895*b095b053SXin Li 		if (start_m >= range_m) {
896*b095b053SXin Li 			start_m = 0;
897*b095b053SXin Li 			if (++l == range_l.value) {
898*b095b053SXin Li 				l = 0;
899*b095b053SXin Li 				if (++k == range_k) {
900*b095b053SXin Li 					k = 0;
901*b095b053SXin Li 					if (++j == range_j.value) {
902*b095b053SXin Li 						j = 0;
903*b095b053SXin Li 						i += 1;
904*b095b053SXin Li 					}
905*b095b053SXin Li 				}
906*b095b053SXin Li 			}
907*b095b053SXin Li 		}
908*b095b053SXin Li 	}
909*b095b053SXin Li 
910*b095b053SXin Li 	/* There still may be other threads with work */
911*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
912*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
913*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
914*b095b053SXin Li 		tid != thread_number;
915*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
916*b095b053SXin Li 	{
917*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
918*b095b053SXin Li 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
919*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
920*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m);
921*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
922*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
923*b095b053SXin Li 			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
924*b095b053SXin Li 			size_t start_m = tile_index_ijkl_m.remainder * tile_m;
925*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m,
926*b095b053SXin Li 				min(range_m - start_m, tile_m));
927*b095b053SXin Li 		}
928*b095b053SXin Li 	}
929*b095b053SXin Li 
930*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
931*b095b053SXin Li 	pthreadpool_fence_release();
932*b095b053SXin Li }
933*b095b053SXin Li 
934*b095b053SXin Li static void thread_parallelize_5d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
935*b095b053SXin Li 	assert(threadpool != NULL);
936*b095b053SXin Li 	assert(thread != NULL);
937*b095b053SXin Li 
938*b095b053SXin Li 	const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
939*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
940*b095b053SXin Li 
941*b095b053SXin Li 	/* Process thread's own range of items */
942*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
943*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
944*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm);
945*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k;
946*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
947*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m;
948*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
949*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j;
950*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
951*b095b053SXin Li 	const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
952*b095b053SXin Li 	const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
953*b095b053SXin Li 	size_t i = index_i_j.quotient;
954*b095b053SXin Li 	size_t j = index_i_j.remainder;
955*b095b053SXin Li 	size_t k = index_ij_k.remainder;
956*b095b053SXin Li 	size_t start_l = tile_index_l_m.quotient * tile_l;
957*b095b053SXin Li 	size_t start_m = tile_index_l_m.remainder * tile_m;
958*b095b053SXin Li 
959*b095b053SXin Li 	const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
960*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
961*b095b053SXin Li 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
962*b095b053SXin Li 		task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
963*b095b053SXin Li 		start_m += tile_m;
964*b095b053SXin Li 		if (start_m >= range_m) {
965*b095b053SXin Li 			start_m = 0;
966*b095b053SXin Li 			start_l += tile_l;
967*b095b053SXin Li 			if (start_l >= range_l) {
968*b095b053SXin Li 				start_l = 0;
969*b095b053SXin Li 				if (++k == range_k.value) {
970*b095b053SXin Li 					k = 0;
971*b095b053SXin Li 					if (++j == range_j.value) {
972*b095b053SXin Li 						j = 0;
973*b095b053SXin Li 						i += 1;
974*b095b053SXin Li 					}
975*b095b053SXin Li 				}
976*b095b053SXin Li 			}
977*b095b053SXin Li 		}
978*b095b053SXin Li 	}
979*b095b053SXin Li 
980*b095b053SXin Li 	/* There still may be other threads with work */
981*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
982*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
983*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
984*b095b053SXin Li 		tid != thread_number;
985*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
986*b095b053SXin Li 	{
987*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
988*b095b053SXin Li 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
989*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
990*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm);
991*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
992*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
993*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
994*b095b053SXin Li 			const size_t start_l = tile_index_l_m.quotient * tile_l;
995*b095b053SXin Li 			const size_t start_m = tile_index_l_m.remainder * tile_m;
996*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder,
997*b095b053SXin Li 				start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
998*b095b053SXin Li 		}
999*b095b053SXin Li 	}
1000*b095b053SXin Li 
1001*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
1002*b095b053SXin Li 	pthreadpool_fence_release();
1003*b095b053SXin Li }
1004*b095b053SXin Li 
1005*b095b053SXin Li static void thread_parallelize_6d(struct pthreadpool* threadpool, struct thread_info* thread) {
1006*b095b053SXin Li 	assert(threadpool != NULL);
1007*b095b053SXin Li 	assert(thread != NULL);
1008*b095b053SXin Li 
1009*b095b053SXin Li 	const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
1010*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
1011*b095b053SXin Li 
1012*b095b053SXin Li 	/* Process thread's own range of items */
1013*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
1014*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn;
1015*b095b053SXin Li 	const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn);
1016*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k;
1017*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
1018*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n;
1019*b095b053SXin Li 	const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
1020*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j;
1021*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1022*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m;
1023*b095b053SXin Li 	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
1024*b095b053SXin Li 	size_t i = index_i_j.quotient;
1025*b095b053SXin Li 	size_t j = index_i_j.remainder;
1026*b095b053SXin Li 	size_t k = index_ij_k.remainder;
1027*b095b053SXin Li 	size_t l = index_l_m.quotient;
1028*b095b053SXin Li 	size_t m = index_l_m.remainder;
1029*b095b053SXin Li 	size_t n = index_lm_n.remainder;
1030*b095b053SXin Li 
1031*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_6d.range_l;
1032*b095b053SXin Li 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
1033*b095b053SXin Li 		task(argument, i, j, k, l, m, n);
1034*b095b053SXin Li 		if (++n == range_n.value) {
1035*b095b053SXin Li 			n = 0;
1036*b095b053SXin Li 			if (++m == range_m.value) {
1037*b095b053SXin Li 				m = 0;
1038*b095b053SXin Li 				if (++l == range_l) {
1039*b095b053SXin Li 					l = 0;
1040*b095b053SXin Li 					if (++k == range_k.value) {
1041*b095b053SXin Li 						k = 0;
1042*b095b053SXin Li 						if (++j == range_j.value) {
1043*b095b053SXin Li 							j = 0;
1044*b095b053SXin Li 							i += 1;
1045*b095b053SXin Li 						}
1046*b095b053SXin Li 					}
1047*b095b053SXin Li 				}
1048*b095b053SXin Li 			}
1049*b095b053SXin Li 		}
1050*b095b053SXin Li 	}
1051*b095b053SXin Li 
1052*b095b053SXin Li 
1053*b095b053SXin Li 	/* There still may be other threads with work */
1054*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
1055*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
1056*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
1057*b095b053SXin Li 		tid != thread_number;
1058*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
1059*b095b053SXin Li 	{
1060*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
1061*b095b053SXin Li 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
1062*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
1063*b095b053SXin Li 			const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn);
1064*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
1065*b095b053SXin Li 			const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
1066*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1067*b095b053SXin Li 			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
1068*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder);
1069*b095b053SXin Li 		}
1070*b095b053SXin Li 	}
1071*b095b053SXin Li 
1072*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
1073*b095b053SXin Li 	pthreadpool_fence_release();
1074*b095b053SXin Li }
1075*b095b053SXin Li 
1076*b095b053SXin Li static void thread_parallelize_6d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
1077*b095b053SXin Li 	assert(threadpool != NULL);
1078*b095b053SXin Li 	assert(thread != NULL);
1079*b095b053SXin Li 
1080*b095b053SXin Li 	const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
1081*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
1082*b095b053SXin Li 
1083*b095b053SXin Li 	/* Process thread's own range of items */
1084*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
1085*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
1086*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn);
1087*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k;
1088*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
1089*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n;
1090*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
1091*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j;
1092*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1093*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m;
1094*b095b053SXin Li 	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
1095*b095b053SXin Li 	const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
1096*b095b053SXin Li 	size_t i = index_i_j.quotient;
1097*b095b053SXin Li 	size_t j = index_i_j.remainder;
1098*b095b053SXin Li 	size_t k = index_ij_k.remainder;
1099*b095b053SXin Li 	size_t l = index_l_m.quotient;
1100*b095b053SXin Li 	size_t m = index_l_m.remainder;
1101*b095b053SXin Li 	size_t start_n = tile_index_lm_n.remainder * tile_n;
1102*b095b053SXin Li 
1103*b095b053SXin Li 	const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
1104*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
1105*b095b053SXin Li 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
1106*b095b053SXin Li 		task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
1107*b095b053SXin Li 		start_n += tile_n;
1108*b095b053SXin Li 		if (start_n >= range_n) {
1109*b095b053SXin Li 			start_n = 0;
1110*b095b053SXin Li 			if (++m == range_m.value) {
1111*b095b053SXin Li 				m = 0;
1112*b095b053SXin Li 				if (++l == range_l) {
1113*b095b053SXin Li 					l = 0;
1114*b095b053SXin Li 					if (++k == range_k.value) {
1115*b095b053SXin Li 						k = 0;
1116*b095b053SXin Li 						if (++j == range_j.value) {
1117*b095b053SXin Li 							j = 0;
1118*b095b053SXin Li 							i += 1;
1119*b095b053SXin Li 						}
1120*b095b053SXin Li 					}
1121*b095b053SXin Li 				}
1122*b095b053SXin Li 			}
1123*b095b053SXin Li 		}
1124*b095b053SXin Li 	}
1125*b095b053SXin Li 
1126*b095b053SXin Li 
1127*b095b053SXin Li 	/* There still may be other threads with work */
1128*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
1129*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
1130*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
1131*b095b053SXin Li 		tid != thread_number;
1132*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
1133*b095b053SXin Li 	{
1134*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
1135*b095b053SXin Li 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
1136*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
1137*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn);
1138*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
1139*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
1140*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1141*b095b053SXin Li 			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
1142*b095b053SXin Li 			const size_t start_n = tile_index_lm_n.remainder * tile_n;
1143*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
1144*b095b053SXin Li 				start_n, min(range_n - start_n, tile_n));
1145*b095b053SXin Li 		}
1146*b095b053SXin Li 	}
1147*b095b053SXin Li 
1148*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
1149*b095b053SXin Li 	pthreadpool_fence_release();
1150*b095b053SXin Li }
1151*b095b053SXin Li 
1152*b095b053SXin Li static void thread_parallelize_6d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
1153*b095b053SXin Li 	assert(threadpool != NULL);
1154*b095b053SXin Li 	assert(thread != NULL);
1155*b095b053SXin Li 
1156*b095b053SXin Li 	const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
1157*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
1158*b095b053SXin Li 
1159*b095b053SXin Li 	/* Process thread's own range of items */
1160*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
1161*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn;
1162*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn);
1163*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl;
1164*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
1165*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n;
1166*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
1167*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j;
1168*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
1169*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l;
1170*b095b053SXin Li 	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
1171*b095b053SXin Li 	const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m;
1172*b095b053SXin Li 	const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n;
1173*b095b053SXin Li 	size_t i = index_i_j.quotient;
1174*b095b053SXin Li 	size_t j = index_i_j.remainder;
1175*b095b053SXin Li 	size_t k = index_k_l.quotient;
1176*b095b053SXin Li 	size_t l = index_k_l.remainder;
1177*b095b053SXin Li 	size_t start_m = tile_index_m_n.quotient * tile_m;
1178*b095b053SXin Li 	size_t start_n = tile_index_m_n.remainder * tile_n;
1179*b095b053SXin Li 
1180*b095b053SXin Li 	const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n;
1181*b095b053SXin Li 	const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m;
1182*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k;
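	/*
	 * Here the two innermost dimensions are both tiled: each work item covers
	 * a tile_m x tile_n rectangle, and the min() expressions trim tiles that
	 * extend past the edges of the (range_m, range_n) iteration space.
	 */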
1183*b095b053SXin Li 	while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
1184*b095b053SXin Li 		task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
1185*b095b053SXin Li 		start_n += tile_n;
1186*b095b053SXin Li 		if (start_n >= range_n) {
1187*b095b053SXin Li 			start_n = 0;
1188*b095b053SXin Li 			start_m += tile_m;
1189*b095b053SXin Li 			if (start_m >= range_m) {
1190*b095b053SXin Li 				start_m = 0;
1191*b095b053SXin Li 				if (++l == range_l.value) {
1192*b095b053SXin Li 					l = 0;
1193*b095b053SXin Li 					if (++k == range_k) {
1194*b095b053SXin Li 						k = 0;
1195*b095b053SXin Li 						if (++j == range_j.value) {
1196*b095b053SXin Li 							j = 0;
1197*b095b053SXin Li 							i += 1;
1198*b095b053SXin Li 						}
1199*b095b053SXin Li 					}
1200*b095b053SXin Li 				}
1201*b095b053SXin Li 			}
1202*b095b053SXin Li 		}
1203*b095b053SXin Li 	}
1204*b095b053SXin Li 
1205*b095b053SXin Li 	/* There still may be other threads with work */
1206*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
1207*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
1208*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
1209*b095b053SXin Li 		tid != thread_number;
1210*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
1211*b095b053SXin Li 	{
1212*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
1213*b095b053SXin Li 		while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
1214*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
1215*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn);
1216*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
1217*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
1218*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
1219*b095b053SXin Li 			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
1220*b095b053SXin Li 			const size_t start_m = tile_index_m_n.quotient * tile_m;
1221*b095b053SXin Li 			const size_t start_n = tile_index_m_n.remainder * tile_n;
1222*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder,
1223*b095b053SXin Li 				start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
1224*b095b053SXin Li 		}
1225*b095b053SXin Li 	}
1226*b095b053SXin Li 
1227*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
1228*b095b053SXin Li 	pthreadpool_fence_release();
1229*b095b053SXin Li }
1230*b095b053SXin Li 
1231*b095b053SXin Li void pthreadpool_parallelize_1d(
1232*b095b053SXin Li 	struct pthreadpool* threadpool,
1233*b095b053SXin Li 	pthreadpool_task_1d_t task,
1234*b095b053SXin Li 	void* argument,
1235*b095b053SXin Li 	size_t range,
1236*b095b053SXin Li 	uint32_t flags)
1237*b095b053SXin Li {
1238*b095b053SXin Li 	size_t threads_count;
1239*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) {
1240*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
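		/*
		 * PTHREADPOOL_FLAG_DISABLE_DENORMALS saves the FPU state, disables
		 * denormalized-number handling (typically a flush-to-zero mode) while
		 * the tasks run, and restores the saved state afterwards.
		 */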
1241*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1242*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1243*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1244*b095b053SXin Li 			disable_fpu_denormals();
1245*b095b053SXin Li 		}
1246*b095b053SXin Li 		for (size_t i = 0; i < range; i++) {
1247*b095b053SXin Li 			task(argument, i);
1248*b095b053SXin Li 		}
1249*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1250*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1251*b095b053SXin Li 		}
1252*b095b053SXin Li 	} else {
1253*b095b053SXin Li 		thread_function_t parallelize_1d = &thread_parallelize_1d;
1254*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1255*b095b053SXin Li 			const size_t range_threshold = -threads_count;
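			/*
			 * -threads_count on a size_t wraps to SIZE_MAX - threads_count + 1;
			 * the fast path is only used for ranges below this threshold,
			 * presumably so the per-thread range bookkeeping cannot overflow.
			 */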
1256*b095b053SXin Li 			if (range < range_threshold) {
1257*b095b053SXin Li 				parallelize_1d = &pthreadpool_thread_parallelize_1d_fastpath;
1258*b095b053SXin Li 			}
1259*b095b053SXin Li 		#endif
1260*b095b053SXin Li 		pthreadpool_parallelize(
1261*b095b053SXin Li 			threadpool, parallelize_1d, NULL, 0,
1262*b095b053SXin Li 			(void*) task, argument, range, flags);
1263*b095b053SXin Li 	}
1264*b095b053SXin Li }
1265*b095b053SXin Li 
1266*b095b053SXin Li void pthreadpool_parallelize_1d_with_uarch(
1267*b095b053SXin Li 	pthreadpool_t threadpool,
1268*b095b053SXin Li 	pthreadpool_task_1d_with_id_t task,
1269*b095b053SXin Li 	void* argument,
1270*b095b053SXin Li 	uint32_t default_uarch_index,
1271*b095b053SXin Li 	uint32_t max_uarch_index,
1272*b095b053SXin Li 	size_t range,
1273*b095b053SXin Li 	uint32_t flags)
1274*b095b053SXin Li {
1275*b095b053SXin Li 	size_t threads_count;
1276*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) {
1277*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1278*b095b053SXin Li 
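		/*
		 * When cpuinfo support is compiled in, the current core's uarch index
		 * is queried and clamped back to default_uarch_index if it exceeds
		 * max_uarch_index, so the task never receives an out-of-range id.
		 */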
1279*b095b053SXin Li 		uint32_t uarch_index = default_uarch_index;
1280*b095b053SXin Li 		#if PTHREADPOOL_USE_CPUINFO
1281*b095b053SXin Li 			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
1282*b095b053SXin Li 			if (uarch_index > max_uarch_index) {
1283*b095b053SXin Li 				uarch_index = default_uarch_index;
1284*b095b053SXin Li 			}
1285*b095b053SXin Li 		#endif
1286*b095b053SXin Li 
1287*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1288*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1289*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1290*b095b053SXin Li 			disable_fpu_denormals();
1291*b095b053SXin Li 		}
1292*b095b053SXin Li 		for (size_t i = 0; i < range; i++) {
1293*b095b053SXin Li 			task(argument, uarch_index, i);
1294*b095b053SXin Li 		}
1295*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1296*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1297*b095b053SXin Li 		}
1298*b095b053SXin Li 	} else {
1299*b095b053SXin Li 		const struct pthreadpool_1d_with_uarch_params params = {
1300*b095b053SXin Li 			.default_uarch_index = default_uarch_index,
1301*b095b053SXin Li 			.max_uarch_index = max_uarch_index,
1302*b095b053SXin Li 		};
1303*b095b053SXin Li 		thread_function_t parallelize_1d_with_uarch = &thread_parallelize_1d_with_uarch;
1304*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1305*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1306*b095b053SXin Li 			if (range < range_threshold) {
1307*b095b053SXin Li 				parallelize_1d_with_uarch = &pthreadpool_thread_parallelize_1d_with_uarch_fastpath;
1308*b095b053SXin Li 			}
1309*b095b053SXin Li 		#endif
1310*b095b053SXin Li 		pthreadpool_parallelize(
1311*b095b053SXin Li 			threadpool, parallelize_1d_with_uarch, &params, sizeof(params),
1312*b095b053SXin Li 			task, argument, range, flags);
1313*b095b053SXin Li 	}
1314*b095b053SXin Li }
1315*b095b053SXin Li 
1316*b095b053SXin Li void pthreadpool_parallelize_1d_tile_1d(
1317*b095b053SXin Li 	pthreadpool_t threadpool,
1318*b095b053SXin Li 	pthreadpool_task_1d_tile_1d_t task,
1319*b095b053SXin Li 	void* argument,
1320*b095b053SXin Li 	size_t range,
1321*b095b053SXin Li 	size_t tile,
1322*b095b053SXin Li 	uint32_t flags)
1323*b095b053SXin Li {
1324*b095b053SXin Li 	size_t threads_count;
1325*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= tile) {
1326*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1327*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1328*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1329*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1330*b095b053SXin Li 			disable_fpu_denormals();
1331*b095b053SXin Li 		}
1332*b095b053SXin Li 		for (size_t i = 0; i < range; i += tile) {
1333*b095b053SXin Li 			task(argument, i, min(range - i, tile));
1334*b095b053SXin Li 		}
1335*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1336*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1337*b095b053SXin Li 		}
1338*b095b053SXin Li 	} else {
1339*b095b053SXin Li 		const size_t tile_range = divide_round_up(range, tile);
1340*b095b053SXin Li 		const struct pthreadpool_1d_tile_1d_params params = {
1341*b095b053SXin Li 			.range = range,
1342*b095b053SXin Li 			.tile = tile,
1343*b095b053SXin Li 		};
1344*b095b053SXin Li 		thread_function_t parallelize_1d_tile_1d = &thread_parallelize_1d_tile_1d;
1345*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1346*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1347*b095b053SXin Li 			if (range < range_threshold) {
1348*b095b053SXin Li 				parallelize_1d_tile_1d = &pthreadpool_thread_parallelize_1d_tile_1d_fastpath;
1349*b095b053SXin Li 			}
1350*b095b053SXin Li 		#endif
1351*b095b053SXin Li 		pthreadpool_parallelize(
1352*b095b053SXin Li 			threadpool, parallelize_1d_tile_1d, &params, sizeof(params),
1353*b095b053SXin Li 			task, argument, tile_range, flags);
1354*b095b053SXin Li 	}
1355*b095b053SXin Li }
1356*b095b053SXin Li 
1357*b095b053SXin Li void pthreadpool_parallelize_2d(
1358*b095b053SXin Li 	pthreadpool_t threadpool,
1359*b095b053SXin Li 	pthreadpool_task_2d_t task,
1360*b095b053SXin Li 	void* argument,
1361*b095b053SXin Li 	size_t range_i,
1362*b095b053SXin Li 	size_t range_j,
1363*b095b053SXin Li 	uint32_t flags)
1364*b095b053SXin Li {
1365*b095b053SXin Li 	size_t threads_count;
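	/*
	 * Note: (range_i | range_j) <= 1 is a branch-free check that both ranges
	 * are at most 1, i.e. that there is at most one item of work.
	 */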
1366*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j) <= 1) {
1367*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1368*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1369*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1370*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1371*b095b053SXin Li 			disable_fpu_denormals();
1372*b095b053SXin Li 		}
1373*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1374*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
1375*b095b053SXin Li 				task(argument, i, j);
1376*b095b053SXin Li 			}
1377*b095b053SXin Li 		}
1378*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1379*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1380*b095b053SXin Li 		}
1381*b095b053SXin Li 	} else {
1382*b095b053SXin Li 		const size_t range = range_i * range_j;
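		/*
		 * range_j is stored as an fxdiv divisor: fxdiv_init_size_t precomputes
		 * multiply/shift parameters so that worker threads can split a linear
		 * index back into (i, j) without a hardware division per item.
		 */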
1383*b095b053SXin Li 		const struct pthreadpool_2d_params params = {
1384*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
1385*b095b053SXin Li 		};
1386*b095b053SXin Li 		thread_function_t parallelize_2d = &thread_parallelize_2d;
1387*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1388*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1389*b095b053SXin Li 			if (range < range_threshold) {
1390*b095b053SXin Li 				parallelize_2d = &pthreadpool_thread_parallelize_2d_fastpath;
1391*b095b053SXin Li 			}
1392*b095b053SXin Li 		#endif
1393*b095b053SXin Li 		pthreadpool_parallelize(
1394*b095b053SXin Li 			threadpool, parallelize_2d, &params, sizeof(params),
1395*b095b053SXin Li 			task, argument, range, flags);
1396*b095b053SXin Li 	}
1397*b095b053SXin Li }
1398*b095b053SXin Li 
1399*b095b053SXin Li void pthreadpool_parallelize_2d_tile_1d(
1400*b095b053SXin Li 	pthreadpool_t threadpool,
1401*b095b053SXin Li 	pthreadpool_task_2d_tile_1d_t task,
1402*b095b053SXin Li 	void* argument,
1403*b095b053SXin Li 	size_t range_i,
1404*b095b053SXin Li 	size_t range_j,
1405*b095b053SXin Li 	size_t tile_j,
1406*b095b053SXin Li 	uint32_t flags)
1407*b095b053SXin Li {
1408*b095b053SXin Li 	size_t threads_count;
1409*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j)) {
1410*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1411*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1412*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1413*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1414*b095b053SXin Li 			disable_fpu_denormals();
1415*b095b053SXin Li 		}
1416*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1417*b095b053SXin Li 			for (size_t j = 0; j < range_j; j += tile_j) {
1418*b095b053SXin Li 				task(argument, i, j, min(range_j - j, tile_j));
1419*b095b053SXin Li 			}
1420*b095b053SXin Li 		}
1421*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1422*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1423*b095b053SXin Li 		}
1424*b095b053SXin Li 	} else {
1425*b095b053SXin Li 		const size_t tile_range_j = divide_round_up(range_j, tile_j);
1426*b095b053SXin Li 		const size_t tile_range = range_i * tile_range_j;
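		/*
		 * The pool distributes tile_range = range_i * ceil(range_j / tile_j)
		 * work items; each item corresponds to one (i, j-tile) pair.
		 */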
1427*b095b053SXin Li 		const struct pthreadpool_2d_tile_1d_params params = {
1428*b095b053SXin Li 			.range_j = range_j,
1429*b095b053SXin Li 			.tile_j = tile_j,
1430*b095b053SXin Li 			.tile_range_j = fxdiv_init_size_t(tile_range_j),
1431*b095b053SXin Li 		};
1432*b095b053SXin Li 		thread_function_t parallelize_2d_tile_1d = &thread_parallelize_2d_tile_1d;
1433*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1434*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1435*b095b053SXin Li 			if (tile_range < range_threshold) {
1436*b095b053SXin Li 				parallelize_2d_tile_1d = &pthreadpool_thread_parallelize_2d_tile_1d_fastpath;
1437*b095b053SXin Li 			}
1438*b095b053SXin Li 		#endif
1439*b095b053SXin Li 		pthreadpool_parallelize(
1440*b095b053SXin Li 			threadpool, parallelize_2d_tile_1d, &params, sizeof(params),
1441*b095b053SXin Li 			task, argument, tile_range, flags);
1442*b095b053SXin Li 	}
1443*b095b053SXin Li }
1444*b095b053SXin Li 
1445*b095b053SXin Li void pthreadpool_parallelize_2d_tile_2d(
1446*b095b053SXin Li 	pthreadpool_t threadpool,
1447*b095b053SXin Li 	pthreadpool_task_2d_tile_2d_t task,
1448*b095b053SXin Li 	void* argument,
1449*b095b053SXin Li 	size_t range_i,
1450*b095b053SXin Li 	size_t range_j,
1451*b095b053SXin Li 	size_t tile_i,
1452*b095b053SXin Li 	size_t tile_j,
1453*b095b053SXin Li 	uint32_t flags)
1454*b095b053SXin Li {
1455*b095b053SXin Li 	size_t threads_count;
1456*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) {
1457*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1458*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1459*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1460*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1461*b095b053SXin Li 			disable_fpu_denormals();
1462*b095b053SXin Li 		}
1463*b095b053SXin Li 		for (size_t i = 0; i < range_i; i += tile_i) {
1464*b095b053SXin Li 			for (size_t j = 0; j < range_j; j += tile_j) {
1465*b095b053SXin Li 				task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
1466*b095b053SXin Li 			}
1467*b095b053SXin Li 		}
1468*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1469*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1470*b095b053SXin Li 		}
1471*b095b053SXin Li 	} else {
1472*b095b053SXin Li 		const size_t tile_range_i = divide_round_up(range_i, tile_i);
1473*b095b053SXin Li 		const size_t tile_range_j = divide_round_up(range_j, tile_j);
1474*b095b053SXin Li 		const size_t tile_range = tile_range_i * tile_range_j;
1475*b095b053SXin Li 		const struct pthreadpool_2d_tile_2d_params params = {
1476*b095b053SXin Li 			.range_i = range_i,
1477*b095b053SXin Li 			.tile_i = tile_i,
1478*b095b053SXin Li 			.range_j = range_j,
1479*b095b053SXin Li 			.tile_j = tile_j,
1480*b095b053SXin Li 			.tile_range_j = fxdiv_init_size_t(tile_range_j),
1481*b095b053SXin Li 		};
1482*b095b053SXin Li 		thread_function_t parallelize_2d_tile_2d = &thread_parallelize_2d_tile_2d;
1483*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1484*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1485*b095b053SXin Li 			if (tile_range < range_threshold) {
1486*b095b053SXin Li 				parallelize_2d_tile_2d = &pthreadpool_thread_parallelize_2d_tile_2d_fastpath;
1487*b095b053SXin Li 			}
1488*b095b053SXin Li 		#endif
1489*b095b053SXin Li 		pthreadpool_parallelize(
1490*b095b053SXin Li 			threadpool, parallelize_2d_tile_2d, &params, sizeof(params),
1491*b095b053SXin Li 			task, argument, tile_range, flags);
1492*b095b053SXin Li 	}
1493*b095b053SXin Li }
1494*b095b053SXin Li 
1495*b095b053SXin Li void pthreadpool_parallelize_2d_tile_2d_with_uarch(
1496*b095b053SXin Li 	pthreadpool_t threadpool,
1497*b095b053SXin Li 	pthreadpool_task_2d_tile_2d_with_id_t task,
1498*b095b053SXin Li 	void* argument,
1499*b095b053SXin Li 	uint32_t default_uarch_index,
1500*b095b053SXin Li 	uint32_t max_uarch_index,
1501*b095b053SXin Li 	size_t range_i,
1502*b095b053SXin Li 	size_t range_j,
1503*b095b053SXin Li 	size_t tile_i,
1504*b095b053SXin Li 	size_t tile_j,
1505*b095b053SXin Li 	uint32_t flags)
1506*b095b053SXin Li {
1507*b095b053SXin Li 	size_t threads_count;
1508*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) {
1509*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1510*b095b053SXin Li 
1511*b095b053SXin Li 		uint32_t uarch_index = default_uarch_index;
1512*b095b053SXin Li 		#if PTHREADPOOL_USE_CPUINFO
1513*b095b053SXin Li 			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
1514*b095b053SXin Li 			if (uarch_index > max_uarch_index) {
1515*b095b053SXin Li 				uarch_index = default_uarch_index;
1516*b095b053SXin Li 			}
1517*b095b053SXin Li 		#endif
1518*b095b053SXin Li 
1519*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1520*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1521*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1522*b095b053SXin Li 			disable_fpu_denormals();
1523*b095b053SXin Li 		}
1524*b095b053SXin Li 		for (size_t i = 0; i < range_i; i += tile_i) {
1525*b095b053SXin Li 			for (size_t j = 0; j < range_j; j += tile_j) {
1526*b095b053SXin Li 				task(argument, uarch_index, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
1527*b095b053SXin Li 			}
1528*b095b053SXin Li 		}
1529*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1530*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1531*b095b053SXin Li 		}
1532*b095b053SXin Li 	} else {
1533*b095b053SXin Li 		const size_t tile_range_i = divide_round_up(range_i, tile_i);
1534*b095b053SXin Li 		const size_t tile_range_j = divide_round_up(range_j, tile_j);
1535*b095b053SXin Li 		const size_t tile_range = tile_range_i * tile_range_j;
1536*b095b053SXin Li 		const struct pthreadpool_2d_tile_2d_with_uarch_params params = {
1537*b095b053SXin Li 			.default_uarch_index = default_uarch_index,
1538*b095b053SXin Li 			.max_uarch_index = max_uarch_index,
1539*b095b053SXin Li 			.range_i = range_i,
1540*b095b053SXin Li 			.tile_i = tile_i,
1541*b095b053SXin Li 			.range_j = range_j,
1542*b095b053SXin Li 			.tile_j = tile_j,
1543*b095b053SXin Li 			.tile_range_j = fxdiv_init_size_t(tile_range_j),
1544*b095b053SXin Li 		};
1545*b095b053SXin Li 		thread_function_t parallelize_2d_tile_2d_with_uarch = &thread_parallelize_2d_tile_2d_with_uarch;
1546*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1547*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1548*b095b053SXin Li 			if (tile_range < range_threshold) {
1549*b095b053SXin Li 				parallelize_2d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath;
1550*b095b053SXin Li 			}
1551*b095b053SXin Li 		#endif
1552*b095b053SXin Li 		pthreadpool_parallelize(
1553*b095b053SXin Li 			threadpool, parallelize_2d_tile_2d_with_uarch, &params, sizeof(params),
1554*b095b053SXin Li 			task, argument, tile_range, flags);
1555*b095b053SXin Li 	}
1556*b095b053SXin Li }
1557*b095b053SXin Li 
1558*b095b053SXin Li void pthreadpool_parallelize_3d(
1559*b095b053SXin Li 	pthreadpool_t threadpool,
1560*b095b053SXin Li 	pthreadpool_task_3d_t task,
1561*b095b053SXin Li 	void* argument,
1562*b095b053SXin Li 	size_t range_i,
1563*b095b053SXin Li 	size_t range_j,
1564*b095b053SXin Li 	size_t range_k,
1565*b095b053SXin Li 	uint32_t flags)
1566*b095b053SXin Li {
1567*b095b053SXin Li 	size_t threads_count;
1568*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k) <= 1) {
1569*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1570*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1571*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1572*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1573*b095b053SXin Li 			disable_fpu_denormals();
1574*b095b053SXin Li 		}
1575*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1576*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
1577*b095b053SXin Li 				for (size_t k = 0; k < range_k; k++) {
1578*b095b053SXin Li 					task(argument, i, j, k);
1579*b095b053SXin Li 				}
1580*b095b053SXin Li 			}
1581*b095b053SXin Li 		}
1582*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1583*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1584*b095b053SXin Li 		}
1585*b095b053SXin Li 	} else {
1586*b095b053SXin Li 		const size_t range = range_i * range_j * range_k;
1587*b095b053SXin Li 		const struct pthreadpool_3d_params params = {
1588*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
1589*b095b053SXin Li 			.range_k = fxdiv_init_size_t(range_k),
1590*b095b053SXin Li 		};
1591*b095b053SXin Li 		thread_function_t parallelize_3d = &thread_parallelize_3d;
1592*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1593*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1594*b095b053SXin Li 			if (range < range_threshold) {
1595*b095b053SXin Li 				parallelize_3d = &pthreadpool_thread_parallelize_3d_fastpath;
1596*b095b053SXin Li 			}
1597*b095b053SXin Li 		#endif
1598*b095b053SXin Li 		pthreadpool_parallelize(
1599*b095b053SXin Li 			threadpool, parallelize_3d, &params, sizeof(params),
1600*b095b053SXin Li 			task, argument, range, flags);
1601*b095b053SXin Li 	}
1602*b095b053SXin Li }
1603*b095b053SXin Li 
1604*b095b053SXin Li void pthreadpool_parallelize_3d_tile_1d(
1605*b095b053SXin Li 	pthreadpool_t threadpool,
1606*b095b053SXin Li 	pthreadpool_task_3d_tile_1d_t task,
1607*b095b053SXin Li 	void* argument,
1608*b095b053SXin Li 	size_t range_i,
1609*b095b053SXin Li 	size_t range_j,
1610*b095b053SXin Li 	size_t range_k,
1611*b095b053SXin Li 	size_t tile_k,
1612*b095b053SXin Li 	uint32_t flags)
1613*b095b053SXin Li {
1614*b095b053SXin Li 	size_t threads_count;
1615*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) {
1616*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1617*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1618*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1619*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1620*b095b053SXin Li 			disable_fpu_denormals();
1621*b095b053SXin Li 		}
1622*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1623*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
1624*b095b053SXin Li 				for (size_t k = 0; k < range_k; k += tile_k) {
1625*b095b053SXin Li 					task(argument, i, j, k, min(range_k - k, tile_k));
1626*b095b053SXin Li 				}
1627*b095b053SXin Li 			}
1628*b095b053SXin Li 		}
1629*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1630*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1631*b095b053SXin Li 		}
1632*b095b053SXin Li 	} else {
1633*b095b053SXin Li 		const size_t tile_range_k = divide_round_up(range_k, tile_k);
1634*b095b053SXin Li 		const size_t tile_range = range_i * range_j * tile_range_k;
1635*b095b053SXin Li 		const struct pthreadpool_3d_tile_1d_params params = {
1636*b095b053SXin Li 			.range_k = range_k,
1637*b095b053SXin Li 			.tile_k = tile_k,
1638*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
1639*b095b053SXin Li 			.tile_range_k = fxdiv_init_size_t(tile_range_k),
1640*b095b053SXin Li 		};
1641*b095b053SXin Li 		thread_function_t parallelize_3d_tile_1d = &thread_parallelize_3d_tile_1d;
1642*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1643*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1644*b095b053SXin Li 			if (tile_range < range_threshold) {
1645*b095b053SXin Li 				parallelize_3d_tile_1d = &pthreadpool_thread_parallelize_3d_tile_1d_fastpath;
1646*b095b053SXin Li 			}
1647*b095b053SXin Li 		#endif
1648*b095b053SXin Li 		pthreadpool_parallelize(
1649*b095b053SXin Li 			threadpool, parallelize_3d_tile_1d, &params, sizeof(params),
1650*b095b053SXin Li 			task, argument, tile_range, flags);
1651*b095b053SXin Li 	}
1652*b095b053SXin Li }
1653*b095b053SXin Li 
1654*b095b053SXin Li void pthreadpool_parallelize_3d_tile_2d(
1655*b095b053SXin Li 	pthreadpool_t threadpool,
1656*b095b053SXin Li 	pthreadpool_task_3d_tile_2d_t task,
1657*b095b053SXin Li 	void* argument,
1658*b095b053SXin Li 	size_t range_i,
1659*b095b053SXin Li 	size_t range_j,
1660*b095b053SXin Li 	size_t range_k,
1661*b095b053SXin Li 	size_t tile_j,
1662*b095b053SXin Li 	size_t tile_k,
1663*b095b053SXin Li 	uint32_t flags)
1664*b095b053SXin Li {
1665*b095b053SXin Li 	size_t threads_count;
1666*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
1667*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1668*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1669*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1670*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1671*b095b053SXin Li 			disable_fpu_denormals();
1672*b095b053SXin Li 		}
1673*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1674*b095b053SXin Li 			for (size_t j = 0; j < range_j; j += tile_j) {
1675*b095b053SXin Li 				for (size_t k = 0; k < range_k; k += tile_k) {
1676*b095b053SXin Li 					task(argument, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k));
1677*b095b053SXin Li 				}
1678*b095b053SXin Li 			}
1679*b095b053SXin Li 		}
1680*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1681*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1682*b095b053SXin Li 		}
1683*b095b053SXin Li 	} else {
1684*b095b053SXin Li 		const size_t tile_range_j = divide_round_up(range_j, tile_j);
1685*b095b053SXin Li 		const size_t tile_range_k = divide_round_up(range_k, tile_k);
1686*b095b053SXin Li 		const size_t tile_range = range_i * tile_range_j * tile_range_k;
1687*b095b053SXin Li 		const struct pthreadpool_3d_tile_2d_params params = {
1688*b095b053SXin Li 			.range_j = range_j,
1689*b095b053SXin Li 			.tile_j = tile_j,
1690*b095b053SXin Li 			.range_k = range_k,
1691*b095b053SXin Li 			.tile_k = tile_k,
1692*b095b053SXin Li 			.tile_range_j = fxdiv_init_size_t(tile_range_j),
1693*b095b053SXin Li 			.tile_range_k = fxdiv_init_size_t(tile_range_k),
1694*b095b053SXin Li 		};
1695*b095b053SXin Li 		thread_function_t parallelize_3d_tile_2d = &thread_parallelize_3d_tile_2d;
1696*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1697*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1698*b095b053SXin Li 			if (tile_range < range_threshold) {
1699*b095b053SXin Li 				parallelize_3d_tile_2d = &pthreadpool_thread_parallelize_3d_tile_2d_fastpath;
1700*b095b053SXin Li 			}
1701*b095b053SXin Li 		#endif
1702*b095b053SXin Li 		pthreadpool_parallelize(
1703*b095b053SXin Li 			threadpool, parallelize_3d_tile_2d, &params, sizeof(params),
1704*b095b053SXin Li 			task, argument, tile_range, flags);
1705*b095b053SXin Li 	}
1706*b095b053SXin Li }
1707*b095b053SXin Li 
1708*b095b053SXin Li void pthreadpool_parallelize_3d_tile_2d_with_uarch(
1709*b095b053SXin Li 	pthreadpool_t threadpool,
1710*b095b053SXin Li 	pthreadpool_task_3d_tile_2d_with_id_t task,
1711*b095b053SXin Li 	void* argument,
1712*b095b053SXin Li 	uint32_t default_uarch_index,
1713*b095b053SXin Li 	uint32_t max_uarch_index,
1714*b095b053SXin Li 	size_t range_i,
1715*b095b053SXin Li 	size_t range_j,
1716*b095b053SXin Li 	size_t range_k,
1717*b095b053SXin Li 	size_t tile_j,
1718*b095b053SXin Li 	size_t tile_k,
1719*b095b053SXin Li 	uint32_t flags)
1720*b095b053SXin Li {
1721*b095b053SXin Li 	size_t threads_count;
1722*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
1723*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1724*b095b053SXin Li 
1725*b095b053SXin Li 		uint32_t uarch_index = default_uarch_index;
1726*b095b053SXin Li 		#if PTHREADPOOL_USE_CPUINFO
1727*b095b053SXin Li 			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
1728*b095b053SXin Li 			if (uarch_index > max_uarch_index) {
1729*b095b053SXin Li 				uarch_index = default_uarch_index;
1730*b095b053SXin Li 			}
1731*b095b053SXin Li 		#endif
1732*b095b053SXin Li 
1733*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1734*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1735*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1736*b095b053SXin Li 			disable_fpu_denormals();
1737*b095b053SXin Li 		}
1738*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1739*b095b053SXin Li 			for (size_t j = 0; j < range_j; j += tile_j) {
1740*b095b053SXin Li 				for (size_t k = 0; k < range_k; k += tile_k) {
1741*b095b053SXin Li 					task(argument, uarch_index, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k));
1742*b095b053SXin Li 				}
1743*b095b053SXin Li 			}
1744*b095b053SXin Li 		}
1745*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1746*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1747*b095b053SXin Li 		}
1748*b095b053SXin Li 	} else {
1749*b095b053SXin Li 		const size_t tile_range_j = divide_round_up(range_j, tile_j);
1750*b095b053SXin Li 		const size_t tile_range_k = divide_round_up(range_k, tile_k);
1751*b095b053SXin Li 		const size_t tile_range = range_i * tile_range_j * tile_range_k;
1752*b095b053SXin Li 		const struct pthreadpool_3d_tile_2d_with_uarch_params params = {
1753*b095b053SXin Li 			.default_uarch_index = default_uarch_index,
1754*b095b053SXin Li 			.max_uarch_index = max_uarch_index,
1755*b095b053SXin Li 			.range_j = range_j,
1756*b095b053SXin Li 			.tile_j = tile_j,
1757*b095b053SXin Li 			.range_k = range_k,
1758*b095b053SXin Li 			.tile_k = tile_k,
1759*b095b053SXin Li 			.tile_range_j = fxdiv_init_size_t(tile_range_j),
1760*b095b053SXin Li 			.tile_range_k = fxdiv_init_size_t(tile_range_k),
1761*b095b053SXin Li 		};
1762*b095b053SXin Li 		thread_function_t parallelize_3d_tile_2d_with_uarch = &thread_parallelize_3d_tile_2d_with_uarch;
1763*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1764*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1765*b095b053SXin Li 			if (tile_range < range_threshold) {
1766*b095b053SXin Li 				parallelize_3d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath;
1767*b095b053SXin Li 			}
1768*b095b053SXin Li 		#endif
1769*b095b053SXin Li 		pthreadpool_parallelize(
1770*b095b053SXin Li 			threadpool, parallelize_3d_tile_2d_with_uarch, &params, sizeof(params),
1771*b095b053SXin Li 			task, argument, tile_range, flags);
1772*b095b053SXin Li 	}
1773*b095b053SXin Li }
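/*
 * Usage sketch (editorial, not part of the original file): calling the
 * microarchitecture-aware variant. `gemm_tile`, `ctx`, and the numeric arguments
 * are hypothetical; the uarch_index passed to the task stays in [0, max_uarch_index],
 * falling back to default_uarch_index otherwise (see the check above).
 *
 *   static void gemm_tile(void* ctx, uint32_t uarch_index, size_t i,
 *       size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
 *       // dispatch to a per-uarch kernel, e.g. kernels[uarch_index](...)
 *   }
 *
 *   pthreadpool_parallelize_3d_tile_2d_with_uarch(
 *       threadpool, gemm_tile, &ctx,
 *       0,        // default_uarch_index
 *       1,        // max_uarch_index
 *       batch, rows, cols,
 *       32, 32,   // tile_j, tile_k
 *       0);       // flags
 */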
1774*b095b053SXin Li 
1775*b095b053SXin Li void pthreadpool_parallelize_4d(
1776*b095b053SXin Li 	pthreadpool_t threadpool,
1777*b095b053SXin Li 	pthreadpool_task_4d_t task,
1778*b095b053SXin Li 	void* argument,
1779*b095b053SXin Li 	size_t range_i,
1780*b095b053SXin Li 	size_t range_j,
1781*b095b053SXin Li 	size_t range_k,
1782*b095b053SXin Li 	size_t range_l,
1783*b095b053SXin Li 	uint32_t flags)
1784*b095b053SXin Li {
1785*b095b053SXin Li 	size_t threads_count;
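	/*
	 * Editorial note (not in the original source): the bitwise OR in the check below
	 * is a compact way to test that every range is at most 1 (each is 0 or 1), i.e.
	 * the loop nest invokes the task at most once, so the sequential fallback is cheapest.
	 */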
1786*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l) <= 1) {
1787*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1788*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1789*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1790*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1791*b095b053SXin Li 			disable_fpu_denormals();
1792*b095b053SXin Li 		}
1793*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1794*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
1795*b095b053SXin Li 				for (size_t k = 0; k < range_k; k++) {
1796*b095b053SXin Li 					for (size_t l = 0; l < range_l; l++) {
1797*b095b053SXin Li 						task(argument, i, j, k, l);
1798*b095b053SXin Li 					}
1799*b095b053SXin Li 				}
1800*b095b053SXin Li 			}
1801*b095b053SXin Li 		}
1802*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1803*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1804*b095b053SXin Li 		}
1805*b095b053SXin Li 	} else {
1806*b095b053SXin Li 		const size_t range_kl = range_k * range_l;
1807*b095b053SXin Li 		const size_t range = range_i * range_j * range_kl;
1808*b095b053SXin Li 		const struct pthreadpool_4d_params params = {
1809*b095b053SXin Li 			.range_k = range_k,
1810*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
1811*b095b053SXin Li 			.range_kl = fxdiv_init_size_t(range_kl),
1812*b095b053SXin Li 			.range_l = fxdiv_init_size_t(range_l),
1813*b095b053SXin Li 		};
1814*b095b053SXin Li 		thread_function_t parallelize_4d = &thread_parallelize_4d;
1815*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1816*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1817*b095b053SXin Li 			if (range < range_threshold) {
1818*b095b053SXin Li 				parallelize_4d = &pthreadpool_thread_parallelize_4d_fastpath;
1819*b095b053SXin Li 			}
1820*b095b053SXin Li 		#endif
1821*b095b053SXin Li 		pthreadpool_parallelize(
1822*b095b053SXin Li 			threadpool, parallelize_4d, &params, sizeof(params),
1823*b095b053SXin Li 			task, argument, range, flags);
1824*b095b053SXin Li 	}
1825*b095b053SXin Li }
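/*
 * Usage sketch (editorial, not part of the original file): iterating an untiled 4-D
 * index space, e.g. an NCHW tensor. `fill_element` and `tensor` are hypothetical names.
 *
 *   static void fill_element(void* tensor, size_t n, size_t c, size_t h, size_t w) {
 *       // compute one element at (n, c, h, w)
 *   }
 *
 *   pthreadpool_parallelize_4d(
 *       threadpool, fill_element, tensor,
 *       batch, channels, height, width,
 *       0);   // flags
 */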
1826*b095b053SXin Li 
1827*b095b053SXin Li void pthreadpool_parallelize_4d_tile_1d(
1828*b095b053SXin Li 	pthreadpool_t threadpool,
1829*b095b053SXin Li 	pthreadpool_task_4d_tile_1d_t task,
1830*b095b053SXin Li 	void* argument,
1831*b095b053SXin Li 	size_t range_i,
1832*b095b053SXin Li 	size_t range_j,
1833*b095b053SXin Li 	size_t range_k,
1834*b095b053SXin Li 	size_t range_l,
1835*b095b053SXin Li 	size_t tile_l,
1836*b095b053SXin Li 	uint32_t flags)
1837*b095b053SXin Li {
1838*b095b053SXin Li 	size_t threads_count;
1839*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l)) {
1840*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1841*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1842*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1843*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1844*b095b053SXin Li 			disable_fpu_denormals();
1845*b095b053SXin Li 		}
1846*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1847*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
1848*b095b053SXin Li 				for (size_t k = 0; k < range_k; k++) {
1849*b095b053SXin Li 					for (size_t l = 0; l < range_l; l += tile_l) {
1850*b095b053SXin Li 						task(argument, i, j, k, l, min(range_l - l, tile_l));
1851*b095b053SXin Li 					}
1852*b095b053SXin Li 				}
1853*b095b053SXin Li 			}
1854*b095b053SXin Li 		}
1855*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1856*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1857*b095b053SXin Li 		}
1858*b095b053SXin Li 	} else {
1859*b095b053SXin Li 		const size_t tile_range_l = divide_round_up(range_l, tile_l);
1860*b095b053SXin Li 		const size_t tile_range_kl = range_k * tile_range_l;
1861*b095b053SXin Li 		const size_t tile_range = range_i * range_j * tile_range_kl;
1862*b095b053SXin Li 		const struct pthreadpool_4d_tile_1d_params params = {
1863*b095b053SXin Li 			.range_k = range_k,
1864*b095b053SXin Li 			.range_l = range_l,
1865*b095b053SXin Li 			.tile_l = tile_l,
1866*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
1867*b095b053SXin Li 			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
1868*b095b053SXin Li 			.tile_range_l = fxdiv_init_size_t(tile_range_l),
1869*b095b053SXin Li 		};
1870*b095b053SXin Li 		thread_function_t parallelize_4d_tile_1d = &thread_parallelize_4d_tile_1d;
1871*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1872*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1873*b095b053SXin Li 			if (tile_range < range_threshold) {
1874*b095b053SXin Li 				parallelize_4d_tile_1d = &pthreadpool_thread_parallelize_4d_tile_1d_fastpath;
1875*b095b053SXin Li 			}
1876*b095b053SXin Li 		#endif
1877*b095b053SXin Li 		pthreadpool_parallelize(
1878*b095b053SXin Li 			threadpool, parallelize_4d_tile_1d, &params, sizeof(params),
1879*b095b053SXin Li 			task, argument, tile_range, flags);
1880*b095b053SXin Li 	}
1881*b095b053SXin Li }
1882*b095b053SXin Li 
1883*b095b053SXin Li void pthreadpool_parallelize_4d_tile_2d(
1884*b095b053SXin Li 	pthreadpool_t threadpool,
1885*b095b053SXin Li 	pthreadpool_task_4d_tile_2d_t task,
1886*b095b053SXin Li 	void* argument,
1887*b095b053SXin Li 	size_t range_i,
1888*b095b053SXin Li 	size_t range_j,
1889*b095b053SXin Li 	size_t range_k,
1890*b095b053SXin Li 	size_t range_l,
1891*b095b053SXin Li 	size_t tile_k,
1892*b095b053SXin Li 	size_t tile_l,
1893*b095b053SXin Li 	uint32_t flags)
1894*b095b053SXin Li {
1895*b095b053SXin Li 	size_t threads_count;
1896*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
1897*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1898*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1899*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1900*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1901*b095b053SXin Li 			disable_fpu_denormals();
1902*b095b053SXin Li 		}
1903*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1904*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
1905*b095b053SXin Li 				for (size_t k = 0; k < range_k; k += tile_k) {
1906*b095b053SXin Li 					for (size_t l = 0; l < range_l; l += tile_l) {
1907*b095b053SXin Li 						task(argument, i, j, k, l,
1908*b095b053SXin Li 							min(range_k - k, tile_k), min(range_l - l, tile_l));
1909*b095b053SXin Li 					}
1910*b095b053SXin Li 				}
1911*b095b053SXin Li 			}
1912*b095b053SXin Li 		}
1913*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1914*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1915*b095b053SXin Li 		}
1916*b095b053SXin Li 	} else {
1917*b095b053SXin Li 		const size_t tile_range_l = divide_round_up(range_l, tile_l);
1918*b095b053SXin Li 		const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l;
1919*b095b053SXin Li 		const size_t tile_range = range_i * range_j * tile_range_kl;
1920*b095b053SXin Li 		const struct pthreadpool_4d_tile_2d_params params = {
1921*b095b053SXin Li 			.range_k = range_k,
1922*b095b053SXin Li 			.tile_k = tile_k,
1923*b095b053SXin Li 			.range_l = range_l,
1924*b095b053SXin Li 			.tile_l = tile_l,
1925*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
1926*b095b053SXin Li 			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
1927*b095b053SXin Li 			.tile_range_l = fxdiv_init_size_t(tile_range_l),
1928*b095b053SXin Li 		};
1929*b095b053SXin Li 		thread_function_t parallelize_4d_tile_2d = &thread_parallelize_4d_tile_2d;
1930*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
1931*b095b053SXin Li 			const size_t range_threshold = -threads_count;
1932*b095b053SXin Li 			if (tile_range < range_threshold) {
1933*b095b053SXin Li 				parallelize_4d_tile_2d = &pthreadpool_thread_parallelize_4d_tile_2d_fastpath;
1934*b095b053SXin Li 			}
1935*b095b053SXin Li 		#endif
1936*b095b053SXin Li 		pthreadpool_parallelize(
1937*b095b053SXin Li 			threadpool, parallelize_4d_tile_2d, &params, sizeof(params),
1938*b095b053SXin Li 			task, argument, tile_range, flags);
1939*b095b053SXin Li 	}
1940*b095b053SXin Li }
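/*
 * Usage sketch (editorial, not part of the original file): tiling the two innermost
 * dimensions. The task receives the tile start indices plus the actual tile extents,
 * which shrink at the range boundary (see the min() calls in the sequential fallback
 * above). `blur_tile` and `ctx` are hypothetical names.
 *
 *   static void blur_tile(void* ctx, size_t n, size_t c,
 *       size_t y0, size_t x0, size_t tile_y, size_t tile_x) {
 *       // process rows [y0, y0 + tile_y) and columns [x0, x0 + tile_x)
 *   }
 *
 *   pthreadpool_parallelize_4d_tile_2d(
 *       threadpool, blur_tile, &ctx,
 *       batch, channels, height, width,
 *       8, 8,   // tile_k, tile_l
 *       0);     // flags
 */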
1941*b095b053SXin Li 
1942*b095b053SXin Li void pthreadpool_parallelize_4d_tile_2d_with_uarch(
1943*b095b053SXin Li 	pthreadpool_t threadpool,
1944*b095b053SXin Li 	pthreadpool_task_4d_tile_2d_with_id_t task,
1945*b095b053SXin Li 	void* argument,
1946*b095b053SXin Li 	uint32_t default_uarch_index,
1947*b095b053SXin Li 	uint32_t max_uarch_index,
1948*b095b053SXin Li 	size_t range_i,
1949*b095b053SXin Li 	size_t range_j,
1950*b095b053SXin Li 	size_t range_k,
1951*b095b053SXin Li 	size_t range_l,
1952*b095b053SXin Li 	size_t tile_k,
1953*b095b053SXin Li 	size_t tile_l,
1954*b095b053SXin Li 	uint32_t flags)
1955*b095b053SXin Li {
1956*b095b053SXin Li 	size_t threads_count;
1957*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
1958*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
1959*b095b053SXin Li 
1960*b095b053SXin Li 		uint32_t uarch_index = default_uarch_index;
1961*b095b053SXin Li 		#if PTHREADPOOL_USE_CPUINFO
1962*b095b053SXin Li 			uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
1963*b095b053SXin Li 			if (uarch_index > max_uarch_index) {
1964*b095b053SXin Li 				uarch_index = default_uarch_index;
1965*b095b053SXin Li 			}
1966*b095b053SXin Li 		#endif
1967*b095b053SXin Li 
1968*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
1969*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1970*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
1971*b095b053SXin Li 			disable_fpu_denormals();
1972*b095b053SXin Li 		}
1973*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
1974*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
1975*b095b053SXin Li 				for (size_t k = 0; k < range_k; k += tile_k) {
1976*b095b053SXin Li 					for (size_t l = 0; l < range_l; l += tile_l) {
1977*b095b053SXin Li 						task(argument, uarch_index, i, j, k, l,
1978*b095b053SXin Li 							min(range_k - k, tile_k), min(range_l - l, tile_l));
1979*b095b053SXin Li 					}
1980*b095b053SXin Li 				}
1981*b095b053SXin Li 			}
1982*b095b053SXin Li 		}
1983*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
1984*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
1985*b095b053SXin Li 		}
1986*b095b053SXin Li 	} else {
1987*b095b053SXin Li 		const size_t tile_range_l = divide_round_up(range_l, tile_l);
1988*b095b053SXin Li 		const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l;
1989*b095b053SXin Li 		const size_t tile_range = range_i * range_j * tile_range_kl;
1990*b095b053SXin Li 		const struct pthreadpool_4d_tile_2d_with_uarch_params params = {
1991*b095b053SXin Li 			.default_uarch_index = default_uarch_index,
1992*b095b053SXin Li 			.max_uarch_index = max_uarch_index,
1993*b095b053SXin Li 			.range_k = range_k,
1994*b095b053SXin Li 			.tile_k = tile_k,
1995*b095b053SXin Li 			.range_l = range_l,
1996*b095b053SXin Li 			.tile_l = tile_l,
1997*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
1998*b095b053SXin Li 			.tile_range_kl = fxdiv_init_size_t(tile_range_kl),
1999*b095b053SXin Li 			.tile_range_l = fxdiv_init_size_t(tile_range_l),
2000*b095b053SXin Li 		};
2001*b095b053SXin Li 		thread_function_t parallelize_4d_tile_2d_with_uarch = &thread_parallelize_4d_tile_2d_with_uarch;
2002*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
2003*b095b053SXin Li 			const size_t range_threshold = -threads_count;
2004*b095b053SXin Li 			if (tile_range < range_threshold) {
2005*b095b053SXin Li 				parallelize_4d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath;
2006*b095b053SXin Li 			}
2007*b095b053SXin Li 		#endif
2008*b095b053SXin Li 		pthreadpool_parallelize(
2009*b095b053SXin Li 			threadpool, parallelize_4d_tile_2d_with_uarch, &params, sizeof(params),
2010*b095b053SXin Li 			task, argument, tile_range, flags);
2011*b095b053SXin Li 	}
2012*b095b053SXin Li }
2013*b095b053SXin Li 
2014*b095b053SXin Li void pthreadpool_parallelize_5d(
2015*b095b053SXin Li 	pthreadpool_t threadpool,
2016*b095b053SXin Li 	pthreadpool_task_5d_t task,
2017*b095b053SXin Li 	void* argument,
2018*b095b053SXin Li 	size_t range_i,
2019*b095b053SXin Li 	size_t range_j,
2020*b095b053SXin Li 	size_t range_k,
2021*b095b053SXin Li 	size_t range_l,
2022*b095b053SXin Li 	size_t range_m,
2023*b095b053SXin Li 	uint32_t flags)
2024*b095b053SXin Li {
2025*b095b053SXin Li 	size_t threads_count;
2026*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m) <= 1) {
2027*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
2028*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
2029*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2030*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
2031*b095b053SXin Li 			disable_fpu_denormals();
2032*b095b053SXin Li 		}
2033*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
2034*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
2035*b095b053SXin Li 				for (size_t k = 0; k < range_k; k++) {
2036*b095b053SXin Li 					for (size_t l = 0; l < range_l; l++) {
2037*b095b053SXin Li 						for (size_t m = 0; m < range_m; m++) {
2038*b095b053SXin Li 							task(argument, i, j, k, l, m);
2039*b095b053SXin Li 						}
2040*b095b053SXin Li 					}
2041*b095b053SXin Li 				}
2042*b095b053SXin Li 			}
2043*b095b053SXin Li 		}
2044*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2045*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
2046*b095b053SXin Li 		}
2047*b095b053SXin Li 	} else {
2048*b095b053SXin Li 		const size_t range_lm = range_l * range_m;
2049*b095b053SXin Li 		const size_t range = range_i * range_j * range_k * range_lm;
2050*b095b053SXin Li 		const struct pthreadpool_5d_params params = {
2051*b095b053SXin Li 			.range_l = range_l,
2052*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
2053*b095b053SXin Li 			.range_k = fxdiv_init_size_t(range_k),
2054*b095b053SXin Li 			.range_lm = fxdiv_init_size_t(range_lm),
2055*b095b053SXin Li 			.range_m = fxdiv_init_size_t(range_m),
2056*b095b053SXin Li 		};
2057*b095b053SXin Li 		thread_function_t parallelize_5d = &thread_parallelize_5d;
2058*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
2059*b095b053SXin Li 			const size_t range_threshold = -threads_count;
2060*b095b053SXin Li 			if (range < range_threshold) {
2061*b095b053SXin Li 				parallelize_5d = &pthreadpool_thread_parallelize_5d_fastpath;
2062*b095b053SXin Li 			}
2063*b095b053SXin Li 		#endif
2064*b095b053SXin Li 		pthreadpool_parallelize(
2065*b095b053SXin Li 			threadpool, parallelize_5d, &params, sizeof(params),
2066*b095b053SXin Li 			task, argument, range, flags);
2067*b095b053SXin Li 	}
2068*b095b053SXin Li }
2069*b095b053SXin Li 
2070*b095b053SXin Li void pthreadpool_parallelize_5d_tile_1d(
2071*b095b053SXin Li 	pthreadpool_t threadpool,
2072*b095b053SXin Li 	pthreadpool_task_5d_tile_1d_t task,
2073*b095b053SXin Li 	void* argument,
2074*b095b053SXin Li 	size_t range_i,
2075*b095b053SXin Li 	size_t range_j,
2076*b095b053SXin Li 	size_t range_k,
2077*b095b053SXin Li 	size_t range_l,
2078*b095b053SXin Li 	size_t range_m,
2079*b095b053SXin Li 	size_t tile_m,
2080*b095b053SXin Li 	uint32_t flags)
2081*b095b053SXin Li {
2082*b095b053SXin Li 	size_t threads_count;
2083*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m)) {
2084*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
2085*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
2086*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2087*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
2088*b095b053SXin Li 			disable_fpu_denormals();
2089*b095b053SXin Li 		}
2090*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
2091*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
2092*b095b053SXin Li 				for (size_t k = 0; k < range_k; k++) {
2093*b095b053SXin Li 					for (size_t l = 0; l < range_l; l++) {
2094*b095b053SXin Li 						for (size_t m = 0; m < range_m; m += tile_m) {
2095*b095b053SXin Li 							task(argument, i, j, k, l, m, min(range_m - m, tile_m));
2096*b095b053SXin Li 						}
2097*b095b053SXin Li 					}
2098*b095b053SXin Li 				}
2099*b095b053SXin Li 			}
2100*b095b053SXin Li 		}
2101*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2102*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
2103*b095b053SXin Li 		}
2104*b095b053SXin Li 	} else {
2105*b095b053SXin Li 		const size_t tile_range_m = divide_round_up(range_m, tile_m);
2106*b095b053SXin Li 		const size_t range_kl = range_k * range_l;
2107*b095b053SXin Li 		const size_t tile_range = range_i * range_j * range_kl * tile_range_m;
2108*b095b053SXin Li 		const struct pthreadpool_5d_tile_1d_params params = {
2109*b095b053SXin Li 			.range_k = range_k,
2110*b095b053SXin Li 			.range_m = range_m,
2111*b095b053SXin Li 			.tile_m = tile_m,
2112*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
2113*b095b053SXin Li 			.range_kl = fxdiv_init_size_t(range_kl),
2114*b095b053SXin Li 			.range_l = fxdiv_init_size_t(range_l),
2115*b095b053SXin Li 			.tile_range_m = fxdiv_init_size_t(tile_range_m),
2116*b095b053SXin Li 		};
2117*b095b053SXin Li 		thread_function_t parallelize_5d_tile_1d = &thread_parallelize_5d_tile_1d;
2118*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
2119*b095b053SXin Li 			const size_t range_threshold = -threads_count;
2120*b095b053SXin Li 			if (tile_range < range_threshold) {
2121*b095b053SXin Li 				parallelize_5d_tile_1d = &pthreadpool_thread_parallelize_5d_tile_1d_fastpath;
2122*b095b053SXin Li 			}
2123*b095b053SXin Li 		#endif
2124*b095b053SXin Li 		pthreadpool_parallelize(
2125*b095b053SXin Li 			threadpool, parallelize_5d_tile_1d, &params, sizeof(params),
2126*b095b053SXin Li 			task, argument, tile_range, flags);
2127*b095b053SXin Li 	}
2128*b095b053SXin Li }
2129*b095b053SXin Li 
2130*b095b053SXin Li void pthreadpool_parallelize_5d_tile_2d(
2131*b095b053SXin Li 	pthreadpool_t threadpool,
2132*b095b053SXin Li 	pthreadpool_task_5d_tile_2d_t task,
2133*b095b053SXin Li 	void* argument,
2134*b095b053SXin Li 	size_t range_i,
2135*b095b053SXin Li 	size_t range_j,
2136*b095b053SXin Li 	size_t range_k,
2137*b095b053SXin Li 	size_t range_l,
2138*b095b053SXin Li 	size_t range_m,
2139*b095b053SXin Li 	size_t tile_l,
2140*b095b053SXin Li 	size_t tile_m,
2141*b095b053SXin Li 	uint32_t flags)
2142*b095b053SXin Li {
2143*b095b053SXin Li 	size_t threads_count;
2144*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l && range_m <= tile_m)) {
2145*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
2146*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
2147*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2148*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
2149*b095b053SXin Li 			disable_fpu_denormals();
2150*b095b053SXin Li 		}
2151*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
2152*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
2153*b095b053SXin Li 				for (size_t k = 0; k < range_k; k++) {
2154*b095b053SXin Li 					for (size_t l = 0; l < range_l; l += tile_l) {
2155*b095b053SXin Li 						for (size_t m = 0; m < range_m; m += tile_m) {
2156*b095b053SXin Li 							task(argument, i, j, k, l, m,
2157*b095b053SXin Li 								min(range_l - l, tile_l), min(range_m - m, tile_m));
2158*b095b053SXin Li 						}
2159*b095b053SXin Li 					}
2160*b095b053SXin Li 				}
2161*b095b053SXin Li 			}
2162*b095b053SXin Li 		}
2163*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2164*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
2165*b095b053SXin Li 		}
2166*b095b053SXin Li 	} else {
2167*b095b053SXin Li 		const size_t tile_range_m = divide_round_up(range_m, tile_m);
2168*b095b053SXin Li 		const size_t tile_range_lm = divide_round_up(range_l, tile_l) * tile_range_m;
2169*b095b053SXin Li 		const size_t tile_range = range_i * range_j * range_k * tile_range_lm;
2170*b095b053SXin Li 		const struct pthreadpool_5d_tile_2d_params params = {
2171*b095b053SXin Li 			.range_l = range_l,
2172*b095b053SXin Li 			.tile_l = tile_l,
2173*b095b053SXin Li 			.range_m = range_m,
2174*b095b053SXin Li 			.tile_m = tile_m,
2175*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
2176*b095b053SXin Li 			.range_k = fxdiv_init_size_t(range_k),
2177*b095b053SXin Li 			.tile_range_lm = fxdiv_init_size_t(tile_range_lm),
2178*b095b053SXin Li 			.tile_range_m = fxdiv_init_size_t(tile_range_m),
2179*b095b053SXin Li 		};
2180*b095b053SXin Li 		thread_function_t parallelize_5d_tile_2d = &thread_parallelize_5d_tile_2d;
2181*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
2182*b095b053SXin Li 			const size_t range_threshold = -threads_count;
2183*b095b053SXin Li 			if (tile_range < range_threshold) {
2184*b095b053SXin Li 				parallelize_5d_tile_2d = &pthreadpool_thread_parallelize_5d_tile_2d_fastpath;
2185*b095b053SXin Li 			}
2186*b095b053SXin Li 		#endif
2187*b095b053SXin Li 		pthreadpool_parallelize(
2188*b095b053SXin Li 			threadpool, parallelize_5d_tile_2d, &params, sizeof(params),
2189*b095b053SXin Li 			task, argument, tile_range, flags);
2190*b095b053SXin Li 	}
2191*b095b053SXin Li }
2192*b095b053SXin Li 
2193*b095b053SXin Li void pthreadpool_parallelize_6d(
2194*b095b053SXin Li 	pthreadpool_t threadpool,
2195*b095b053SXin Li 	pthreadpool_task_6d_t task,
2196*b095b053SXin Li 	void* argument,
2197*b095b053SXin Li 	size_t range_i,
2198*b095b053SXin Li 	size_t range_j,
2199*b095b053SXin Li 	size_t range_k,
2200*b095b053SXin Li 	size_t range_l,
2201*b095b053SXin Li 	size_t range_m,
2202*b095b053SXin Li 	size_t range_n,
2203*b095b053SXin Li 	uint32_t flags)
2204*b095b053SXin Li {
2205*b095b053SXin Li 	size_t threads_count;
2206*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m | range_n) <= 1) {
2207*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
2208*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
2209*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2210*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
2211*b095b053SXin Li 			disable_fpu_denormals();
2212*b095b053SXin Li 		}
2213*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
2214*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
2215*b095b053SXin Li 				for (size_t k = 0; k < range_k; k++) {
2216*b095b053SXin Li 					for (size_t l = 0; l < range_l; l++) {
2217*b095b053SXin Li 						for (size_t m = 0; m < range_m; m++) {
2218*b095b053SXin Li 							for (size_t n = 0; n < range_n; n++) {
2219*b095b053SXin Li 								task(argument, i, j, k, l, m, n);
2220*b095b053SXin Li 							}
2221*b095b053SXin Li 						}
2222*b095b053SXin Li 					}
2223*b095b053SXin Li 				}
2224*b095b053SXin Li 			}
2225*b095b053SXin Li 		}
2226*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2227*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
2228*b095b053SXin Li 		}
2229*b095b053SXin Li 	} else {
2230*b095b053SXin Li 		const size_t range_lmn = range_l * range_m * range_n;
2231*b095b053SXin Li 		const size_t range = range_i * range_j * range_k * range_lmn;
2232*b095b053SXin Li 		const struct pthreadpool_6d_params params = {
2233*b095b053SXin Li 			.range_l = range_l,
2234*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
2235*b095b053SXin Li 			.range_k = fxdiv_init_size_t(range_k),
2236*b095b053SXin Li 			.range_lmn = fxdiv_init_size_t(range_lmn),
2237*b095b053SXin Li 			.range_m = fxdiv_init_size_t(range_m),
2238*b095b053SXin Li 			.range_n = fxdiv_init_size_t(range_n),
2239*b095b053SXin Li 		};
2240*b095b053SXin Li 		thread_function_t parallelize_6d = &thread_parallelize_6d;
2241*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
2242*b095b053SXin Li 			const size_t range_threshold = -threads_count;
2243*b095b053SXin Li 			if (range < range_threshold) {
2244*b095b053SXin Li 				parallelize_6d = &pthreadpool_thread_parallelize_6d_fastpath;
2245*b095b053SXin Li 			}
2246*b095b053SXin Li 		#endif
2247*b095b053SXin Li 		pthreadpool_parallelize(
2248*b095b053SXin Li 			threadpool, parallelize_6d, &params, sizeof(params),
2249*b095b053SXin Li 			task, argument, range, flags);
2250*b095b053SXin Li 	}
2251*b095b053SXin Li }
2252*b095b053SXin Li 
2253*b095b053SXin Li void pthreadpool_parallelize_6d_tile_1d(
2254*b095b053SXin Li 	pthreadpool_t threadpool,
2255*b095b053SXin Li 	pthreadpool_task_6d_tile_1d_t task,
2256*b095b053SXin Li 	void* argument,
2257*b095b053SXin Li 	size_t range_i,
2258*b095b053SXin Li 	size_t range_j,
2259*b095b053SXin Li 	size_t range_k,
2260*b095b053SXin Li 	size_t range_l,
2261*b095b053SXin Li 	size_t range_m,
2262*b095b053SXin Li 	size_t range_n,
2263*b095b053SXin Li 	size_t tile_n,
2264*b095b053SXin Li 	uint32_t flags)
2265*b095b053SXin Li {
2266*b095b053SXin Li 	size_t threads_count;
2267*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l | range_m) <= 1 && range_n <= tile_n)) {
2268*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
2269*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
2270*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2271*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
2272*b095b053SXin Li 			disable_fpu_denormals();
2273*b095b053SXin Li 		}
2274*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
2275*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
2276*b095b053SXin Li 				for (size_t k = 0; k < range_k; k++) {
2277*b095b053SXin Li 					for (size_t l = 0; l < range_l; l++) {
2278*b095b053SXin Li 						for (size_t m = 0; m < range_m; m++) {
2279*b095b053SXin Li 							for (size_t n = 0; n < range_n; n += tile_n) {
2280*b095b053SXin Li 								task(argument, i, j, k, l, m, n, min(range_n - n, tile_n));
2281*b095b053SXin Li 							}
2282*b095b053SXin Li 						}
2283*b095b053SXin Li 					}
2284*b095b053SXin Li 				}
2285*b095b053SXin Li 			}
2286*b095b053SXin Li 		}
2287*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2288*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
2289*b095b053SXin Li 		}
2290*b095b053SXin Li 	} else {
2291*b095b053SXin Li 		const size_t tile_range_n = divide_round_up(range_n, tile_n);
2292*b095b053SXin Li 		const size_t tile_range_lmn = range_l * range_m * tile_range_n;
2293*b095b053SXin Li 		const size_t tile_range = range_i * range_j * range_k * tile_range_lmn;
2294*b095b053SXin Li 		const struct pthreadpool_6d_tile_1d_params params = {
2295*b095b053SXin Li 			.range_l = range_l,
2296*b095b053SXin Li 			.range_n = range_n,
2297*b095b053SXin Li 			.tile_n = tile_n,
2298*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
2299*b095b053SXin Li 			.range_k = fxdiv_init_size_t(range_k),
2300*b095b053SXin Li 			.tile_range_lmn = fxdiv_init_size_t(tile_range_lmn),
2301*b095b053SXin Li 			.range_m = fxdiv_init_size_t(range_m),
2302*b095b053SXin Li 			.tile_range_n = fxdiv_init_size_t(tile_range_n),
2303*b095b053SXin Li 		};
2304*b095b053SXin Li 		thread_function_t parallelize_6d_tile_1d = &thread_parallelize_6d_tile_1d;
2305*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
2306*b095b053SXin Li 			const size_t range_threshold = -threads_count;
2307*b095b053SXin Li 			if (tile_range < range_threshold) {
2308*b095b053SXin Li 				parallelize_6d_tile_1d = &pthreadpool_thread_parallelize_6d_tile_1d_fastpath;
2309*b095b053SXin Li 			}
2310*b095b053SXin Li 		#endif
2311*b095b053SXin Li 		pthreadpool_parallelize(
2312*b095b053SXin Li 			threadpool, parallelize_6d_tile_1d, &params, sizeof(params),
2313*b095b053SXin Li 			task, argument, tile_range, flags);
2314*b095b053SXin Li 	}
2315*b095b053SXin Li }
2316*b095b053SXin Li 
2317*b095b053SXin Li void pthreadpool_parallelize_6d_tile_2d(
2318*b095b053SXin Li 	pthreadpool_t threadpool,
2319*b095b053SXin Li 	pthreadpool_task_6d_tile_2d_t task,
2320*b095b053SXin Li 	void* argument,
2321*b095b053SXin Li 	size_t range_i,
2322*b095b053SXin Li 	size_t range_j,
2323*b095b053SXin Li 	size_t range_k,
2324*b095b053SXin Li 	size_t range_l,
2325*b095b053SXin Li 	size_t range_m,
2326*b095b053SXin Li 	size_t range_n,
2327*b095b053SXin Li 	size_t tile_m,
2328*b095b053SXin Li 	size_t tile_n,
2329*b095b053SXin Li 	uint32_t flags)
2330*b095b053SXin Li {
2331*b095b053SXin Li 	size_t threads_count;
2332*b095b053SXin Li 	if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m && range_n <= tile_n)) {
2333*b095b053SXin Li 		/* No thread pool used: execute task sequentially on the calling thread */
2334*b095b053SXin Li 		struct fpu_state saved_fpu_state = { 0 };
2335*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2336*b095b053SXin Li 			saved_fpu_state = get_fpu_state();
2337*b095b053SXin Li 			disable_fpu_denormals();
2338*b095b053SXin Li 		}
2339*b095b053SXin Li 		for (size_t i = 0; i < range_i; i++) {
2340*b095b053SXin Li 			for (size_t j = 0; j < range_j; j++) {
2341*b095b053SXin Li 				for (size_t k = 0; k < range_k; k++) {
2342*b095b053SXin Li 					for (size_t l = 0; l < range_l; l++) {
2343*b095b053SXin Li 						for (size_t m = 0; m < range_m; m += tile_m) {
2344*b095b053SXin Li 							for (size_t n = 0; n < range_n; n += tile_n) {
2345*b095b053SXin Li 								task(argument, i, j, k, l, m, n,
2346*b095b053SXin Li 									min(range_m - m, tile_m), min(range_n - n, tile_n));
2347*b095b053SXin Li 							}
2348*b095b053SXin Li 						}
2349*b095b053SXin Li 					}
2350*b095b053SXin Li 				}
2351*b095b053SXin Li 			}
2352*b095b053SXin Li 		}
2353*b095b053SXin Li 		if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
2354*b095b053SXin Li 			set_fpu_state(saved_fpu_state);
2355*b095b053SXin Li 		}
2356*b095b053SXin Li 	} else {
2357*b095b053SXin Li 		const size_t range_kl = range_k * range_l;
2358*b095b053SXin Li 		const size_t tile_range_n = divide_round_up(range_n, tile_n);
2359*b095b053SXin Li 		const size_t tile_range_mn = divide_round_up(range_m, tile_m) * tile_range_n;
2360*b095b053SXin Li 		const size_t tile_range = range_i * range_j * range_kl * tile_range_mn;
2361*b095b053SXin Li 		const struct pthreadpool_6d_tile_2d_params params = {
2362*b095b053SXin Li 			.range_k = range_k,
2363*b095b053SXin Li 			.range_m = range_m,
2364*b095b053SXin Li 			.tile_m = tile_m,
2365*b095b053SXin Li 			.range_n = range_n,
2366*b095b053SXin Li 			.tile_n = tile_n,
2367*b095b053SXin Li 			.range_j = fxdiv_init_size_t(range_j),
2368*b095b053SXin Li 			.range_kl = fxdiv_init_size_t(range_kl),
2369*b095b053SXin Li 			.range_l = fxdiv_init_size_t(range_l),
2370*b095b053SXin Li 			.tile_range_mn = fxdiv_init_size_t(tile_range_mn),
2371*b095b053SXin Li 			.tile_range_n = fxdiv_init_size_t(tile_range_n),
2372*b095b053SXin Li 		};
2373*b095b053SXin Li 		thread_function_t parallelize_6d_tile_2d = &thread_parallelize_6d_tile_2d;
2374*b095b053SXin Li 		#if PTHREADPOOL_USE_FASTPATH
2375*b095b053SXin Li 			const size_t range_threshold = -threads_count;
2376*b095b053SXin Li 			if (tile_range < range_threshold) {
2377*b095b053SXin Li 				parallelize_6d_tile_2d = &pthreadpool_thread_parallelize_6d_tile_2d_fastpath;
2378*b095b053SXin Li 			}
2379*b095b053SXin Li 		#endif
2380*b095b053SXin Li 		pthreadpool_parallelize(
2381*b095b053SXin Li 			threadpool, parallelize_6d_tile_2d, &params, sizeof(params),
2382*b095b053SXin Li 			task, argument, tile_range, flags);
2383*b095b053SXin Li 	}
2384*b095b053SXin Li }
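/*
 * Usage sketch (editorial, not part of the original file): the deepest variant,
 * combined with PTHREADPOOL_FLAG_DISABLE_DENORMALS so that denormal floats are
 * flushed to zero for the duration of the call, on workers as well as in the
 * sequential fallback above. `conv_tile`, `ctx`, and the numeric arguments are
 * hypothetical.
 *
 *   static void conv_tile(void* ctx, size_t n, size_t g, size_t oc, size_t ic,
 *       size_t y0, size_t x0, size_t tile_y, size_t tile_x) {
 *       // process the (tile_y x tile_x) output tile starting at (y0, x0)
 *   }
 *
 *   pthreadpool_parallelize_6d_tile_2d(
 *       threadpool, conv_tile, &ctx,
 *       batch, groups, out_channels, in_channels, out_height, out_width,
 *       4, 16,   // tile_m, tile_n
 *       PTHREADPOOL_FLAG_DISABLE_DENORMALS);
 */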
2385