/* xref: /aosp_15_r20/external/pthreadpool/src/fastpath.c (revision b095b0533730c2930f947df924a4486d266faa1a) */
/* Standard C headers */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#if PTHREADPOOL_USE_CPUINFO
	#include <cpuinfo.h>
#endif

/* Dependencies */
#include <fxdiv.h>

/* Public library header */
#include <pthreadpool.h>

/* Internal library headers */
#include "threadpool-atomics.h"
#include "threadpool-common.h"
#include "threadpool-object.h"
#include "threadpool-utils.h"


pthreadpool_thread_parallelize_1d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)25*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_fastpath(
26*b095b053SXin Li 	struct pthreadpool* threadpool,
27*b095b053SXin Li 	struct thread_info* thread)
28*b095b053SXin Li {
29*b095b053SXin Li 	assert(threadpool != NULL);
30*b095b053SXin Li 	assert(thread != NULL);
31*b095b053SXin Li 
32*b095b053SXin Li 	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
33*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
34*b095b053SXin Li 
35*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
36*b095b053SXin Li 	const size_t range_threshold = -threads_count;
37*b095b053SXin Li 
38*b095b053SXin Li 	/* Process thread's own range of items */
39*b095b053SXin Li 	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
40*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
41*b095b053SXin Li 		task(argument, range_start++);
42*b095b053SXin Li 	}
43*b095b053SXin Li 
44*b095b053SXin Li 	/* There still may be other threads with work */
45*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
46*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
47*b095b053SXin Li 		tid != thread_number;
48*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
49*b095b053SXin Li 	{
50*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
51*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
52*b095b053SXin Li 			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
53*b095b053SXin Li 			task(argument, index);
54*b095b053SXin Li 		}
55*b095b053SXin Li 	}
56*b095b053SXin Li 
57*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
58*b095b053SXin Li 	pthreadpool_fence_release();
59*b095b053SXin Li }
60*b095b053SXin Li 
pthreadpool_thread_parallelize_1d_with_uarch_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)61*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_uarch_fastpath(
62*b095b053SXin Li 	struct pthreadpool* threadpool,
63*b095b053SXin Li 	struct thread_info* thread)
64*b095b053SXin Li {
65*b095b053SXin Li 	assert(threadpool != NULL);
66*b095b053SXin Li 	assert(thread != NULL);
67*b095b053SXin Li 
68*b095b053SXin Li 	const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
69*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
70*b095b053SXin Li 
71*b095b053SXin Li 	const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index;
72*b095b053SXin Li 	uint32_t uarch_index = default_uarch_index;
73*b095b053SXin Li 	#if PTHREADPOOL_USE_CPUINFO
74*b095b053SXin Li 		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
75*b095b053SXin Li 		if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) {
76*b095b053SXin Li 			uarch_index = default_uarch_index;
77*b095b053SXin Li 		}
78*b095b053SXin Li 	#endif
79*b095b053SXin Li 
80*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
81*b095b053SXin Li 	const size_t range_threshold = -threads_count;
82*b095b053SXin Li 
83*b095b053SXin Li 	/* Process thread's own range of items */
84*b095b053SXin Li 	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
85*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
86*b095b053SXin Li 		task(argument, uarch_index, range_start++);
87*b095b053SXin Li 	}
88*b095b053SXin Li 
89*b095b053SXin Li 	/* There still may be other threads with work */
90*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
91*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
92*b095b053SXin Li 		tid != thread_number;
93*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
94*b095b053SXin Li 	{
95*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
96*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
97*b095b053SXin Li 			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
98*b095b053SXin Li 			task(argument, uarch_index, index);
99*b095b053SXin Li 		}
100*b095b053SXin Li 	}
101*b095b053SXin Li 
102*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
103*b095b053SXin Li 	pthreadpool_fence_release();
104*b095b053SXin Li }
105*b095b053SXin Li 
pthreadpool_thread_parallelize_1d_tile_1d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)106*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_tile_1d_fastpath(
107*b095b053SXin Li 	struct pthreadpool* threadpool,
108*b095b053SXin Li 	struct thread_info* thread)
109*b095b053SXin Li {
110*b095b053SXin Li 	assert(threadpool != NULL);
111*b095b053SXin Li 	assert(thread != NULL);
112*b095b053SXin Li 
113*b095b053SXin Li 	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
114*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
115*b095b053SXin Li 
116*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
117*b095b053SXin Li 	const size_t range_threshold = -threads_count;
118*b095b053SXin Li 
119*b095b053SXin Li 	/* Process thread's own range of items */
120*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
121*b095b053SXin Li 	const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile;
122*b095b053SXin Li 	size_t tile_start = range_start * tile;
123*b095b053SXin Li 
124*b095b053SXin Li 	const size_t range = threadpool->params.parallelize_1d_tile_1d.range;
125*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
126*b095b053SXin Li 		task(argument, tile_start, min(range - tile_start, tile));
127*b095b053SXin Li 		tile_start += tile;
128*b095b053SXin Li 	}
129*b095b053SXin Li 
130*b095b053SXin Li 	/* There still may be other threads with work */
131*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
132*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
133*b095b053SXin Li 		tid != thread_number;
134*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
135*b095b053SXin Li 	{
136*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
137*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
138*b095b053SXin Li 			const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
139*b095b053SXin Li 			const size_t tile_start = tile_index * tile;
140*b095b053SXin Li 			task(argument, tile_start, min(range - tile_start, tile));
141*b095b053SXin Li 		}
142*b095b053SXin Li 	}
143*b095b053SXin Li 
144*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
145*b095b053SXin Li 	pthreadpool_fence_release();
146*b095b053SXin Li }
147*b095b053SXin Li 
pthreadpool_thread_parallelize_2d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)148*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_fastpath(
149*b095b053SXin Li 	struct pthreadpool* threadpool,
150*b095b053SXin Li 	struct thread_info* thread)
151*b095b053SXin Li {
152*b095b053SXin Li 	assert(threadpool != NULL);
153*b095b053SXin Li 	assert(thread != NULL);
154*b095b053SXin Li 
155*b095b053SXin Li 	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
156*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
157*b095b053SXin Li 
158*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
159*b095b053SXin Li 	const size_t range_threshold = -threads_count;
160*b095b053SXin Li 
161*b095b053SXin Li 	/* Process thread's own range of items */
162*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
163*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j;
164*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j);
165*b095b053SXin Li 	size_t i = index_i_j.quotient;
166*b095b053SXin Li 	size_t j = index_i_j.remainder;
167*b095b053SXin Li 
168*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
169*b095b053SXin Li 		task(argument, i, j);
170*b095b053SXin Li 		if (++j == range_j.value) {
171*b095b053SXin Li 			j = 0;
172*b095b053SXin Li 			i += 1;
173*b095b053SXin Li 		}
174*b095b053SXin Li 	}
175*b095b053SXin Li 
176*b095b053SXin Li 	/* There still may be other threads with work */
177*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
178*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
179*b095b053SXin Li 		tid != thread_number;
180*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
181*b095b053SXin Li 	{
182*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
183*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
184*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
185*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j);
186*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder);
187*b095b053SXin Li 		}
188*b095b053SXin Li 	}
189*b095b053SXin Li 
190*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
191*b095b053SXin Li 	pthreadpool_fence_release();
192*b095b053SXin Li }
193*b095b053SXin Li 
pthreadpool_thread_parallelize_2d_tile_1d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)194*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_fastpath(
195*b095b053SXin Li 	struct pthreadpool* threadpool,
196*b095b053SXin Li 	struct thread_info* thread)
197*b095b053SXin Li {
198*b095b053SXin Li 	assert(threadpool != NULL);
199*b095b053SXin Li 	assert(thread != NULL);
200*b095b053SXin Li 
201*b095b053SXin Li 	const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
202*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
203*b095b053SXin Li 
204*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
205*b095b053SXin Li 	const size_t range_threshold = -threads_count;
206*b095b053SXin Li 
207*b095b053SXin Li 	/* Process thread's own range of items */
208*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
209*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j;
210*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
211*b095b053SXin Li 	const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j;
212*b095b053SXin Li 	size_t i = tile_index_i_j.quotient;
213*b095b053SXin Li 	size_t start_j = tile_index_i_j.remainder * tile_j;
214*b095b053SXin Li 
215*b095b053SXin Li 	const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j;
216*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
217*b095b053SXin Li 		task(argument, i, start_j, min(range_j - start_j, tile_j));
218*b095b053SXin Li 		start_j += tile_j;
219*b095b053SXin Li 		if (start_j >= range_j) {
220*b095b053SXin Li 			start_j = 0;
221*b095b053SXin Li 			i += 1;
222*b095b053SXin Li 		}
223*b095b053SXin Li 	}
224*b095b053SXin Li 
225*b095b053SXin Li 	/* There still may be other threads with work */
226*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
227*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
228*b095b053SXin Li 		tid != thread_number;
229*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
230*b095b053SXin Li 	{
231*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
232*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
233*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
234*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
235*b095b053SXin Li 			const size_t start_j = tile_index_i_j.remainder * tile_j;
236*b095b053SXin Li 			task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
237*b095b053SXin Li 		}
238*b095b053SXin Li 	}
239*b095b053SXin Li 
240*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
241*b095b053SXin Li 	pthreadpool_fence_release();
242*b095b053SXin Li }
243*b095b053SXin Li 
pthreadpool_thread_parallelize_2d_tile_2d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)244*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_fastpath(
245*b095b053SXin Li 	struct pthreadpool* threadpool,
246*b095b053SXin Li 	struct thread_info* thread)
247*b095b053SXin Li {
248*b095b053SXin Li 	assert(threadpool != NULL);
249*b095b053SXin Li 	assert(thread != NULL);
250*b095b053SXin Li 
251*b095b053SXin Li 	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
252*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
253*b095b053SXin Li 
254*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
255*b095b053SXin Li 	const size_t range_threshold = -threads_count;
256*b095b053SXin Li 
257*b095b053SXin Li 	/* Process thread's own range of items */
258*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
259*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
260*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
261*b095b053SXin Li 	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
262*b095b053SXin Li 	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
263*b095b053SXin Li 	size_t start_i = tile_index_i_j.quotient * tile_i;
264*b095b053SXin Li 	size_t start_j = tile_index_i_j.remainder * tile_j;
265*b095b053SXin Li 
266*b095b053SXin Li 	const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
267*b095b053SXin Li 	const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
268*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
269*b095b053SXin Li 		task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
270*b095b053SXin Li 		start_j += tile_j;
271*b095b053SXin Li 		if (start_j >= range_j) {
272*b095b053SXin Li 			start_j = 0;
273*b095b053SXin Li 			start_i += tile_i;
274*b095b053SXin Li 		}
275*b095b053SXin Li 	}
276*b095b053SXin Li 
277*b095b053SXin Li 	/* There still may be other threads with work */
278*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
279*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
280*b095b053SXin Li 		tid != thread_number;
281*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
282*b095b053SXin Li 	{
283*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
284*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
285*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
286*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
287*b095b053SXin Li 			const size_t start_i = tile_index_i_j.quotient * tile_i;
288*b095b053SXin Li 			const size_t start_j = tile_index_i_j.remainder * tile_j;
289*b095b053SXin Li 			task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
290*b095b053SXin Li 		}
291*b095b053SXin Li 	}
292*b095b053SXin Li 
293*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
294*b095b053SXin Li 	pthreadpool_fence_release();
295*b095b053SXin Li }
296*b095b053SXin Li 
pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)297*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath(
298*b095b053SXin Li 	struct pthreadpool* threadpool,
299*b095b053SXin Li 	struct thread_info* thread)
300*b095b053SXin Li {
301*b095b053SXin Li 	assert(threadpool != NULL);
302*b095b053SXin Li 	assert(thread != NULL);
303*b095b053SXin Li 
304*b095b053SXin Li 	const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
305*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
306*b095b053SXin Li 
307*b095b053SXin Li 	const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
308*b095b053SXin Li 	uint32_t uarch_index = default_uarch_index;
309*b095b053SXin Li 	#if PTHREADPOOL_USE_CPUINFO
310*b095b053SXin Li 		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
311*b095b053SXin Li 		if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
312*b095b053SXin Li 			uarch_index = default_uarch_index;
313*b095b053SXin Li 		}
314*b095b053SXin Li 	#endif
315*b095b053SXin Li 
316*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
317*b095b053SXin Li 	const size_t range_threshold = -threads_count;
318*b095b053SXin Li 
319*b095b053SXin Li 	/* Process thread's own range of items */
320*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
321*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
322*b095b053SXin Li 	const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
323*b095b053SXin Li 	const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
324*b095b053SXin Li 	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
325*b095b053SXin Li 	const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
326*b095b053SXin Li 	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
327*b095b053SXin Li 	size_t start_i = index.quotient * tile_i;
328*b095b053SXin Li 	size_t start_j = index.remainder * tile_j;
329*b095b053SXin Li 
330*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
331*b095b053SXin Li 		task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
332*b095b053SXin Li 		start_j += tile_j;
333*b095b053SXin Li 		if (start_j >= range_j) {
334*b095b053SXin Li 			start_j = 0;
335*b095b053SXin Li 			start_i += tile_i;
336*b095b053SXin Li 		}
337*b095b053SXin Li 	}
338*b095b053SXin Li 
339*b095b053SXin Li 	/* There still may be other threads with work */
340*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
341*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
342*b095b053SXin Li 		tid != thread_number;
343*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
344*b095b053SXin Li 	{
345*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
346*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
347*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
348*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
349*b095b053SXin Li 			const size_t start_i = tile_index_i_j.quotient * tile_i;
350*b095b053SXin Li 			const size_t start_j = tile_index_i_j.remainder * tile_j;
351*b095b053SXin Li 			task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
352*b095b053SXin Li 		}
353*b095b053SXin Li 	}
354*b095b053SXin Li 
355*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
356*b095b053SXin Li 	pthreadpool_fence_release();
357*b095b053SXin Li }
358*b095b053SXin Li 
pthreadpool_thread_parallelize_3d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)359*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_fastpath(
360*b095b053SXin Li 	struct pthreadpool* threadpool,
361*b095b053SXin Li 	struct thread_info* thread)
362*b095b053SXin Li {
363*b095b053SXin Li 	assert(threadpool != NULL);
364*b095b053SXin Li 	assert(thread != NULL);
365*b095b053SXin Li 
366*b095b053SXin Li 	const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
367*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
368*b095b053SXin Li 
369*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
370*b095b053SXin Li 	const size_t range_threshold = -threads_count;
371*b095b053SXin Li 
372*b095b053SXin Li 	/* Process thread's own range of items */
373*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
374*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
375*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
376*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
377*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
378*b095b053SXin Li 	size_t i = index_i_j.quotient;
379*b095b053SXin Li 	size_t j = index_i_j.remainder;
380*b095b053SXin Li 	size_t k = index_ij_k.remainder;
381*b095b053SXin Li 
382*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
383*b095b053SXin Li 		task(argument, i, j, k);
384*b095b053SXin Li 		if (++k == range_k.value) {
385*b095b053SXin Li 			k = 0;
386*b095b053SXin Li 			if (++j == range_j.value) {
387*b095b053SXin Li 				j = 0;
388*b095b053SXin Li 				i += 1;
389*b095b053SXin Li 			}
390*b095b053SXin Li 		}
391*b095b053SXin Li 	}
392*b095b053SXin Li 
393*b095b053SXin Li 	/* There still may be other threads with work */
394*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
395*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
396*b095b053SXin Li 		tid != thread_number;
397*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
398*b095b053SXin Li 	{
399*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
400*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
401*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
402*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
403*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
404*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
405*b095b053SXin Li 		}
406*b095b053SXin Li 	}
407*b095b053SXin Li 
408*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
409*b095b053SXin Li 	pthreadpool_fence_release();
410*b095b053SXin Li }
411*b095b053SXin Li 
pthreadpool_thread_parallelize_3d_tile_1d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)412*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_fastpath(
413*b095b053SXin Li 	struct pthreadpool* threadpool,
414*b095b053SXin Li 	struct thread_info* thread)
415*b095b053SXin Li {
416*b095b053SXin Li 	assert(threadpool != NULL);
417*b095b053SXin Li 	assert(thread != NULL);
418*b095b053SXin Li 
419*b095b053SXin Li 	const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
420*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
421*b095b053SXin Li 
422*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
423*b095b053SXin Li 	const size_t range_threshold = -threads_count;
424*b095b053SXin Li 
425*b095b053SXin Li 	/* Process thread's own range of items */
426*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
427*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
428*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
429*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
430*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
431*b095b053SXin Li 	const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
432*b095b053SXin Li 	size_t i = index_i_j.quotient;
433*b095b053SXin Li 	size_t j = index_i_j.remainder;
434*b095b053SXin Li 	size_t start_k = tile_index_ij_k.remainder * tile_k;
435*b095b053SXin Li 
436*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
437*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
438*b095b053SXin Li 		task(argument, i, j, start_k, min(range_k - start_k, tile_k));
439*b095b053SXin Li 		start_k += tile_k;
440*b095b053SXin Li 		if (start_k >= range_k) {
441*b095b053SXin Li 			start_k = 0;
442*b095b053SXin Li 			if (++j == range_j.value) {
443*b095b053SXin Li 				j = 0;
444*b095b053SXin Li 				i += 1;
445*b095b053SXin Li 			}
446*b095b053SXin Li 		}
447*b095b053SXin Li 	}
448*b095b053SXin Li 
449*b095b053SXin Li 	/* There still may be other threads with work */
450*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
451*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
452*b095b053SXin Li 		tid != thread_number;
453*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
454*b095b053SXin Li 	{
455*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
456*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
457*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
458*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
459*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
460*b095b053SXin Li 			const size_t start_k = tile_index_ij_k.remainder * tile_k;
461*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
462*b095b053SXin Li 		}
463*b095b053SXin Li 	}
464*b095b053SXin Li 
465*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
466*b095b053SXin Li 	pthreadpool_fence_release();
467*b095b053SXin Li }
468*b095b053SXin Li 
pthreadpool_thread_parallelize_3d_tile_2d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)469*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_fastpath(
470*b095b053SXin Li 	struct pthreadpool* threadpool,
471*b095b053SXin Li 	struct thread_info* thread)
472*b095b053SXin Li {
473*b095b053SXin Li 	assert(threadpool != NULL);
474*b095b053SXin Li 	assert(thread != NULL);
475*b095b053SXin Li 
476*b095b053SXin Li 	const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
477*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
478*b095b053SXin Li 
479*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
480*b095b053SXin Li 	const size_t range_threshold = -threads_count;
481*b095b053SXin Li 
482*b095b053SXin Li 	/* Process thread's own range of items */
483*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
484*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
485*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
486*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
487*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
488*b095b053SXin Li 	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
489*b095b053SXin Li 	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
490*b095b053SXin Li 	size_t i = tile_index_i_j.quotient;
491*b095b053SXin Li 	size_t start_j = tile_index_i_j.remainder * tile_j;
492*b095b053SXin Li 	size_t start_k = tile_index_ij_k.remainder * tile_k;
493*b095b053SXin Li 
494*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
495*b095b053SXin Li 	const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
496*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
497*b095b053SXin Li 		task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
498*b095b053SXin Li 		start_k += tile_k;
499*b095b053SXin Li 		if (start_k >= range_k) {
500*b095b053SXin Li 			start_k = 0;
501*b095b053SXin Li 			start_j += tile_j;
502*b095b053SXin Li 			if (start_j >= range_j) {
503*b095b053SXin Li 				start_j = 0;
504*b095b053SXin Li 				i += 1;
505*b095b053SXin Li 			}
506*b095b053SXin Li 		}
507*b095b053SXin Li 	}
508*b095b053SXin Li 
509*b095b053SXin Li 	/* There still may be other threads with work */
510*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
511*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
512*b095b053SXin Li 		tid != thread_number;
513*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
514*b095b053SXin Li 	{
515*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
516*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
517*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
518*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
519*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
520*b095b053SXin Li 			const size_t start_j = tile_index_i_j.remainder * tile_j;
521*b095b053SXin Li 			const size_t start_k = tile_index_ij_k.remainder * tile_k;
522*b095b053SXin Li 			task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
523*b095b053SXin Li 		}
524*b095b053SXin Li 	}
525*b095b053SXin Li 
526*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
527*b095b053SXin Li 	pthreadpool_fence_release();
528*b095b053SXin Li }
529*b095b053SXin Li 
pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)530*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath(
531*b095b053SXin Li 	struct pthreadpool* threadpool,
532*b095b053SXin Li 	struct thread_info* thread)
533*b095b053SXin Li {
534*b095b053SXin Li 	assert(threadpool != NULL);
535*b095b053SXin Li 	assert(thread != NULL);
536*b095b053SXin Li 
537*b095b053SXin Li 	const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
538*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
539*b095b053SXin Li 
540*b095b053SXin Li 	const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
541*b095b053SXin Li 	uint32_t uarch_index = default_uarch_index;
542*b095b053SXin Li 	#if PTHREADPOOL_USE_CPUINFO
543*b095b053SXin Li 		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
544*b095b053SXin Li 		if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
545*b095b053SXin Li 			uarch_index = default_uarch_index;
546*b095b053SXin Li 		}
547*b095b053SXin Li 	#endif
548*b095b053SXin Li 
549*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
550*b095b053SXin Li 	const size_t range_threshold = -threads_count;
551*b095b053SXin Li 
552*b095b053SXin Li 	/* Process thread's own range of items */
553*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
554*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
555*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
556*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
557*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
558*b095b053SXin Li 	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
559*b095b053SXin Li 	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
560*b095b053SXin Li 	size_t i = tile_index_i_j.quotient;
561*b095b053SXin Li 	size_t start_j = tile_index_i_j.remainder * tile_j;
562*b095b053SXin Li 	size_t start_k = tile_index_ij_k.remainder * tile_k;
563*b095b053SXin Li 
564*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
565*b095b053SXin Li 	const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
566*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
567*b095b053SXin Li 		task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
568*b095b053SXin Li 		start_k += tile_k;
569*b095b053SXin Li 		if (start_k >= range_k) {
570*b095b053SXin Li 			start_k = 0;
571*b095b053SXin Li 			start_j += tile_j;
572*b095b053SXin Li 			if (start_j >= range_j) {
573*b095b053SXin Li 				start_j = 0;
574*b095b053SXin Li 				i += 1;
575*b095b053SXin Li 			}
576*b095b053SXin Li 		}
577*b095b053SXin Li 	}
578*b095b053SXin Li 
579*b095b053SXin Li 	/* There still may be other threads with work */
580*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
581*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
582*b095b053SXin Li 		tid != thread_number;
583*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
584*b095b053SXin Li 	{
585*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
586*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
587*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
588*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
589*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
590*b095b053SXin Li 			const size_t start_j = tile_index_i_j.remainder * tile_j;
591*b095b053SXin Li 			const size_t start_k = tile_index_ij_k.remainder * tile_k;
592*b095b053SXin Li 			task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
593*b095b053SXin Li 		}
594*b095b053SXin Li 	}
595*b095b053SXin Li 
596*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
597*b095b053SXin Li 	pthreadpool_fence_release();
598*b095b053SXin Li }
599*b095b053SXin Li 
pthreadpool_thread_parallelize_4d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)600*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_fastpath(
601*b095b053SXin Li 	struct pthreadpool* threadpool,
602*b095b053SXin Li 	struct thread_info* thread)
603*b095b053SXin Li {
604*b095b053SXin Li 	assert(threadpool != NULL);
605*b095b053SXin Li 	assert(thread != NULL);
606*b095b053SXin Li 
607*b095b053SXin Li 	const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
608*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
609*b095b053SXin Li 
610*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
611*b095b053SXin Li 	const size_t range_threshold = -threads_count;
612*b095b053SXin Li 
613*b095b053SXin Li 	/* Process thread's own range of items */
614*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
615*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
616*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
617*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
618*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
619*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
620*b095b053SXin Li 	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
621*b095b053SXin Li 	size_t i = index_i_j.quotient;
622*b095b053SXin Li 	size_t j = index_i_j.remainder;
623*b095b053SXin Li 	size_t k = index_k_l.quotient;
624*b095b053SXin Li 	size_t l = index_k_l.remainder;
625*b095b053SXin Li 
626*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_4d.range_k;
627*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
628*b095b053SXin Li 		task(argument, i, j, k, l);
629*b095b053SXin Li 		if (++l == range_l.value) {
630*b095b053SXin Li 			l = 0;
631*b095b053SXin Li 			if (++k == range_k) {
632*b095b053SXin Li 				k = 0;
633*b095b053SXin Li 				if (++j == range_j.value) {
634*b095b053SXin Li 					j = 0;
635*b095b053SXin Li 					i += 1;
636*b095b053SXin Li 				}
637*b095b053SXin Li 			}
638*b095b053SXin Li 		}
639*b095b053SXin Li 	}
640*b095b053SXin Li 
641*b095b053SXin Li 	/* There still may be other threads with work */
642*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
643*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
644*b095b053SXin Li 		tid != thread_number;
645*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
646*b095b053SXin Li 	{
647*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
648*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
649*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
650*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
651*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
652*b095b053SXin Li 			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
653*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
654*b095b053SXin Li 		}
655*b095b053SXin Li 	}
656*b095b053SXin Li 
657*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
658*b095b053SXin Li 	pthreadpool_fence_release();
659*b095b053SXin Li }
660*b095b053SXin Li 
pthreadpool_thread_parallelize_4d_tile_1d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)661*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_1d_fastpath(
662*b095b053SXin Li 	struct pthreadpool* threadpool,
663*b095b053SXin Li 	struct thread_info* thread)
664*b095b053SXin Li {
665*b095b053SXin Li 	assert(threadpool != NULL);
666*b095b053SXin Li 	assert(thread != NULL);
667*b095b053SXin Li 
668*b095b053SXin Li 	const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
669*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
670*b095b053SXin Li 
671*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
672*b095b053SXin Li 	const size_t range_threshold = -threads_count;
673*b095b053SXin Li 
674*b095b053SXin Li 	/* Process thread's own range of items */
675*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
676*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
677*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
678*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
679*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
680*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
681*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
682*b095b053SXin Li 	const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
683*b095b053SXin Li 	size_t i = index_i_j.quotient;
684*b095b053SXin Li 	size_t j = index_i_j.remainder;
685*b095b053SXin Li 	size_t k = tile_index_k_l.quotient;
686*b095b053SXin Li 	size_t start_l = tile_index_k_l.remainder * tile_l;
687*b095b053SXin Li 
688*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
689*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
690*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
691*b095b053SXin Li 		task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
692*b095b053SXin Li 		start_l += tile_l;
693*b095b053SXin Li 		if (start_l >= range_l) {
694*b095b053SXin Li 			start_l = 0;
695*b095b053SXin Li 			if (++k == range_k) {
696*b095b053SXin Li 				k = 0;
697*b095b053SXin Li 				if (++j == range_j.value) {
698*b095b053SXin Li 					j = 0;
699*b095b053SXin Li 					i += 1;
700*b095b053SXin Li 				}
701*b095b053SXin Li 			}
702*b095b053SXin Li 		}
703*b095b053SXin Li 	}
704*b095b053SXin Li 
705*b095b053SXin Li 	/* There still may be other threads with work */
706*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
707*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
708*b095b053SXin Li 		tid != thread_number;
709*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
710*b095b053SXin Li 	{
711*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
712*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
713*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
714*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
715*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
716*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
717*b095b053SXin Li 			const size_t start_l = tile_index_k_l.remainder * tile_l;
718*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
719*b095b053SXin Li 		}
720*b095b053SXin Li 	}
721*b095b053SXin Li 
722*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
723*b095b053SXin Li 	pthreadpool_fence_release();
724*b095b053SXin Li }
725*b095b053SXin Li 
pthreadpool_thread_parallelize_4d_tile_2d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)726*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_fastpath(
727*b095b053SXin Li 	struct pthreadpool* threadpool,
728*b095b053SXin Li 	struct thread_info* thread)
729*b095b053SXin Li {
730*b095b053SXin Li 	assert(threadpool != NULL);
731*b095b053SXin Li 	assert(thread != NULL);
732*b095b053SXin Li 
733*b095b053SXin Li 	const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
734*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
735*b095b053SXin Li 
736*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
737*b095b053SXin Li 	const size_t range_threshold = -threads_count;
738*b095b053SXin Li 
739*b095b053SXin Li 	/* Process thread's own range of items */
740*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
741*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
742*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
743*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
744*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
745*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
746*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
747*b095b053SXin Li 	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
748*b095b053SXin Li 	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
749*b095b053SXin Li 	size_t i = index_i_j.quotient;
750*b095b053SXin Li 	size_t j = index_i_j.remainder;
751*b095b053SXin Li 	size_t start_k = tile_index_k_l.quotient * tile_k;
752*b095b053SXin Li 	size_t start_l = tile_index_k_l.remainder * tile_l;
753*b095b053SXin Li 
754*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
755*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
756*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
757*b095b053SXin Li 		task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
758*b095b053SXin Li 		start_l += tile_l;
759*b095b053SXin Li 		if (start_l >= range_l) {
760*b095b053SXin Li 			start_l = 0;
761*b095b053SXin Li 			start_k += tile_k;
762*b095b053SXin Li 			if (start_k >= range_k) {
763*b095b053SXin Li 				start_k = 0;
764*b095b053SXin Li 				if (++j == range_j.value) {
765*b095b053SXin Li 					j = 0;
766*b095b053SXin Li 					i += 1;
767*b095b053SXin Li 				}
768*b095b053SXin Li 			}
769*b095b053SXin Li 		}
770*b095b053SXin Li 	}
771*b095b053SXin Li 
772*b095b053SXin Li 	/* There still may be other threads with work */
773*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
774*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
775*b095b053SXin Li 		tid != thread_number;
776*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
777*b095b053SXin Li 	{
778*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
779*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
780*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
781*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
782*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
783*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
784*b095b053SXin Li 			const size_t start_k = tile_index_k_l.quotient * tile_k;
785*b095b053SXin Li 			const size_t start_l = tile_index_k_l.remainder * tile_l;
786*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
787*b095b053SXin Li 		}
788*b095b053SXin Li 	}
789*b095b053SXin Li 
790*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
791*b095b053SXin Li 	pthreadpool_fence_release();
792*b095b053SXin Li }
793*b095b053SXin Li 
pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)794*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath(
795*b095b053SXin Li 	struct pthreadpool* threadpool,
796*b095b053SXin Li 	struct thread_info* thread)
797*b095b053SXin Li {
798*b095b053SXin Li 	assert(threadpool != NULL);
799*b095b053SXin Li 	assert(thread != NULL);
800*b095b053SXin Li 
801*b095b053SXin Li 	const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
802*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
803*b095b053SXin Li 
804*b095b053SXin Li 	const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
805*b095b053SXin Li 	uint32_t uarch_index = default_uarch_index;
806*b095b053SXin Li 	#if PTHREADPOOL_USE_CPUINFO
807*b095b053SXin Li 		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
808*b095b053SXin Li 		if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
809*b095b053SXin Li 			uarch_index = default_uarch_index;
810*b095b053SXin Li 		}
811*b095b053SXin Li 	#endif
812*b095b053SXin Li 
813*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
814*b095b053SXin Li 	const size_t range_threshold = -threads_count;
815*b095b053SXin Li 
816*b095b053SXin Li 	/* Process thread's own range of items */
817*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
818*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
819*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
820*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
821*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
822*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
823*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
824*b095b053SXin Li 	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
825*b095b053SXin Li 	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
826*b095b053SXin Li 	size_t i = index_i_j.quotient;
827*b095b053SXin Li 	size_t j = index_i_j.remainder;
828*b095b053SXin Li 	size_t start_k = tile_index_k_l.quotient * tile_k;
829*b095b053SXin Li 	size_t start_l = tile_index_k_l.remainder * tile_l;
830*b095b053SXin Li 
831*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
832*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
833*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
834*b095b053SXin Li 		task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
835*b095b053SXin Li 		start_l += tile_l;
836*b095b053SXin Li 		if (start_l >= range_l) {
837*b095b053SXin Li 			start_l = 0;
838*b095b053SXin Li 			start_k += tile_k;
839*b095b053SXin Li 			if (start_k >= range_k) {
840*b095b053SXin Li 				start_k = 0;
841*b095b053SXin Li 				if (++j == range_j.value) {
842*b095b053SXin Li 					j = 0;
843*b095b053SXin Li 					i += 1;
844*b095b053SXin Li 				}
845*b095b053SXin Li 			}
846*b095b053SXin Li 		}
847*b095b053SXin Li 	}
848*b095b053SXin Li 
849*b095b053SXin Li 	/* There still may be other threads with work */
850*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
851*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
852*b095b053SXin Li 		tid != thread_number;
853*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
854*b095b053SXin Li 	{
855*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
856*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
857*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
858*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
859*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
860*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
861*b095b053SXin Li 			const size_t start_k = tile_index_k_l.quotient * tile_k;
862*b095b053SXin Li 			const size_t start_l = tile_index_k_l.remainder * tile_l;
863*b095b053SXin Li 			task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
864*b095b053SXin Li 		}
865*b095b053SXin Li 	}
866*b095b053SXin Li 
867*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
868*b095b053SXin Li 	pthreadpool_fence_release();
869*b095b053SXin Li }
870*b095b053SXin Li 
pthreadpool_thread_parallelize_5d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)871*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_fastpath(
872*b095b053SXin Li 	struct pthreadpool* threadpool,
873*b095b053SXin Li 	struct thread_info* thread)
874*b095b053SXin Li {
875*b095b053SXin Li 	assert(threadpool != NULL);
876*b095b053SXin Li 	assert(thread != NULL);
877*b095b053SXin Li 
878*b095b053SXin Li 	const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
879*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
880*b095b053SXin Li 
881*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
882*b095b053SXin Li 	const size_t range_threshold = -threads_count;
883*b095b053SXin Li 
884*b095b053SXin Li 	/* Process thread's own range of items */
885*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
886*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm;
887*b095b053SXin Li 	const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm);
888*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k;
889*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
890*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m;
891*b095b053SXin Li 	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
892*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j;
893*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
894*b095b053SXin Li 	size_t i = index_i_j.quotient;
895*b095b053SXin Li 	size_t j = index_i_j.remainder;
896*b095b053SXin Li 	size_t k = index_ij_k.remainder;
897*b095b053SXin Li 	size_t l = index_l_m.quotient;
898*b095b053SXin Li 	size_t m = index_l_m.remainder;
899*b095b053SXin Li 
900*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_5d.range_l;
901*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
902*b095b053SXin Li 		task(argument, i, j, k, l, m);
903*b095b053SXin Li 		if (++m == range_m.value) {
904*b095b053SXin Li 			m = 0;
905*b095b053SXin Li 			if (++l == range_l) {
906*b095b053SXin Li 				l = 0;
907*b095b053SXin Li 				if (++k == range_k.value) {
908*b095b053SXin Li 					k = 0;
909*b095b053SXin Li 					if (++j == range_j.value) {
910*b095b053SXin Li 						j = 0;
911*b095b053SXin Li 						i += 1;
912*b095b053SXin Li 					}
913*b095b053SXin Li 				}
914*b095b053SXin Li 			}
915*b095b053SXin Li 		}
916*b095b053SXin Li 	}
917*b095b053SXin Li 
918*b095b053SXin Li 	/* There still may be other threads with work */
919*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
920*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
921*b095b053SXin Li 		tid != thread_number;
922*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
923*b095b053SXin Li 	{
924*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
925*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
926*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
927*b095b053SXin Li 			const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm);
928*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
929*b095b053SXin Li 			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
930*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
931*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
932*b095b053SXin Li 		}
933*b095b053SXin Li 	}
934*b095b053SXin Li 
935*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
936*b095b053SXin Li 	pthreadpool_fence_release();
937*b095b053SXin Li }
938*b095b053SXin Li 
pthreadpool_thread_parallelize_5d_tile_1d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)939*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_1d_fastpath(
940*b095b053SXin Li 	struct pthreadpool* threadpool,
941*b095b053SXin Li 	struct thread_info* thread)
942*b095b053SXin Li {
943*b095b053SXin Li 	assert(threadpool != NULL);
944*b095b053SXin Li 	assert(thread != NULL);
945*b095b053SXin Li 
946*b095b053SXin Li 	const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
947*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
948*b095b053SXin Li 
949*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
950*b095b053SXin Li 	const size_t range_threshold = -threads_count;
951*b095b053SXin Li 
952*b095b053SXin Li 	/* Process thread's own range of items */
953*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
954*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m;
955*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m);
956*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl;
957*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
958*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j;
959*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
960*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l;
961*b095b053SXin Li 	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
962*b095b053SXin Li 	const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
963*b095b053SXin Li 	size_t i = index_i_j.quotient;
964*b095b053SXin Li 	size_t j = index_i_j.remainder;
965*b095b053SXin Li 	size_t k = index_k_l.quotient;
966*b095b053SXin Li 	size_t l = index_k_l.remainder;
967*b095b053SXin Li 	size_t start_m = tile_index_ijkl_m.remainder * tile_m;
968*b095b053SXin Li 
969*b095b053SXin Li 	const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
970*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
971*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
972*b095b053SXin Li 		task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
973*b095b053SXin Li 		start_m += tile_m;
974*b095b053SXin Li 		if (start_m >= range_m) {
975*b095b053SXin Li 			start_m = 0;
976*b095b053SXin Li 			if (++l == range_l.value) {
977*b095b053SXin Li 				l = 0;
978*b095b053SXin Li 				if (++k == range_k) {
979*b095b053SXin Li 					k = 0;
980*b095b053SXin Li 					if (++j == range_j.value) {
981*b095b053SXin Li 						j = 0;
982*b095b053SXin Li 						i += 1;
983*b095b053SXin Li 					}
984*b095b053SXin Li 				}
985*b095b053SXin Li 			}
986*b095b053SXin Li 		}
987*b095b053SXin Li 	}
988*b095b053SXin Li 
989*b095b053SXin Li 	/* There still may be other threads with work */
990*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
991*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
992*b095b053SXin Li 		tid != thread_number;
993*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
994*b095b053SXin Li 	{
995*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
996*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
997*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
998*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m);
999*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
1000*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
1001*b095b053SXin Li 			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
1002*b095b053SXin Li 			size_t start_m = tile_index_ijkl_m.remainder * tile_m;
1003*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m,
1004*b095b053SXin Li 				min(range_m - start_m, tile_m));
1005*b095b053SXin Li 		}
1006*b095b053SXin Li 	}
1007*b095b053SXin Li 
1008*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
1009*b095b053SXin Li 	pthreadpool_fence_release();
1010*b095b053SXin Li }
1011*b095b053SXin Li 
pthreadpool_thread_parallelize_5d_tile_2d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)1012*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_2d_fastpath(
1013*b095b053SXin Li 	struct pthreadpool* threadpool,
1014*b095b053SXin Li 	struct thread_info* thread)
1015*b095b053SXin Li {
1016*b095b053SXin Li 	assert(threadpool != NULL);
1017*b095b053SXin Li 	assert(thread != NULL);
1018*b095b053SXin Li 
1019*b095b053SXin Li 	const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
1020*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
1021*b095b053SXin Li 
1022*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
1023*b095b053SXin Li 	const size_t range_threshold = -threads_count;
1024*b095b053SXin Li 
1025*b095b053SXin Li 	/* Process thread's own range of items */
1026*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
1027*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
1028*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm);
1029*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k;
1030*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
1031*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m;
1032*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
1033*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j;
1034*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1035*b095b053SXin Li 	const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
1036*b095b053SXin Li 	const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
1037*b095b053SXin Li 	size_t i = index_i_j.quotient;
1038*b095b053SXin Li 	size_t j = index_i_j.remainder;
1039*b095b053SXin Li 	size_t k = index_ij_k.remainder;
1040*b095b053SXin Li 	size_t start_l = tile_index_l_m.quotient * tile_l;
1041*b095b053SXin Li 	size_t start_m = tile_index_l_m.remainder * tile_m;
1042*b095b053SXin Li 
1043*b095b053SXin Li 	const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
1044*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
1045*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
1046*b095b053SXin Li 		task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
1047*b095b053SXin Li 		start_m += tile_m;
1048*b095b053SXin Li 		if (start_m >= range_m) {
1049*b095b053SXin Li 			start_m = 0;
1050*b095b053SXin Li 			start_l += tile_l;
1051*b095b053SXin Li 			if (start_l >= range_l) {
1052*b095b053SXin Li 				start_l = 0;
1053*b095b053SXin Li 				if (++k == range_k.value) {
1054*b095b053SXin Li 					k = 0;
1055*b095b053SXin Li 					if (++j == range_j.value) {
1056*b095b053SXin Li 						j = 0;
1057*b095b053SXin Li 						i += 1;
1058*b095b053SXin Li 					}
1059*b095b053SXin Li 				}
1060*b095b053SXin Li 			}
1061*b095b053SXin Li 		}
1062*b095b053SXin Li 	}
1063*b095b053SXin Li 
1064*b095b053SXin Li 	/* There still may be other threads with work */
1065*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
1066*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
1067*b095b053SXin Li 		tid != thread_number;
1068*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
1069*b095b053SXin Li 	{
1070*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
1071*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
1072*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
1073*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm);
1074*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
1075*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
1076*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1077*b095b053SXin Li 			const size_t start_l = tile_index_l_m.quotient * tile_l;
1078*b095b053SXin Li 			const size_t start_m = tile_index_l_m.remainder * tile_m;
1079*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder,
1080*b095b053SXin Li 				start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
1081*b095b053SXin Li 		}
1082*b095b053SXin Li 	}
1083*b095b053SXin Li 
1084*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
1085*b095b053SXin Li 	pthreadpool_fence_release();
1086*b095b053SXin Li }
1087*b095b053SXin Li 
pthreadpool_thread_parallelize_6d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)1088*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_fastpath(
1089*b095b053SXin Li 	struct pthreadpool* threadpool,
1090*b095b053SXin Li 	struct thread_info* thread)
1091*b095b053SXin Li {
1092*b095b053SXin Li 	assert(threadpool != NULL);
1093*b095b053SXin Li 	assert(thread != NULL);
1094*b095b053SXin Li 
1095*b095b053SXin Li 	const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
1096*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
1097*b095b053SXin Li 
1098*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
1099*b095b053SXin Li 	const size_t range_threshold = -threads_count;
1100*b095b053SXin Li 
1101*b095b053SXin Li 	/* Process thread's own range of items */
1102*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
1103*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn;
1104*b095b053SXin Li 	const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn);
1105*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k;
1106*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
1107*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n;
1108*b095b053SXin Li 	const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
1109*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j;
1110*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1111*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m;
1112*b095b053SXin Li 	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
1113*b095b053SXin Li 	size_t i = index_i_j.quotient;
1114*b095b053SXin Li 	size_t j = index_i_j.remainder;
1115*b095b053SXin Li 	size_t k = index_ij_k.remainder;
1116*b095b053SXin Li 	size_t l = index_l_m.quotient;
1117*b095b053SXin Li 	size_t m = index_l_m.remainder;
1118*b095b053SXin Li 	size_t n = index_lm_n.remainder;
1119*b095b053SXin Li 
1120*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_6d.range_l;
1121*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
1122*b095b053SXin Li 		task(argument, i, j, k, l, m, n);
1123*b095b053SXin Li 		if (++n == range_n.value) {
1124*b095b053SXin Li 			n = 0;
1125*b095b053SXin Li 			if (++m == range_m.value) {
1126*b095b053SXin Li 				m = 0;
1127*b095b053SXin Li 				if (++l == range_l) {
1128*b095b053SXin Li 					l = 0;
1129*b095b053SXin Li 					if (++k == range_k.value) {
1130*b095b053SXin Li 						k = 0;
1131*b095b053SXin Li 						if (++j == range_j.value) {
1132*b095b053SXin Li 							j = 0;
1133*b095b053SXin Li 							i += 1;
1134*b095b053SXin Li 						}
1135*b095b053SXin Li 					}
1136*b095b053SXin Li 				}
1137*b095b053SXin Li 			}
1138*b095b053SXin Li 		}
1139*b095b053SXin Li 	}
1140*b095b053SXin Li 
1141*b095b053SXin Li 
1142*b095b053SXin Li 	/* There still may be other threads with work */
1143*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
1144*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
1145*b095b053SXin Li 		tid != thread_number;
1146*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
1147*b095b053SXin Li 	{
1148*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
1149*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
1150*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
1151*b095b053SXin Li 			const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn);
1152*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
1153*b095b053SXin Li 			const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
1154*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1155*b095b053SXin Li 			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
1156*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder);
1157*b095b053SXin Li 		}
1158*b095b053SXin Li 	}
1159*b095b053SXin Li 
1160*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
1161*b095b053SXin Li 	pthreadpool_fence_release();
1162*b095b053SXin Li }
1163*b095b053SXin Li 
pthreadpool_thread_parallelize_6d_tile_1d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)1164*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_1d_fastpath(
1165*b095b053SXin Li 	struct pthreadpool* threadpool,
1166*b095b053SXin Li 	struct thread_info* thread)
1167*b095b053SXin Li {
1168*b095b053SXin Li 	assert(threadpool != NULL);
1169*b095b053SXin Li 	assert(thread != NULL);
1170*b095b053SXin Li 
1171*b095b053SXin Li 	const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
1172*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
1173*b095b053SXin Li 
1174*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
1175*b095b053SXin Li 	const size_t range_threshold = -threads_count;
1176*b095b053SXin Li 
1177*b095b053SXin Li 	/* Process thread's own range of items */
1178*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
1179*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
1180*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn);
1181*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k;
1182*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
1183*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n;
1184*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
1185*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j;
1186*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1187*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m;
1188*b095b053SXin Li 	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
1189*b095b053SXin Li 	const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
1190*b095b053SXin Li 	size_t i = index_i_j.quotient;
1191*b095b053SXin Li 	size_t j = index_i_j.remainder;
1192*b095b053SXin Li 	size_t k = index_ij_k.remainder;
1193*b095b053SXin Li 	size_t l = index_l_m.quotient;
1194*b095b053SXin Li 	size_t m = index_l_m.remainder;
1195*b095b053SXin Li 	size_t start_n = tile_index_lm_n.remainder * tile_n;
1196*b095b053SXin Li 
1197*b095b053SXin Li 	const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
1198*b095b053SXin Li 	const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
1199*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
1200*b095b053SXin Li 		task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
1201*b095b053SXin Li 		start_n += tile_n;
1202*b095b053SXin Li 		if (start_n >= range_n) {
1203*b095b053SXin Li 			start_n = 0;
1204*b095b053SXin Li 			if (++m == range_m.value) {
1205*b095b053SXin Li 				m = 0;
1206*b095b053SXin Li 				if (++l == range_l) {
1207*b095b053SXin Li 					l = 0;
1208*b095b053SXin Li 					if (++k == range_k.value) {
1209*b095b053SXin Li 						k = 0;
1210*b095b053SXin Li 						if (++j == range_j.value) {
1211*b095b053SXin Li 							j = 0;
1212*b095b053SXin Li 							i += 1;
1213*b095b053SXin Li 						}
1214*b095b053SXin Li 					}
1215*b095b053SXin Li 				}
1216*b095b053SXin Li 			}
1217*b095b053SXin Li 		}
1218*b095b053SXin Li 	}
1219*b095b053SXin Li 
1220*b095b053SXin Li 
1221*b095b053SXin Li 	/* There still may be other threads with work */
1222*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
1223*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
1224*b095b053SXin Li 		tid != thread_number;
1225*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
1226*b095b053SXin Li 	{
1227*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
1228*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
1229*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
1230*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn);
1231*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
1232*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
1233*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
1234*b095b053SXin Li 			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
1235*b095b053SXin Li 			const size_t start_n = tile_index_lm_n.remainder * tile_n;
1236*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
1237*b095b053SXin Li 				start_n, min(range_n - start_n, tile_n));
1238*b095b053SXin Li 		}
1239*b095b053SXin Li 	}
1240*b095b053SXin Li 
1241*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
1242*b095b053SXin Li 	pthreadpool_fence_release();
1243*b095b053SXin Li }
1244*b095b053SXin Li 
pthreadpool_thread_parallelize_6d_tile_2d_fastpath(struct pthreadpool * threadpool,struct thread_info * thread)1245*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_2d_fastpath(
1246*b095b053SXin Li 	struct pthreadpool* threadpool,
1247*b095b053SXin Li 	struct thread_info* thread)
1248*b095b053SXin Li {
1249*b095b053SXin Li 	assert(threadpool != NULL);
1250*b095b053SXin Li 	assert(thread != NULL);
1251*b095b053SXin Li 
1252*b095b053SXin Li 	const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
1253*b095b053SXin Li 	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
1254*b095b053SXin Li 
1255*b095b053SXin Li 	const size_t threads_count = threadpool->threads_count.value;
1256*b095b053SXin Li 	const size_t range_threshold = -threads_count;
1257*b095b053SXin Li 
1258*b095b053SXin Li 	/* Process thread's own range of items */
1259*b095b053SXin Li 	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
1260*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn;
1261*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn);
1262*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl;
1263*b095b053SXin Li 	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
1264*b095b053SXin Li 	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n;
1265*b095b053SXin Li 	const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
1266*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j;
1267*b095b053SXin Li 	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
1268*b095b053SXin Li 	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l;
1269*b095b053SXin Li 	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
1270*b095b053SXin Li 	const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m;
1271*b095b053SXin Li 	const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n;
1272*b095b053SXin Li 	size_t i = index_i_j.quotient;
1273*b095b053SXin Li 	size_t j = index_i_j.remainder;
1274*b095b053SXin Li 	size_t k = index_k_l.quotient;
1275*b095b053SXin Li 	size_t l = index_k_l.remainder;
1276*b095b053SXin Li 	size_t start_m = tile_index_m_n.quotient * tile_m;
1277*b095b053SXin Li 	size_t start_n = tile_index_m_n.remainder * tile_n;
1278*b095b053SXin Li 
1279*b095b053SXin Li 	const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n;
1280*b095b053SXin Li 	const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m;
1281*b095b053SXin Li 	const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k;
1282*b095b053SXin Li 	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
1283*b095b053SXin Li 		task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
1284*b095b053SXin Li 		start_n += tile_n;
1285*b095b053SXin Li 		if (start_n >= range_n) {
1286*b095b053SXin Li 			start_n = 0;
1287*b095b053SXin Li 			start_m += tile_m;
1288*b095b053SXin Li 			if (start_m >= range_m) {
1289*b095b053SXin Li 				start_m = 0;
1290*b095b053SXin Li 				if (++l == range_l.value) {
1291*b095b053SXin Li 					l = 0;
1292*b095b053SXin Li 					if (++k == range_k) {
1293*b095b053SXin Li 						k = 0;
1294*b095b053SXin Li 						if (++j == range_j.value) {
1295*b095b053SXin Li 							j = 0;
1296*b095b053SXin Li 							i += 1;
1297*b095b053SXin Li 						}
1298*b095b053SXin Li 					}
1299*b095b053SXin Li 				}
1300*b095b053SXin Li 			}
1301*b095b053SXin Li 		}
1302*b095b053SXin Li 	}
1303*b095b053SXin Li 
1304*b095b053SXin Li 	/* There still may be other threads with work */
1305*b095b053SXin Li 	const size_t thread_number = thread->thread_number;
1306*b095b053SXin Li 	for (size_t tid = modulo_decrement(thread_number, threads_count);
1307*b095b053SXin Li 		tid != thread_number;
1308*b095b053SXin Li 		tid = modulo_decrement(tid, threads_count))
1309*b095b053SXin Li 	{
1310*b095b053SXin Li 		struct thread_info* other_thread = &threadpool->threads[tid];
1311*b095b053SXin Li 		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
1312*b095b053SXin Li 			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
1313*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn);
1314*b095b053SXin Li 			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
1315*b095b053SXin Li 			const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
1316*b095b053SXin Li 			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
1317*b095b053SXin Li 			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
1318*b095b053SXin Li 			const size_t start_m = tile_index_m_n.quotient * tile_m;
1319*b095b053SXin Li 			const size_t start_n = tile_index_m_n.remainder * tile_n;
1320*b095b053SXin Li 			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder,
1321*b095b053SXin Li 				start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
1322*b095b053SXin Li 		}
1323*b095b053SXin Li 	}
1324*b095b053SXin Li 
1325*b095b053SXin Li 	/* Make changes by this thread visible to other threads */
1326*b095b053SXin Li 	pthreadpool_fence_release();
1327*b095b053SXin Li }
1328