1 /*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 #include <tuple>
10
11 #include <executorch/extension/parallel/thread_parallel.h>
12 #include <executorch/extension/threadpool/threadpool.h>
13 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
14 #include <executorch/runtime/platform/assert.h>
15
16 namespace executorch {
17 namespace extension {
18
namespace {
// Per-thread task index. Each thread starts at 0; the value is written by
// set_thread_num() and read back by get_thread_num().
thread_local int64_t thread_num_{0};
} // namespace
22
23 using namespace ::executorch::extension::threadpool;
24
// Divides x by y, rounding the quotient up. For x >= 0 and y > 0 the result
// equals ceil(x / y). y must be non-zero.
inline int64_t divup(int64_t x, int64_t y) {
  // Bias the numerator by (y - 1) so that truncating division rounds up.
  const int64_t biased_numerator = x + y - 1;
  return biased_numerator / y;
}
28
get_thread_num()29 int64_t get_thread_num() {
30 return thread_num_;
31 }
32
set_thread_num(int64_t thread_num)33 void set_thread_num(int64_t thread_num) {
34 thread_num_ = thread_num;
35 }
36
37 inline std::tuple<int64_t, int64_t>
calc_num_tasks_and_chunk_size(int64_t begin,int64_t end,int64_t grain_size)38 calc_num_tasks_and_chunk_size(int64_t begin, int64_t end, int64_t grain_size) {
39 if ((end - begin) < grain_size) {
40 return std::make_tuple(1, std::max((int64_t)0, end - begin));
41 }
42 // Choose number of tasks based on grain size and number of threads.
43 int64_t chunk_size =
44 divup((end - begin), get_threadpool()->get_thread_count());
45 // Make sure each task is at least grain_size size.
46 chunk_size = std::max(grain_size, chunk_size);
47 int64_t num_tasks = divup((end - begin), chunk_size);
48 return std::make_tuple(num_tasks, chunk_size);
49 }
50
parallel_for(const int64_t begin,const int64_t end,const int64_t grain_size,const std::function<void (int64_t,int64_t)> & f)51 bool parallel_for(
52 const int64_t begin,
53 const int64_t end,
54 const int64_t grain_size,
55 const std::function<void(int64_t, int64_t)>& f) {
56 ET_LOG_AND_RETURN_IF_FALSE(begin >= 0 && end >= 0);
57 ET_LOG_AND_RETURN_IF_FALSE(end >= begin);
58 ET_LOG_AND_RETURN_IF_FALSE(grain_size > 0);
59 int64_t num_tasks = 0, chunk_size = 0;
60 std::tie(num_tasks, chunk_size) =
61 calc_num_tasks_and_chunk_size(begin, end, grain_size);
62
63 auto task = [f, begin, end, chunk_size](size_t task_id) {
64 set_thread_num(task_id);
65 int64_t local_start = begin + static_cast<int64_t>(task_id) * chunk_size;
66 if (local_start < end) {
67 int64_t local_end = std::min(end, (int64_t)(chunk_size + local_start));
68 f(local_start, local_end);
69 }
70 };
71
72 // Per protocol from threadpool (pthreadpool), when this returns, all tasks
73 // are executed, so this is synchronous.
74 get_threadpool()->run(task, num_tasks);
75 return true;
76 }
77
78 } // namespace extension
79 } // namespace executorch
80