1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <sys/types.h>
22 #include <sys/stat.h>
23
24 #include <algorithm>
25 #include <numeric>
26 #include <vector>
27
28 #include "procs.h"
29
30 namespace {
31 const char *barrier_kernel_code = R"(
32 __kernel void compute_sum(__global int *a, int n, __global int *tmp_sum,
33 __global int *sum)
34 {
35 int tid = get_local_id(0);
36 int lsize = get_local_size(0);
37 int i;
38
39 tmp_sum[tid] = 0;
40 for (i = tid; i < n; i += lsize) tmp_sum[tid] += a[i];
41
42 // updated to work for any workgroup size
43 for (i = hadd(lsize, 1); lsize > 1; i = hadd(i, 1))
44 {
45 BARRIER(CLK_GLOBAL_MEM_FENCE);
46 if (tid + i < lsize) tmp_sum[tid] += tmp_sum[tid + i];
47 lsize = i;
48 }
49
50 // no barrier is required here because last person to write to tmp_sum[0]
51 // was tid 0
52 if (tid == 0) *sum = tmp_sum[0];
53 }
54 )";
55
56
generate_random_inputs(std::vector<cl_int> & v)57 void generate_random_inputs(std::vector<cl_int> &v)
58 {
59 RandomSeed seed(gRandomSeed);
60
61 auto random_generator = [&seed]() {
62 return static_cast<cl_int>(
63 get_random_float(-0x01000000, 0x01000000, seed));
64 };
65
66 std::generate(v.begin(), v.end(), random_generator);
67 }
68
test_barrier_common(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements,std::string barrier_str)69 int test_barrier_common(cl_device_id device, cl_context context,
70 cl_command_queue queue, int num_elements,
71 std::string barrier_str)
72 {
73 clMemWrapper streams[3];
74 clProgramWrapper program;
75 clKernelWrapper kernel;
76
77 cl_int output;
78 int err;
79
80 size_t max_threadgroup_size = 0;
81 std::string build_options = std::string("-DBARRIER=") + barrier_str;
82 err = create_single_kernel_helper(context, &program, &kernel, 1,
83 &barrier_kernel_code, "compute_sum",
84 build_options.c_str());
85 test_error(err, "Failed to build kernel/program.");
86
87 err = get_max_allowed_1d_work_group_size_on_device(device, kernel,
88 &max_threadgroup_size);
89 test_error(err, "get_max_allowed_1d_work_group_size_on_device failed.");
90
91 // work group size must divide evenly into the global size
92 while (num_elements % max_threadgroup_size) max_threadgroup_size--;
93
94 std::vector<cl_int> input(num_elements);
95
96 streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
97 sizeof(cl_int) * num_elements, nullptr, &err);
98 test_error(err, "clCreateBuffer failed.");
99 streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int),
100 nullptr, &err);
101 test_error(err, "clCreateBuffer failed.");
102 streams[2] =
103 clCreateBuffer(context, CL_MEM_READ_WRITE,
104 sizeof(cl_int) * max_threadgroup_size, nullptr, &err);
105 test_error(err, "clCreateBuffer failed.");
106
107 generate_random_inputs(input);
108
109 err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
110 sizeof(cl_int) * num_elements, input.data(), 0,
111 nullptr, nullptr);
112 test_error(err, "clEnqueueWriteBuffer failed.");
113
114 err = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
115 err |= clSetKernelArg(kernel, 1, sizeof(num_elements), &num_elements);
116 err |= clSetKernelArg(kernel, 2, sizeof(streams[2]), &streams[2]);
117 err |= clSetKernelArg(kernel, 3, sizeof(streams[1]), &streams[1]);
118 test_error(err, "clSetKernelArg failed.");
119
120 size_t global_threads[] = { max_threadgroup_size };
121 size_t local_threads[] = { max_threadgroup_size };
122
123 err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, global_threads,
124 local_threads, 0, nullptr, nullptr);
125 test_error(err, "clEnqueueNDRangeKernel failed.");
126
127 err = clEnqueueReadBuffer(queue, streams[1], true, 0, sizeof(cl_int),
128 &output, 0, nullptr, nullptr);
129 test_error(err, "clEnqueueReadBuffer failed.");
130
131 if (std::accumulate(input.begin(), input.end(), 0) != output)
132 {
133 log_error("%s test failed\n", barrier_str.c_str());
134 err = -1;
135 }
136 else
137 {
138 log_info("%s test passed\n", barrier_str.c_str());
139 }
140
141 return err;
142 }
143 }
144
test_barrier(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)145 int test_barrier(cl_device_id device, cl_context context,
146 cl_command_queue queue, int num_elements)
147 {
148 return test_barrier_common(device, context, queue, num_elements, "barrier");
149 }
150
test_wg_barrier(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)151 int test_wg_barrier(cl_device_id device, cl_context context,
152 cl_command_queue queue, int num_elements)
153 {
154 return test_barrier_common(device, context, queue, num_elements,
155 "work_group_barrier");
156 }
157