1 //
2 // Copyright (c) 2021 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include <algorithm>
18 #include <limits>
19 #include <numeric>
20 #include <string>
21 #include <vector>
22
23 #include "procs.h"
24 #include "harness/integer_ops_test_info.h"
25 #include "harness/testHarness.h"
26
27 template <size_t N, typename DstType, typename SrcTypeA, typename SrcTypeB>
28 static void
calculate_reference(std::vector<DstType> & ref,const std::vector<SrcTypeA> & a,const std::vector<SrcTypeB> & b,const bool AccSat=false,const std::vector<DstType> & acc={})29 calculate_reference(std::vector<DstType>& ref, const std::vector<SrcTypeA>& a,
30 const std::vector<SrcTypeB>& b, const bool AccSat = false,
31 const std::vector<DstType>& acc = {})
32 {
33 assert(a.size() == b.size());
34 assert(AccSat == false || acc.size() == a.size() / N);
35
36 ref.resize(a.size() / N);
37 for (size_t r = 0; r < ref.size(); r++)
38 {
39 cl_long result = AccSat ? acc[r] : 0;
40 for (size_t c = 0; c < N; c++)
41 {
42 // OK to assume no overflow?
43 result += a[r * N + c] * b[r * N + c];
44 }
45 if (AccSat && result > std::numeric_limits<DstType>::max())
46 {
47 result = std::numeric_limits<DstType>::max();
48 }
49 ref[r] = static_cast<DstType>(result);
50 }
51 }
52
53 template <typename SrcTypeA, typename SrcTypeB>
generate_inputs_with_special_values(std::vector<SrcTypeA> & a,std::vector<SrcTypeB> & b)54 void generate_inputs_with_special_values(std::vector<SrcTypeA>& a,
55 std::vector<SrcTypeB>& b)
56 {
57 const std::vector<SrcTypeA> specialValuesA(
58 { static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min()),
59 static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min() + 1),
60 static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min() / 2), 0,
61 static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max() / 2),
62 static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max() - 1),
63 static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max()) });
64 const std::vector<SrcTypeB> specialValuesB(
65 { static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min()),
66 static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min() + 1),
67 static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min() / 2), 0,
68 static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max() / 2),
69 static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max() - 1),
70 static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max()) });
71
72 size_t count = 0;
73 for (auto svA : specialValuesA)
74 {
75 for (auto svB : specialValuesB)
76 {
77 a[count] = svA;
78 b[count] = svB;
79 ++count;
80 }
81 }
82
83 // Generate random data for the rest of the inputs:
84 MTdataHolder d(gRandomSeed);
85 generate_random_data(TestInfo<SrcTypeA>::explicitType, a.size() - count, d,
86 a.data() + count);
87 generate_random_data(TestInfo<SrcTypeB>::explicitType, b.size() - count, d,
88 b.data() + count);
89 }
90
91 template <typename SrcType>
generate_acc_sat_inputs(std::vector<SrcType> & acc)92 void generate_acc_sat_inputs(std::vector<SrcType>& acc)
93 {
94 // First generate random data:
95 fill_vector_with_random_data(acc);
96
97 // Now go through the generated data, and make every other element large.
98 // This ensures we have some elements that need saturation.
99 for (size_t i = 0; i < acc.size(); i += 2)
100 {
101 acc[i] = std::numeric_limits<SrcType>::max() - acc[i];
102 }
103 }
104
105 template <typename T> struct PackedTestInfo
106 {
107 static constexpr const char* deviceTypeName = "UNSUPPORTED";
108 };
109 template <> struct PackedTestInfo<cl_char>
110 {
111 static constexpr const char* deviceTypeName = "int";
112 };
113 template <> struct PackedTestInfo<cl_uchar>
114 {
115 static constexpr const char* deviceTypeName = "uint";
116 };
117
118 static constexpr const char* kernel_source_dot = R"CLC(
119 __kernel void test_dot(__global DSTTYPE* dst, __global SRCTYPEA* a, __global SRCTYPEB* b)
120 {
121 int index = get_global_id(0);
122 dst[index] = DOT(a[index], b[index]);
123 }
124 )CLC";
125
126 static constexpr const char* kernel_source_dot_acc_sat = R"CLC(
127 __kernel void test_dot_acc_sat(
128 __global DSTTYPE* dst,
129 __global SRCTYPEA* a, __global SRCTYPEB* b, __global DSTTYPE* acc)
130 {
131 int index = get_global_id(0);
132 dst[index] = DOT_ACC_SAT(a[index], b[index], acc[index]);
133 }
134 )CLC";
135
136 template <typename DstType, typename SrcTypeA, typename SrcTypeB, size_t N>
test_case_dot(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,bool packed,bool sat)137 static int test_case_dot(cl_device_id deviceID, cl_context context,
138 cl_command_queue queue, int num_elements, bool packed,
139 bool sat)
140 {
141 log_info(" testing %s = dot%s%s(%s, %s)\n",
142 std::numeric_limits<DstType>::is_signed ? "signed" : "unsigned",
143 sat ? "_acc_sat" : "", packed ? "_packed" : "",
144 std::numeric_limits<SrcTypeA>::is_signed ? "signed" : "unsigned",
145 std::numeric_limits<SrcTypeB>::is_signed ? "signed" : "unsigned");
146
147 cl_int error = CL_SUCCESS;
148
149 clProgramWrapper program;
150 clKernelWrapper kernel;
151
152 std::string buildOptions;
153 buildOptions += " -DDSTTYPE=";
154 buildOptions += TestInfo<DstType>::deviceTypeName;
155 buildOptions += " -DSRCTYPEA=";
156 buildOptions += packed
157 ? PackedTestInfo<SrcTypeA>::deviceTypeName
158 : TestInfo<SrcTypeA>::deviceTypeName + std::to_string(N);
159 buildOptions += " -DSRCTYPEB=";
160 buildOptions += packed
161 ? PackedTestInfo<SrcTypeB>::deviceTypeName
162 : TestInfo<SrcTypeB>::deviceTypeName + std::to_string(N);
163 std::string packedSuffix;
164 packedSuffix += std::numeric_limits<SrcTypeA>::is_signed ? "s" : "u";
165 packedSuffix += std::numeric_limits<SrcTypeB>::is_signed ? "s" : "u";
166 packedSuffix += std::numeric_limits<DstType>::is_signed ? "_int" : "_uint";
167 if (sat)
168 {
169 buildOptions += packed
170 ? " -DDOT_ACC_SAT=dot_acc_sat_4x8packed_" + packedSuffix
171 : " -DDOT_ACC_SAT=dot_acc_sat";
172 }
173 else
174 {
175 buildOptions +=
176 packed ? " -DDOT=dot_4x8packed_" + packedSuffix : " -DDOT=dot";
177 }
178
179 std::vector<SrcTypeA> a(N * num_elements);
180 std::vector<SrcTypeB> b(N * num_elements);
181 generate_inputs_with_special_values(a, b);
182
183 std::vector<DstType> acc;
184 if (sat)
185 {
186 acc.resize(num_elements);
187 generate_acc_sat_inputs(acc);
188 }
189
190 std::vector<DstType> reference(num_elements);
191 calculate_reference<N>(reference, a, b, sat, acc);
192
193 const char* source = sat ? kernel_source_dot_acc_sat : kernel_source_dot;
194 const char* name = sat ? "test_dot_acc_sat" : "test_dot";
195 error = create_single_kernel_helper(context, &program, &kernel, 1, &source,
196 name, buildOptions.c_str());
197 test_error(error, "Unable to create test kernel");
198
199 clMemWrapper dst = clCreateBuffer(
200 context, 0, reference.size() * sizeof(DstType), NULL, &error);
201 test_error(error, "Unable to create output buffer");
202
203 clMemWrapper srcA =
204 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
205 a.size() * sizeof(SrcTypeA), a.data(), &error);
206 test_error(error, "Unable to create srcA buffer");
207
208 clMemWrapper srcB =
209 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
210 b.size() * sizeof(SrcTypeB), b.data(), &error);
211 test_error(error, "Unable to create srcB buffer");
212
213 clMemWrapper srcAcc;
214 if (sat)
215 {
216 srcAcc =
217 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
218 acc.size() * sizeof(DstType), acc.data(), &error);
219 test_error(error, "Unable to create acc buffer");
220 }
221
222 error = clSetKernelArg(kernel, 0, sizeof(dst), &dst);
223 test_error(error, "Unable to set output buffer kernel arg");
224
225 error = clSetKernelArg(kernel, 1, sizeof(srcA), &srcA);
226 test_error(error, "Unable to set srcA buffer kernel arg");
227
228 error = clSetKernelArg(kernel, 2, sizeof(srcB), &srcB);
229 test_error(error, "Unable to set srcB buffer kernel arg");
230
231 if (sat)
232 {
233 error = clSetKernelArg(kernel, 3, sizeof(srcAcc), &srcAcc);
234 test_error(error, "Unable to set acc buffer kernel arg");
235 }
236
237 size_t global_work_size[] = { reference.size() };
238 error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
239 NULL, 0, NULL, NULL);
240 test_error(error, "Unable to enqueue test kernel");
241
242 error = clFinish(queue);
243 test_error(error, "clFinish failed after test kernel");
244
245 std::vector<DstType> results(reference.size(), 99);
246 error = clEnqueueReadBuffer(queue, dst, CL_TRUE, 0,
247 results.size() * sizeof(DstType),
248 results.data(), 0, NULL, NULL);
249 test_error(error, "Unable to read data after test kernel");
250
251 if (results != reference)
252 {
253 log_error("Result buffer did not match reference buffer!\n");
254 return TEST_FAIL;
255 }
256
257 return TEST_PASS;
258 }
259
260 template <typename SrcType, typename DstType, size_t N>
test_vectype(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)261 static int test_vectype(cl_device_id deviceID, cl_context context,
262 cl_command_queue queue, int num_elements)
263 {
264 int result = TEST_PASS;
265
266 typedef typename std::make_signed<SrcType>::type SSrcType;
267 typedef typename std::make_signed<DstType>::type SDstType;
268
269 typedef typename std::make_unsigned<SrcType>::type USrcType;
270 typedef typename std::make_unsigned<DstType>::type UDstType;
271
272 // dot testing:
273 result |= test_case_dot<UDstType, USrcType, USrcType, N>(
274 deviceID, context, queue, num_elements, false, false);
275 result |= test_case_dot<SDstType, SSrcType, SSrcType, N>(
276 deviceID, context, queue, num_elements, false, false);
277 result |= test_case_dot<SDstType, USrcType, SSrcType, N>(
278 deviceID, context, queue, num_elements, false, false);
279 result |= test_case_dot<SDstType, SSrcType, USrcType, N>(
280 deviceID, context, queue, num_elements, false, false);
281
282 // dot_acc_sat testing:
283 result |= test_case_dot<UDstType, USrcType, USrcType, N>(
284 deviceID, context, queue, num_elements, false, true);
285 result |= test_case_dot<SDstType, SSrcType, SSrcType, N>(
286 deviceID, context, queue, num_elements, false, true);
287 result |= test_case_dot<SDstType, USrcType, SSrcType, N>(
288 deviceID, context, queue, num_elements, false, true);
289 result |= test_case_dot<SDstType, SSrcType, USrcType, N>(
290 deviceID, context, queue, num_elements, false, true);
291
292 return result;
293 }
294
295 template <typename SrcType, typename DstType, size_t N>
test_vectype_packed(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)296 static int test_vectype_packed(cl_device_id deviceID, cl_context context,
297 cl_command_queue queue, int num_elements)
298 {
299 int result = TEST_PASS;
300
301 typedef typename std::make_signed<SrcType>::type SSrcType;
302 typedef typename std::make_signed<DstType>::type SDstType;
303
304 typedef typename std::make_unsigned<SrcType>::type USrcType;
305 typedef typename std::make_unsigned<DstType>::type UDstType;
306
307 // packed dot testing:
308 result |= test_case_dot<UDstType, USrcType, USrcType, N>(
309 deviceID, context, queue, num_elements, true, false);
310 result |= test_case_dot<SDstType, SSrcType, SSrcType, N>(
311 deviceID, context, queue, num_elements, true, false);
312 result |= test_case_dot<SDstType, USrcType, SSrcType, N>(
313 deviceID, context, queue, num_elements, true, false);
314 result |= test_case_dot<SDstType, SSrcType, USrcType, N>(
315 deviceID, context, queue, num_elements, true, false);
316
317 // packed dot_acc_sat testing:
318 result |= test_case_dot<UDstType, USrcType, USrcType, N>(
319 deviceID, context, queue, num_elements, true, true);
320 result |= test_case_dot<SDstType, SSrcType, SSrcType, N>(
321 deviceID, context, queue, num_elements, true, true);
322 result |= test_case_dot<SDstType, USrcType, SSrcType, N>(
323 deviceID, context, queue, num_elements, true, true);
324 result |= test_case_dot<SDstType, SSrcType, USrcType, N>(
325 deviceID, context, queue, num_elements, true, true);
326
327 return result;
328 }
329
test_integer_dot_product(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)330 int test_integer_dot_product(cl_device_id deviceID, cl_context context,
331 cl_command_queue queue, int num_elements)
332 {
333 if (!is_extension_available(deviceID, "cl_khr_integer_dot_product"))
334 {
335 log_info("cl_khr_integer_dot_product is not supported\n");
336 return TEST_SKIPPED_ITSELF;
337 }
338
339 Version deviceVersion = get_device_cl_version(deviceID);
340 cl_version extensionVersion;
341
342 if ((deviceVersion >= Version(3, 0))
343 || is_extension_available(deviceID, "cl_khr_extended_versioning"))
344 {
345 extensionVersion =
346 get_extension_version(deviceID, "cl_khr_integer_dot_product");
347 }
348 else
349 {
350 // Assume 1.0.0 is supported if the version can't be queried
351 extensionVersion = CL_MAKE_VERSION(1, 0, 0);
352 }
353
354 cl_int error = CL_SUCCESS;
355 int result = TEST_PASS;
356
357 cl_device_integer_dot_product_capabilities_khr dotCaps = 0;
358 error = clGetDeviceInfo(deviceID,
359 CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR,
360 sizeof(dotCaps), &dotCaps, NULL);
361 test_error(
362 error,
363 "Unable to query CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR");
364
365 // Check that the required capabilities are reported
366 test_assert_error(
367 dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR,
368 "When cl_khr_integer_dot_product is supported "
369 "CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR must be "
370 "supported");
371
372 if (extensionVersion >= CL_MAKE_VERSION(2, 0, 0))
373 {
374 test_assert_error(
375 dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR,
376 "When cl_khr_integer_dot_product is supported with version >= 2.0.0"
377 "CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR must be "
378 "supported");
379 }
380
381 // Check that acceleration properties can be queried
382 if (extensionVersion >= CL_MAKE_VERSION(2, 0, 0))
383 {
384 size_t size_ret;
385 error = clGetDeviceInfo(
386 deviceID,
387 CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, 0,
388 nullptr, &size_ret);
389 test_error(
390 error,
391 "Unable to query size of data returned by "
392 "CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR");
393
394 cl_device_integer_dot_product_acceleration_properties_khr
395 accelerationProperties;
396 error = clGetDeviceInfo(
397 deviceID,
398 CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR,
399 sizeof(accelerationProperties), &accelerationProperties, nullptr);
400 test_error(error, "Unable to query 8-bit acceleration properties");
401
402 error = clGetDeviceInfo(
403 deviceID,
404 CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR,
405 0, nullptr, &size_ret);
406 test_error(
407 error,
408 "Unable to query size of data returned by "
409 "CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_"
410 "PACKED_KHR");
411
412 error = clGetDeviceInfo(
413 deviceID,
414 CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR,
415 sizeof(accelerationProperties), &accelerationProperties, nullptr);
416 test_error(error,
417 "Unable to query 4x8-bit packed acceleration properties");
418 }
419
420 // Report when unknown capabilities are found
421 if (dotCaps
422 & ~(CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR
423 | CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR))
424 {
425 log_info("NOTE: found an unknown / untested capability!\n");
426 }
427
428 // Test built-in functions
429 if (dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR)
430 {
431 result |= test_vectype<cl_uchar, cl_uint, 4>(deviceID, context, queue,
432 num_elements);
433 }
434
435 if (dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR)
436 {
437 result |= test_vectype_packed<cl_uchar, cl_uint, 4>(
438 deviceID, context, queue, num_elements);
439 }
440
441 return result;
442 }
443