1 #include <common.h>
2 #include <math.h>
3 #include <iostream>
4 #include <string>
5
6 using namespace std;
7
getDeviceInfo(cl::Device & d)8 device_info_t getDeviceInfo(cl::Device &d)
9 {
10 device_info_t devInfo;
11
12 devInfo.deviceName = d.getInfo<CL_DEVICE_NAME>();
13 devInfo.driverVersion = d.getInfo<CL_DRIVER_VERSION>();
14 trimString(devInfo.deviceName);
15 trimString(devInfo.driverVersion);
16
17 devInfo.numCUs = (uint)d.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
18 vector<size_t> maxWIPerDim;
19 maxWIPerDim = d.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
20 devInfo.maxWGSize = (uint)maxWIPerDim[0];
21
22 // Limiting max work-group size to 256
23 #define MAX_WG_SIZE 256
24 devInfo.maxWGSize = std::min(devInfo.maxWGSize, (uint)MAX_WG_SIZE);
25
26 // FIXME limit max-workgroup size for qualcomm platform to 128
27 // Kernel launch fails for workgroup size 256(CL_DEVICE_MAX_WORK_ITEM_SIZES)
28 string vendor = d.getInfo<CL_DEVICE_VENDOR>();
29 if ((vendor.find("QUALCOMM") != std::string::npos) ||
30 (vendor.find("qualcomm") != std::string::npos))
31 {
32 devInfo.maxWGSize = std::min(devInfo.maxWGSize, (uint)128);
33 }
34
35 devInfo.maxAllocSize = static_cast<uint64_t>(d.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>());
36 devInfo.maxGlobalSize = static_cast<uint64_t>(d.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>());
37 devInfo.maxClockFreq = static_cast<uint>(d.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>());
38 devInfo.doubleSupported = false;
39 devInfo.halfSupported = false;
40
41 std::string extns = d.getInfo<CL_DEVICE_EXTENSIONS>();
42
43 if ((extns.find("cl_khr_fp16") != std::string::npos))
44 devInfo.halfSupported = true;
45
46 if ((extns.find("cl_khr_fp64") != std::string::npos) || (extns.find("cl_amd_fp64") != std::string::npos))
47 devInfo.doubleSupported = true;
48
49 devInfo.deviceType = d.getInfo<CL_DEVICE_TYPE>();
50
51 if (devInfo.deviceType & CL_DEVICE_TYPE_CPU)
52 {
53 devInfo.gloalBWIters = 20;
54 devInfo.globalBWMaxSize = 1 << 27;
55 devInfo.computeWgsPerCU = 512;
56 devInfo.computeDPWgsPerCU = 256;
57 devInfo.computeIters = 10;
58 devInfo.transferBWMaxSize = 1 << 27;
59 }
60 else
61 { // GPU
62 devInfo.gloalBWIters = 50;
63 devInfo.globalBWMaxSize = 1 << 29;
64 devInfo.computeWgsPerCU = 2048;
65 devInfo.computeDPWgsPerCU = 512;
66 devInfo.computeIters = 30;
67 devInfo.transferBWMaxSize = 1 << 29;
68 }
69 devInfo.transferBWIters = 20;
70 devInfo.kernelLatencyIters = 20000;
71
72 return devInfo;
73 }
74
timeInUS(cl::Event & timeEvent)75 float timeInUS(cl::Event &timeEvent)
76 {
77 cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
78 cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
79
80 return (float)((int)end - (int)start);
81 }
82
start()83 void Timer::start()
84 {
85 tick = chrono::high_resolution_clock::now();
86 }
87
stopAndTime()88 float Timer::stopAndTime()
89 {
90 tock = chrono::high_resolution_clock::now();
91 return (float)(chrono::duration_cast<chrono::microseconds>(tock - tick).count());
92 }
93
/**
 * Fills ptr[0..N) so that each element holds its own index as a float.
 *
 * @param ptr  Destination buffer; must have room for N floats.
 * @param N    Number of elements to write.
 */
void populate(float *ptr, uint64_t N)
{
  // The generator is still reseeded on every call, although the rand()-based
  // fill is disabled and the values written do not depend on it.
  srand(static_cast<unsigned int>(time(nullptr)));

  uint64_t idx = 0;
  while (idx < N)
  {
    ptr[idx] = static_cast<float>(idx);
    ++idx;
  }
}
104
/**
 * Fills ptr[0..N) so that each element holds its own index as a double.
 *
 * @param ptr  Destination buffer; must have room for N doubles.
 * @param N    Number of elements to write.
 */
void populate(double *ptr, uint64_t N)
{
  // The generator is still reseeded on every call, although the rand()-based
  // fill is disabled and the values written do not depend on it.
  srand(static_cast<unsigned int>(time(nullptr)));

  uint64_t idx = 0;
  while (idx < N)
  {
    ptr[idx] = static_cast<double>(idx);
    ++idx;
  }
}
115
/**
 * Clamps `number` to `maxValue` and rounds the result DOWN to the nearest
 * multiple of `base`.
 *
 * @param number    Value to round.
 * @param base      Granularity of the result. A base of 0 yields 0 (the only
 *                  multiple of zero) instead of dividing by zero.
 * @param maxValue  Upper bound applied to `number` before rounding.
 * @return          Largest multiple of `base` that is <= min(number, maxValue).
 */
uint64_t roundToMultipleOf(uint64_t number, uint64_t base, uint64_t maxValue)
{
  // Guard first: the modulo below is undefined for a zero divisor.
  if (base == 0)
  {
    return 0;
  }

  const uint64_t clamped = (number > maxValue) ? maxValue : number;
  return clamped - (clamped % base);
}
121
/**
 * Truncates `str` in place at its first embedded NUL character, dropping the
 * NUL and everything after it. A string containing no NUL is left unchanged.
 *
 * @param str  String to truncate.
 */
void trimString(std::string &str)
{
  const std::string::size_type nulAt = str.find('\0');
  if (nulAt == std::string::npos)
  {
    return;
  }

  // Shrinking to nulAt keeps exactly the characters before the NUL.
  str.resize(nulAt);
}
131