#include <clpeak.h>
#include <vector>
2
3 #define FETCH_PER_WI 16
4
runGlobalBandwidthTest(cl::CommandQueue & queue,cl::Program & prog,device_info_t & devInfo)5 int clPeak::runGlobalBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
6 {
7 float timed_lo, timed_go, timed, gbps;
8 cl::NDRange globalSize, localSize;
9 float *arr = NULL;
10
11 if (!isGlobalBW)
12 return 0;
13
14 cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
15 uint iters = devInfo.gloalBWIters;
16
17 uint64_t maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
18 uint64_t numItems = roundToMultipleOf(maxItems, (devInfo.maxWGSize * FETCH_PER_WI * 16), devInfo.globalBWMaxSize);
19
20 try
21 {
22 arr = new float[numItems];
23 populate(arr, numItems);
24
25 log->print(NEWLINE TAB TAB "Global memory bandwidth (GBPS)" NEWLINE);
26 log->xmlOpenTag("global_memory_bandwidth");
27 log->xmlAppendAttribs("unit", "gbps");
28
29 cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
30 cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));
31 queue.enqueueWriteBuffer(inputBuf, CL_TRUE, 0, (numItems * sizeof(float)), arr);
32
33 cl::Kernel kernel_v1_lo(prog, "global_bandwidth_v1_local_offset");
34 kernel_v1_lo.setArg(0, inputBuf), kernel_v1_lo.setArg(1, outputBuf);
35
36 cl::Kernel kernel_v2_lo(prog, "global_bandwidth_v2_local_offset");
37 kernel_v2_lo.setArg(0, inputBuf), kernel_v2_lo.setArg(1, outputBuf);
38
39 cl::Kernel kernel_v4_lo(prog, "global_bandwidth_v4_local_offset");
40 kernel_v4_lo.setArg(0, inputBuf), kernel_v4_lo.setArg(1, outputBuf);
41
42 cl::Kernel kernel_v8_lo(prog, "global_bandwidth_v8_local_offset");
43 kernel_v8_lo.setArg(0, inputBuf), kernel_v8_lo.setArg(1, outputBuf);
44
45 cl::Kernel kernel_v16_lo(prog, "global_bandwidth_v16_local_offset");
46 kernel_v16_lo.setArg(0, inputBuf), kernel_v16_lo.setArg(1, outputBuf);
47
48 cl::Kernel kernel_v1_go(prog, "global_bandwidth_v1_global_offset");
49 kernel_v1_go.setArg(0, inputBuf), kernel_v1_go.setArg(1, outputBuf);
50
51 cl::Kernel kernel_v2_go(prog, "global_bandwidth_v2_global_offset");
52 kernel_v2_go.setArg(0, inputBuf), kernel_v2_go.setArg(1, outputBuf);
53
54 cl::Kernel kernel_v4_go(prog, "global_bandwidth_v4_global_offset");
55 kernel_v4_go.setArg(0, inputBuf), kernel_v4_go.setArg(1, outputBuf);
56
57 cl::Kernel kernel_v8_go(prog, "global_bandwidth_v8_global_offset");
58 kernel_v8_go.setArg(0, inputBuf), kernel_v8_go.setArg(1, outputBuf);
59
60 cl::Kernel kernel_v16_go(prog, "global_bandwidth_v16_global_offset");
61 kernel_v16_go.setArg(0, inputBuf), kernel_v16_go.setArg(1, outputBuf);
62
63 localSize = devInfo.maxWGSize;
64
65 ///////////////////////////////////////////////////////////////////////////
66 // Vector width 1
67 if (!forceTest || strcmp(specifiedTestName, "float") == 0)
68 {
69 log->print(TAB TAB TAB "float : ");
70
71 globalSize = numItems / FETCH_PER_WI;
72
73 // Run 2 kind of bandwidth kernel
74 // lo -- local_size offset - subsequent fetches at local_size offset
75 // go -- global_size offset
76 timed_lo = run_kernel(queue, kernel_v1_lo, globalSize, localSize, iters);
77 timed_go = run_kernel(queue, kernel_v1_go, globalSize, localSize, iters);
78 timed = (timed_lo < timed_go) ? timed_lo : timed_go;
79
80 gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
81
82 log->print(gbps);
83 log->print(NEWLINE);
84 log->xmlRecord("float", gbps);
85 }
86 ///////////////////////////////////////////////////////////////////////////
87
88 // Vector width 2
89 if (!forceTest || strcmp(specifiedTestName, "float2") == 0)
90 {
91 log->print(TAB TAB TAB "float2 : ");
92
93 globalSize = (numItems / 2 / FETCH_PER_WI);
94
95 timed_lo = run_kernel(queue, kernel_v2_lo, globalSize, localSize, iters);
96 timed_go = run_kernel(queue, kernel_v2_go, globalSize, localSize, iters);
97 timed = (timed_lo < timed_go) ? timed_lo : timed_go;
98
99 gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
100
101 log->print(gbps);
102 log->print(NEWLINE);
103 log->xmlRecord("float2", gbps);
104 }
105 ///////////////////////////////////////////////////////////////////////////
106
107 // Vector width 4
108 if (!forceTest || strcmp(specifiedTestName, "float4") == 0)
109 {
110 log->print(TAB TAB TAB "float4 : ");
111
112 globalSize = (numItems / 4 / FETCH_PER_WI);
113
114 timed_lo = run_kernel(queue, kernel_v4_lo, globalSize, localSize, iters);
115 timed_go = run_kernel(queue, kernel_v4_go, globalSize, localSize, iters);
116 timed = (timed_lo < timed_go) ? timed_lo : timed_go;
117
118 gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
119
120 log->print(gbps);
121 log->print(NEWLINE);
122 log->xmlRecord("float4", gbps);
123 }
124 ///////////////////////////////////////////////////////////////////////////
125
126 // Vector width 8
127 if (!forceTest || strcmp(specifiedTestName, "float8") == 0)
128 {
129 log->print(TAB TAB TAB "float8 : ");
130
131 globalSize = (numItems / 8 / FETCH_PER_WI);
132
133 timed_lo = run_kernel(queue, kernel_v8_lo, globalSize, localSize, iters);
134 timed_go = run_kernel(queue, kernel_v8_go, globalSize, localSize, iters);
135 timed = (timed_lo < timed_go) ? timed_lo : timed_go;
136
137 gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
138
139 log->print(gbps);
140 log->print(NEWLINE);
141 log->xmlRecord("float8", gbps);
142 }
143 ///////////////////////////////////////////////////////////////////////////
144
145 // Vector width 16
146 if (!forceTest || strcmp(specifiedTestName, "float16") == 0)
147 {
148 log->print(TAB TAB TAB "float16 : ");
149 globalSize = (numItems / 16 / FETCH_PER_WI);
150
151 timed_lo = run_kernel(queue, kernel_v16_lo, globalSize, localSize, iters);
152 timed_go = run_kernel(queue, kernel_v16_go, globalSize, localSize, iters);
153 timed = (timed_lo < timed_go) ? timed_lo : timed_go;
154
155 gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
156
157 log->print(gbps);
158 log->print(NEWLINE);
159 log->xmlRecord("float16", gbps);
160 }
161 ///////////////////////////////////////////////////////////////////////////
162 log->xmlCloseTag(); // global_memory_bandwidth
163
164 if (arr)
165 {
166 delete[] arr;
167 }
168 }
169 catch (cl::Error &error)
170 {
171 stringstream ss;
172 ss << error.what() << " (" << error.err() << ")" NEWLINE
173 << TAB TAB TAB "Tests skipped" NEWLINE;
174 log->print(ss.str());
175
176 if (arr)
177 {
178 delete[] arr;
179 }
180 return -1;
181 }
182
183 return 0;
184 }
185