1*1cd03ba3SJeremy Kemp #include <clpeak.h>
2*1cd03ba3SJeremy Kemp
3*1cd03ba3SJeremy Kemp #define FETCH_PER_WI 16
4*1cd03ba3SJeremy Kemp
runGlobalBandwidthTest(cl::CommandQueue & queue,cl::Program & prog,device_info_t & devInfo)5*1cd03ba3SJeremy Kemp int clPeak::runGlobalBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
6*1cd03ba3SJeremy Kemp {
7*1cd03ba3SJeremy Kemp float timed_lo, timed_go, timed, gbps;
8*1cd03ba3SJeremy Kemp cl::NDRange globalSize, localSize;
9*1cd03ba3SJeremy Kemp float *arr = NULL;
10*1cd03ba3SJeremy Kemp
11*1cd03ba3SJeremy Kemp if (!isGlobalBW)
12*1cd03ba3SJeremy Kemp return 0;
13*1cd03ba3SJeremy Kemp
14*1cd03ba3SJeremy Kemp cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
15*1cd03ba3SJeremy Kemp uint iters = devInfo.gloalBWIters;
16*1cd03ba3SJeremy Kemp
17*1cd03ba3SJeremy Kemp uint64_t maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
18*1cd03ba3SJeremy Kemp uint64_t numItems = roundToMultipleOf(maxItems, (devInfo.maxWGSize * FETCH_PER_WI * 16), devInfo.globalBWMaxSize);
19*1cd03ba3SJeremy Kemp
20*1cd03ba3SJeremy Kemp try
21*1cd03ba3SJeremy Kemp {
22*1cd03ba3SJeremy Kemp arr = new float[numItems];
23*1cd03ba3SJeremy Kemp populate(arr, numItems);
24*1cd03ba3SJeremy Kemp
25*1cd03ba3SJeremy Kemp log->print(NEWLINE TAB TAB "Global memory bandwidth (GBPS)" NEWLINE);
26*1cd03ba3SJeremy Kemp log->xmlOpenTag("global_memory_bandwidth");
27*1cd03ba3SJeremy Kemp log->xmlAppendAttribs("unit", "gbps");
28*1cd03ba3SJeremy Kemp
29*1cd03ba3SJeremy Kemp cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
30*1cd03ba3SJeremy Kemp cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));
31*1cd03ba3SJeremy Kemp queue.enqueueWriteBuffer(inputBuf, CL_TRUE, 0, (numItems * sizeof(float)), arr);
32*1cd03ba3SJeremy Kemp
33*1cd03ba3SJeremy Kemp cl::Kernel kernel_v1_lo(prog, "global_bandwidth_v1_local_offset");
34*1cd03ba3SJeremy Kemp kernel_v1_lo.setArg(0, inputBuf), kernel_v1_lo.setArg(1, outputBuf);
35*1cd03ba3SJeremy Kemp
36*1cd03ba3SJeremy Kemp cl::Kernel kernel_v2_lo(prog, "global_bandwidth_v2_local_offset");
37*1cd03ba3SJeremy Kemp kernel_v2_lo.setArg(0, inputBuf), kernel_v2_lo.setArg(1, outputBuf);
38*1cd03ba3SJeremy Kemp
39*1cd03ba3SJeremy Kemp cl::Kernel kernel_v4_lo(prog, "global_bandwidth_v4_local_offset");
40*1cd03ba3SJeremy Kemp kernel_v4_lo.setArg(0, inputBuf), kernel_v4_lo.setArg(1, outputBuf);
41*1cd03ba3SJeremy Kemp
42*1cd03ba3SJeremy Kemp cl::Kernel kernel_v8_lo(prog, "global_bandwidth_v8_local_offset");
43*1cd03ba3SJeremy Kemp kernel_v8_lo.setArg(0, inputBuf), kernel_v8_lo.setArg(1, outputBuf);
44*1cd03ba3SJeremy Kemp
45*1cd03ba3SJeremy Kemp cl::Kernel kernel_v16_lo(prog, "global_bandwidth_v16_local_offset");
46*1cd03ba3SJeremy Kemp kernel_v16_lo.setArg(0, inputBuf), kernel_v16_lo.setArg(1, outputBuf);
47*1cd03ba3SJeremy Kemp
48*1cd03ba3SJeremy Kemp cl::Kernel kernel_v1_go(prog, "global_bandwidth_v1_global_offset");
49*1cd03ba3SJeremy Kemp kernel_v1_go.setArg(0, inputBuf), kernel_v1_go.setArg(1, outputBuf);
50*1cd03ba3SJeremy Kemp
51*1cd03ba3SJeremy Kemp cl::Kernel kernel_v2_go(prog, "global_bandwidth_v2_global_offset");
52*1cd03ba3SJeremy Kemp kernel_v2_go.setArg(0, inputBuf), kernel_v2_go.setArg(1, outputBuf);
53*1cd03ba3SJeremy Kemp
54*1cd03ba3SJeremy Kemp cl::Kernel kernel_v4_go(prog, "global_bandwidth_v4_global_offset");
55*1cd03ba3SJeremy Kemp kernel_v4_go.setArg(0, inputBuf), kernel_v4_go.setArg(1, outputBuf);
56*1cd03ba3SJeremy Kemp
57*1cd03ba3SJeremy Kemp cl::Kernel kernel_v8_go(prog, "global_bandwidth_v8_global_offset");
58*1cd03ba3SJeremy Kemp kernel_v8_go.setArg(0, inputBuf), kernel_v8_go.setArg(1, outputBuf);
59*1cd03ba3SJeremy Kemp
60*1cd03ba3SJeremy Kemp cl::Kernel kernel_v16_go(prog, "global_bandwidth_v16_global_offset");
61*1cd03ba3SJeremy Kemp kernel_v16_go.setArg(0, inputBuf), kernel_v16_go.setArg(1, outputBuf);
62*1cd03ba3SJeremy Kemp
63*1cd03ba3SJeremy Kemp localSize = devInfo.maxWGSize;
64*1cd03ba3SJeremy Kemp
65*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
66*1cd03ba3SJeremy Kemp // Vector width 1
67*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float") == 0)
68*1cd03ba3SJeremy Kemp {
69*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float : ");
70*1cd03ba3SJeremy Kemp
71*1cd03ba3SJeremy Kemp globalSize = numItems / FETCH_PER_WI;
72*1cd03ba3SJeremy Kemp
73*1cd03ba3SJeremy Kemp // Run 2 kind of bandwidth kernel
74*1cd03ba3SJeremy Kemp // lo -- local_size offset - subsequent fetches at local_size offset
75*1cd03ba3SJeremy Kemp // go -- global_size offset
76*1cd03ba3SJeremy Kemp timed_lo = run_kernel(queue, kernel_v1_lo, globalSize, localSize, iters);
77*1cd03ba3SJeremy Kemp timed_go = run_kernel(queue, kernel_v1_go, globalSize, localSize, iters);
78*1cd03ba3SJeremy Kemp timed = (timed_lo < timed_go) ? timed_lo : timed_go;
79*1cd03ba3SJeremy Kemp
80*1cd03ba3SJeremy Kemp gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
81*1cd03ba3SJeremy Kemp
82*1cd03ba3SJeremy Kemp log->print(gbps);
83*1cd03ba3SJeremy Kemp log->print(NEWLINE);
84*1cd03ba3SJeremy Kemp log->xmlRecord("float", gbps);
85*1cd03ba3SJeremy Kemp }
86*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
87*1cd03ba3SJeremy Kemp
88*1cd03ba3SJeremy Kemp // Vector width 2
89*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float2") == 0)
90*1cd03ba3SJeremy Kemp {
91*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float2 : ");
92*1cd03ba3SJeremy Kemp
93*1cd03ba3SJeremy Kemp globalSize = (numItems / 2 / FETCH_PER_WI);
94*1cd03ba3SJeremy Kemp
95*1cd03ba3SJeremy Kemp timed_lo = run_kernel(queue, kernel_v2_lo, globalSize, localSize, iters);
96*1cd03ba3SJeremy Kemp timed_go = run_kernel(queue, kernel_v2_go, globalSize, localSize, iters);
97*1cd03ba3SJeremy Kemp timed = (timed_lo < timed_go) ? timed_lo : timed_go;
98*1cd03ba3SJeremy Kemp
99*1cd03ba3SJeremy Kemp gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
100*1cd03ba3SJeremy Kemp
101*1cd03ba3SJeremy Kemp log->print(gbps);
102*1cd03ba3SJeremy Kemp log->print(NEWLINE);
103*1cd03ba3SJeremy Kemp log->xmlRecord("float2", gbps);
104*1cd03ba3SJeremy Kemp }
105*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
106*1cd03ba3SJeremy Kemp
107*1cd03ba3SJeremy Kemp // Vector width 4
108*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float4") == 0)
109*1cd03ba3SJeremy Kemp {
110*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float4 : ");
111*1cd03ba3SJeremy Kemp
112*1cd03ba3SJeremy Kemp globalSize = (numItems / 4 / FETCH_PER_WI);
113*1cd03ba3SJeremy Kemp
114*1cd03ba3SJeremy Kemp timed_lo = run_kernel(queue, kernel_v4_lo, globalSize, localSize, iters);
115*1cd03ba3SJeremy Kemp timed_go = run_kernel(queue, kernel_v4_go, globalSize, localSize, iters);
116*1cd03ba3SJeremy Kemp timed = (timed_lo < timed_go) ? timed_lo : timed_go;
117*1cd03ba3SJeremy Kemp
118*1cd03ba3SJeremy Kemp gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
119*1cd03ba3SJeremy Kemp
120*1cd03ba3SJeremy Kemp log->print(gbps);
121*1cd03ba3SJeremy Kemp log->print(NEWLINE);
122*1cd03ba3SJeremy Kemp log->xmlRecord("float4", gbps);
123*1cd03ba3SJeremy Kemp }
124*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
125*1cd03ba3SJeremy Kemp
126*1cd03ba3SJeremy Kemp // Vector width 8
127*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float8") == 0)
128*1cd03ba3SJeremy Kemp {
129*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float8 : ");
130*1cd03ba3SJeremy Kemp
131*1cd03ba3SJeremy Kemp globalSize = (numItems / 8 / FETCH_PER_WI);
132*1cd03ba3SJeremy Kemp
133*1cd03ba3SJeremy Kemp timed_lo = run_kernel(queue, kernel_v8_lo, globalSize, localSize, iters);
134*1cd03ba3SJeremy Kemp timed_go = run_kernel(queue, kernel_v8_go, globalSize, localSize, iters);
135*1cd03ba3SJeremy Kemp timed = (timed_lo < timed_go) ? timed_lo : timed_go;
136*1cd03ba3SJeremy Kemp
137*1cd03ba3SJeremy Kemp gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
138*1cd03ba3SJeremy Kemp
139*1cd03ba3SJeremy Kemp log->print(gbps);
140*1cd03ba3SJeremy Kemp log->print(NEWLINE);
141*1cd03ba3SJeremy Kemp log->xmlRecord("float8", gbps);
142*1cd03ba3SJeremy Kemp }
143*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
144*1cd03ba3SJeremy Kemp
145*1cd03ba3SJeremy Kemp // Vector width 16
146*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float16") == 0)
147*1cd03ba3SJeremy Kemp {
148*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float16 : ");
149*1cd03ba3SJeremy Kemp globalSize = (numItems / 16 / FETCH_PER_WI);
150*1cd03ba3SJeremy Kemp
151*1cd03ba3SJeremy Kemp timed_lo = run_kernel(queue, kernel_v16_lo, globalSize, localSize, iters);
152*1cd03ba3SJeremy Kemp timed_go = run_kernel(queue, kernel_v16_go, globalSize, localSize, iters);
153*1cd03ba3SJeremy Kemp timed = (timed_lo < timed_go) ? timed_lo : timed_go;
154*1cd03ba3SJeremy Kemp
155*1cd03ba3SJeremy Kemp gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
156*1cd03ba3SJeremy Kemp
157*1cd03ba3SJeremy Kemp log->print(gbps);
158*1cd03ba3SJeremy Kemp log->print(NEWLINE);
159*1cd03ba3SJeremy Kemp log->xmlRecord("float16", gbps);
160*1cd03ba3SJeremy Kemp }
161*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
162*1cd03ba3SJeremy Kemp log->xmlCloseTag(); // global_memory_bandwidth
163*1cd03ba3SJeremy Kemp
164*1cd03ba3SJeremy Kemp if (arr)
165*1cd03ba3SJeremy Kemp {
166*1cd03ba3SJeremy Kemp delete[] arr;
167*1cd03ba3SJeremy Kemp }
168*1cd03ba3SJeremy Kemp }
169*1cd03ba3SJeremy Kemp catch (cl::Error &error)
170*1cd03ba3SJeremy Kemp {
171*1cd03ba3SJeremy Kemp stringstream ss;
172*1cd03ba3SJeremy Kemp ss << error.what() << " (" << error.err() << ")" NEWLINE
173*1cd03ba3SJeremy Kemp << TAB TAB TAB "Tests skipped" NEWLINE;
174*1cd03ba3SJeremy Kemp log->print(ss.str());
175*1cd03ba3SJeremy Kemp
176*1cd03ba3SJeremy Kemp if (arr)
177*1cd03ba3SJeremy Kemp {
178*1cd03ba3SJeremy Kemp delete[] arr;
179*1cd03ba3SJeremy Kemp }
180*1cd03ba3SJeremy Kemp return -1;
181*1cd03ba3SJeremy Kemp }
182*1cd03ba3SJeremy Kemp
183*1cd03ba3SJeremy Kemp return 0;
184*1cd03ba3SJeremy Kemp }
185