xref: /aosp_15_r20/external/clpeak/src/global_bandwidth.cpp (revision 1cd03ba3888297bc945f2c84574e105e3ced3e34)
#include <clpeak.h>
#include <vector>
2*1cd03ba3SJeremy Kemp 
// Host-side scaling factor for the global bandwidth test: the element count
// is rounded to a multiple of (maxWGSize * FETCH_PER_WI * 16), and the global
// work size for each vector width is numItems / width / FETCH_PER_WI.
// NOTE(review): presumably this must equal the number of fetches each
// work-item performs in the global_bandwidth_* kernels -- confirm against the
// kernel source before changing it.
3*1cd03ba3SJeremy Kemp #define FETCH_PER_WI 16
4*1cd03ba3SJeremy Kemp 
runGlobalBandwidthTest(cl::CommandQueue & queue,cl::Program & prog,device_info_t & devInfo)5*1cd03ba3SJeremy Kemp int clPeak::runGlobalBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
6*1cd03ba3SJeremy Kemp {
7*1cd03ba3SJeremy Kemp   float timed_lo, timed_go, timed, gbps;
8*1cd03ba3SJeremy Kemp   cl::NDRange globalSize, localSize;
9*1cd03ba3SJeremy Kemp   float *arr = NULL;
10*1cd03ba3SJeremy Kemp 
11*1cd03ba3SJeremy Kemp   if (!isGlobalBW)
12*1cd03ba3SJeremy Kemp     return 0;
13*1cd03ba3SJeremy Kemp 
14*1cd03ba3SJeremy Kemp   cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
15*1cd03ba3SJeremy Kemp   uint iters = devInfo.gloalBWIters;
16*1cd03ba3SJeremy Kemp 
17*1cd03ba3SJeremy Kemp   uint64_t maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
18*1cd03ba3SJeremy Kemp   uint64_t numItems = roundToMultipleOf(maxItems, (devInfo.maxWGSize * FETCH_PER_WI * 16), devInfo.globalBWMaxSize);
19*1cd03ba3SJeremy Kemp 
20*1cd03ba3SJeremy Kemp   try
21*1cd03ba3SJeremy Kemp   {
22*1cd03ba3SJeremy Kemp     arr = new float[numItems];
23*1cd03ba3SJeremy Kemp     populate(arr, numItems);
24*1cd03ba3SJeremy Kemp 
25*1cd03ba3SJeremy Kemp     log->print(NEWLINE TAB TAB "Global memory bandwidth (GBPS)" NEWLINE);
26*1cd03ba3SJeremy Kemp     log->xmlOpenTag("global_memory_bandwidth");
27*1cd03ba3SJeremy Kemp     log->xmlAppendAttribs("unit", "gbps");
28*1cd03ba3SJeremy Kemp 
29*1cd03ba3SJeremy Kemp     cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
30*1cd03ba3SJeremy Kemp     cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));
31*1cd03ba3SJeremy Kemp     queue.enqueueWriteBuffer(inputBuf, CL_TRUE, 0, (numItems * sizeof(float)), arr);
32*1cd03ba3SJeremy Kemp 
33*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v1_lo(prog, "global_bandwidth_v1_local_offset");
34*1cd03ba3SJeremy Kemp     kernel_v1_lo.setArg(0, inputBuf), kernel_v1_lo.setArg(1, outputBuf);
35*1cd03ba3SJeremy Kemp 
36*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v2_lo(prog, "global_bandwidth_v2_local_offset");
37*1cd03ba3SJeremy Kemp     kernel_v2_lo.setArg(0, inputBuf), kernel_v2_lo.setArg(1, outputBuf);
38*1cd03ba3SJeremy Kemp 
39*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v4_lo(prog, "global_bandwidth_v4_local_offset");
40*1cd03ba3SJeremy Kemp     kernel_v4_lo.setArg(0, inputBuf), kernel_v4_lo.setArg(1, outputBuf);
41*1cd03ba3SJeremy Kemp 
42*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v8_lo(prog, "global_bandwidth_v8_local_offset");
43*1cd03ba3SJeremy Kemp     kernel_v8_lo.setArg(0, inputBuf), kernel_v8_lo.setArg(1, outputBuf);
44*1cd03ba3SJeremy Kemp 
45*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v16_lo(prog, "global_bandwidth_v16_local_offset");
46*1cd03ba3SJeremy Kemp     kernel_v16_lo.setArg(0, inputBuf), kernel_v16_lo.setArg(1, outputBuf);
47*1cd03ba3SJeremy Kemp 
48*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v1_go(prog, "global_bandwidth_v1_global_offset");
49*1cd03ba3SJeremy Kemp     kernel_v1_go.setArg(0, inputBuf), kernel_v1_go.setArg(1, outputBuf);
50*1cd03ba3SJeremy Kemp 
51*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v2_go(prog, "global_bandwidth_v2_global_offset");
52*1cd03ba3SJeremy Kemp     kernel_v2_go.setArg(0, inputBuf), kernel_v2_go.setArg(1, outputBuf);
53*1cd03ba3SJeremy Kemp 
54*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v4_go(prog, "global_bandwidth_v4_global_offset");
55*1cd03ba3SJeremy Kemp     kernel_v4_go.setArg(0, inputBuf), kernel_v4_go.setArg(1, outputBuf);
56*1cd03ba3SJeremy Kemp 
57*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v8_go(prog, "global_bandwidth_v8_global_offset");
58*1cd03ba3SJeremy Kemp     kernel_v8_go.setArg(0, inputBuf), kernel_v8_go.setArg(1, outputBuf);
59*1cd03ba3SJeremy Kemp 
60*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v16_go(prog, "global_bandwidth_v16_global_offset");
61*1cd03ba3SJeremy Kemp     kernel_v16_go.setArg(0, inputBuf), kernel_v16_go.setArg(1, outputBuf);
62*1cd03ba3SJeremy Kemp 
63*1cd03ba3SJeremy Kemp     localSize = devInfo.maxWGSize;
64*1cd03ba3SJeremy Kemp 
65*1cd03ba3SJeremy Kemp     ///////////////////////////////////////////////////////////////////////////
66*1cd03ba3SJeremy Kemp     // Vector width 1
67*1cd03ba3SJeremy Kemp     if (!forceTest || strcmp(specifiedTestName, "float") == 0)
68*1cd03ba3SJeremy Kemp     {
69*1cd03ba3SJeremy Kemp       log->print(TAB TAB TAB "float   : ");
70*1cd03ba3SJeremy Kemp 
71*1cd03ba3SJeremy Kemp       globalSize = numItems / FETCH_PER_WI;
72*1cd03ba3SJeremy Kemp 
73*1cd03ba3SJeremy Kemp       // Run 2 kind of bandwidth kernel
74*1cd03ba3SJeremy Kemp       // lo -- local_size offset - subsequent fetches at local_size offset
75*1cd03ba3SJeremy Kemp       // go -- global_size offset
76*1cd03ba3SJeremy Kemp       timed_lo = run_kernel(queue, kernel_v1_lo, globalSize, localSize, iters);
77*1cd03ba3SJeremy Kemp       timed_go = run_kernel(queue, kernel_v1_go, globalSize, localSize, iters);
78*1cd03ba3SJeremy Kemp       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
79*1cd03ba3SJeremy Kemp 
80*1cd03ba3SJeremy Kemp       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
81*1cd03ba3SJeremy Kemp 
82*1cd03ba3SJeremy Kemp       log->print(gbps);
83*1cd03ba3SJeremy Kemp       log->print(NEWLINE);
84*1cd03ba3SJeremy Kemp       log->xmlRecord("float", gbps);
85*1cd03ba3SJeremy Kemp     }
86*1cd03ba3SJeremy Kemp     ///////////////////////////////////////////////////////////////////////////
87*1cd03ba3SJeremy Kemp 
88*1cd03ba3SJeremy Kemp     // Vector width 2
89*1cd03ba3SJeremy Kemp     if (!forceTest || strcmp(specifiedTestName, "float2") == 0)
90*1cd03ba3SJeremy Kemp     {
91*1cd03ba3SJeremy Kemp       log->print(TAB TAB TAB "float2  : ");
92*1cd03ba3SJeremy Kemp 
93*1cd03ba3SJeremy Kemp       globalSize = (numItems / 2 / FETCH_PER_WI);
94*1cd03ba3SJeremy Kemp 
95*1cd03ba3SJeremy Kemp       timed_lo = run_kernel(queue, kernel_v2_lo, globalSize, localSize, iters);
96*1cd03ba3SJeremy Kemp       timed_go = run_kernel(queue, kernel_v2_go, globalSize, localSize, iters);
97*1cd03ba3SJeremy Kemp       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
98*1cd03ba3SJeremy Kemp 
99*1cd03ba3SJeremy Kemp       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
100*1cd03ba3SJeremy Kemp 
101*1cd03ba3SJeremy Kemp       log->print(gbps);
102*1cd03ba3SJeremy Kemp       log->print(NEWLINE);
103*1cd03ba3SJeremy Kemp       log->xmlRecord("float2", gbps);
104*1cd03ba3SJeremy Kemp     }
105*1cd03ba3SJeremy Kemp     ///////////////////////////////////////////////////////////////////////////
106*1cd03ba3SJeremy Kemp 
107*1cd03ba3SJeremy Kemp     // Vector width 4
108*1cd03ba3SJeremy Kemp     if (!forceTest || strcmp(specifiedTestName, "float4") == 0)
109*1cd03ba3SJeremy Kemp     {
110*1cd03ba3SJeremy Kemp       log->print(TAB TAB TAB "float4  : ");
111*1cd03ba3SJeremy Kemp 
112*1cd03ba3SJeremy Kemp       globalSize = (numItems / 4 / FETCH_PER_WI);
113*1cd03ba3SJeremy Kemp 
114*1cd03ba3SJeremy Kemp       timed_lo = run_kernel(queue, kernel_v4_lo, globalSize, localSize, iters);
115*1cd03ba3SJeremy Kemp       timed_go = run_kernel(queue, kernel_v4_go, globalSize, localSize, iters);
116*1cd03ba3SJeremy Kemp       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
117*1cd03ba3SJeremy Kemp 
118*1cd03ba3SJeremy Kemp       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
119*1cd03ba3SJeremy Kemp 
120*1cd03ba3SJeremy Kemp       log->print(gbps);
121*1cd03ba3SJeremy Kemp       log->print(NEWLINE);
122*1cd03ba3SJeremy Kemp       log->xmlRecord("float4", gbps);
123*1cd03ba3SJeremy Kemp     }
124*1cd03ba3SJeremy Kemp     ///////////////////////////////////////////////////////////////////////////
125*1cd03ba3SJeremy Kemp 
126*1cd03ba3SJeremy Kemp     // Vector width 8
127*1cd03ba3SJeremy Kemp     if (!forceTest || strcmp(specifiedTestName, "float8") == 0)
128*1cd03ba3SJeremy Kemp     {
129*1cd03ba3SJeremy Kemp       log->print(TAB TAB TAB "float8  : ");
130*1cd03ba3SJeremy Kemp 
131*1cd03ba3SJeremy Kemp       globalSize = (numItems / 8 / FETCH_PER_WI);
132*1cd03ba3SJeremy Kemp 
133*1cd03ba3SJeremy Kemp       timed_lo = run_kernel(queue, kernel_v8_lo, globalSize, localSize, iters);
134*1cd03ba3SJeremy Kemp       timed_go = run_kernel(queue, kernel_v8_go, globalSize, localSize, iters);
135*1cd03ba3SJeremy Kemp       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
136*1cd03ba3SJeremy Kemp 
137*1cd03ba3SJeremy Kemp       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
138*1cd03ba3SJeremy Kemp 
139*1cd03ba3SJeremy Kemp       log->print(gbps);
140*1cd03ba3SJeremy Kemp       log->print(NEWLINE);
141*1cd03ba3SJeremy Kemp       log->xmlRecord("float8", gbps);
142*1cd03ba3SJeremy Kemp     }
143*1cd03ba3SJeremy Kemp     ///////////////////////////////////////////////////////////////////////////
144*1cd03ba3SJeremy Kemp 
145*1cd03ba3SJeremy Kemp     // Vector width 16
146*1cd03ba3SJeremy Kemp     if (!forceTest || strcmp(specifiedTestName, "float16") == 0)
147*1cd03ba3SJeremy Kemp     {
148*1cd03ba3SJeremy Kemp       log->print(TAB TAB TAB "float16 : ");
149*1cd03ba3SJeremy Kemp       globalSize = (numItems / 16 / FETCH_PER_WI);
150*1cd03ba3SJeremy Kemp 
151*1cd03ba3SJeremy Kemp       timed_lo = run_kernel(queue, kernel_v16_lo, globalSize, localSize, iters);
152*1cd03ba3SJeremy Kemp       timed_go = run_kernel(queue, kernel_v16_go, globalSize, localSize, iters);
153*1cd03ba3SJeremy Kemp       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
154*1cd03ba3SJeremy Kemp 
155*1cd03ba3SJeremy Kemp       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
156*1cd03ba3SJeremy Kemp 
157*1cd03ba3SJeremy Kemp       log->print(gbps);
158*1cd03ba3SJeremy Kemp       log->print(NEWLINE);
159*1cd03ba3SJeremy Kemp       log->xmlRecord("float16", gbps);
160*1cd03ba3SJeremy Kemp     }
161*1cd03ba3SJeremy Kemp     ///////////////////////////////////////////////////////////////////////////
162*1cd03ba3SJeremy Kemp     log->xmlCloseTag(); // global_memory_bandwidth
163*1cd03ba3SJeremy Kemp 
164*1cd03ba3SJeremy Kemp     if (arr)
165*1cd03ba3SJeremy Kemp     {
166*1cd03ba3SJeremy Kemp       delete[] arr;
167*1cd03ba3SJeremy Kemp     }
168*1cd03ba3SJeremy Kemp   }
169*1cd03ba3SJeremy Kemp   catch (cl::Error &error)
170*1cd03ba3SJeremy Kemp   {
171*1cd03ba3SJeremy Kemp     stringstream ss;
172*1cd03ba3SJeremy Kemp     ss << error.what() << " (" << error.err() << ")" NEWLINE
173*1cd03ba3SJeremy Kemp        << TAB TAB TAB "Tests skipped" NEWLINE;
174*1cd03ba3SJeremy Kemp     log->print(ss.str());
175*1cd03ba3SJeremy Kemp 
176*1cd03ba3SJeremy Kemp     if (arr)
177*1cd03ba3SJeremy Kemp     {
178*1cd03ba3SJeremy Kemp       delete[] arr;
179*1cd03ba3SJeremy Kemp     }
180*1cd03ba3SJeremy Kemp     return -1;
181*1cd03ba3SJeremy Kemp   }
182*1cd03ba3SJeremy Kemp 
183*1cd03ba3SJeremy Kemp   return 0;
184*1cd03ba3SJeremy Kemp }
185