xref: /aosp_15_r20/external/clpeak/src/global_bandwidth.cpp (revision 1cd03ba3888297bc945f2c84574e105e3ced3e34)
#include <clpeak.h>
#include <memory>
2 
// Divisor applied to the element count when runGlobalBandwidthTest derives
// the global work size (numItems / vectorWidth / FETCH_PER_WI); it is also a
// factor of the granularity the buffer size is rounded to.
// NOTE(review): presumably the number of fetches each work-item performs in
// the global_bandwidth_* kernels -- confirm it matches the kernel source.
#define FETCH_PER_WI 16
4 
runGlobalBandwidthTest(cl::CommandQueue & queue,cl::Program & prog,device_info_t & devInfo)5 int clPeak::runGlobalBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
6 {
7   float timed_lo, timed_go, timed, gbps;
8   cl::NDRange globalSize, localSize;
9   float *arr = NULL;
10 
11   if (!isGlobalBW)
12     return 0;
13 
14   cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
15   uint iters = devInfo.gloalBWIters;
16 
17   uint64_t maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
18   uint64_t numItems = roundToMultipleOf(maxItems, (devInfo.maxWGSize * FETCH_PER_WI * 16), devInfo.globalBWMaxSize);
19 
20   try
21   {
22     arr = new float[numItems];
23     populate(arr, numItems);
24 
25     log->print(NEWLINE TAB TAB "Global memory bandwidth (GBPS)" NEWLINE);
26     log->xmlOpenTag("global_memory_bandwidth");
27     log->xmlAppendAttribs("unit", "gbps");
28 
29     cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
30     cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));
31     queue.enqueueWriteBuffer(inputBuf, CL_TRUE, 0, (numItems * sizeof(float)), arr);
32 
33     cl::Kernel kernel_v1_lo(prog, "global_bandwidth_v1_local_offset");
34     kernel_v1_lo.setArg(0, inputBuf), kernel_v1_lo.setArg(1, outputBuf);
35 
36     cl::Kernel kernel_v2_lo(prog, "global_bandwidth_v2_local_offset");
37     kernel_v2_lo.setArg(0, inputBuf), kernel_v2_lo.setArg(1, outputBuf);
38 
39     cl::Kernel kernel_v4_lo(prog, "global_bandwidth_v4_local_offset");
40     kernel_v4_lo.setArg(0, inputBuf), kernel_v4_lo.setArg(1, outputBuf);
41 
42     cl::Kernel kernel_v8_lo(prog, "global_bandwidth_v8_local_offset");
43     kernel_v8_lo.setArg(0, inputBuf), kernel_v8_lo.setArg(1, outputBuf);
44 
45     cl::Kernel kernel_v16_lo(prog, "global_bandwidth_v16_local_offset");
46     kernel_v16_lo.setArg(0, inputBuf), kernel_v16_lo.setArg(1, outputBuf);
47 
48     cl::Kernel kernel_v1_go(prog, "global_bandwidth_v1_global_offset");
49     kernel_v1_go.setArg(0, inputBuf), kernel_v1_go.setArg(1, outputBuf);
50 
51     cl::Kernel kernel_v2_go(prog, "global_bandwidth_v2_global_offset");
52     kernel_v2_go.setArg(0, inputBuf), kernel_v2_go.setArg(1, outputBuf);
53 
54     cl::Kernel kernel_v4_go(prog, "global_bandwidth_v4_global_offset");
55     kernel_v4_go.setArg(0, inputBuf), kernel_v4_go.setArg(1, outputBuf);
56 
57     cl::Kernel kernel_v8_go(prog, "global_bandwidth_v8_global_offset");
58     kernel_v8_go.setArg(0, inputBuf), kernel_v8_go.setArg(1, outputBuf);
59 
60     cl::Kernel kernel_v16_go(prog, "global_bandwidth_v16_global_offset");
61     kernel_v16_go.setArg(0, inputBuf), kernel_v16_go.setArg(1, outputBuf);
62 
63     localSize = devInfo.maxWGSize;
64 
65     ///////////////////////////////////////////////////////////////////////////
66     // Vector width 1
67     if (!forceTest || strcmp(specifiedTestName, "float") == 0)
68     {
69       log->print(TAB TAB TAB "float   : ");
70 
71       globalSize = numItems / FETCH_PER_WI;
72 
73       // Run 2 kind of bandwidth kernel
74       // lo -- local_size offset - subsequent fetches at local_size offset
75       // go -- global_size offset
76       timed_lo = run_kernel(queue, kernel_v1_lo, globalSize, localSize, iters);
77       timed_go = run_kernel(queue, kernel_v1_go, globalSize, localSize, iters);
78       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
79 
80       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
81 
82       log->print(gbps);
83       log->print(NEWLINE);
84       log->xmlRecord("float", gbps);
85     }
86     ///////////////////////////////////////////////////////////////////////////
87 
88     // Vector width 2
89     if (!forceTest || strcmp(specifiedTestName, "float2") == 0)
90     {
91       log->print(TAB TAB TAB "float2  : ");
92 
93       globalSize = (numItems / 2 / FETCH_PER_WI);
94 
95       timed_lo = run_kernel(queue, kernel_v2_lo, globalSize, localSize, iters);
96       timed_go = run_kernel(queue, kernel_v2_go, globalSize, localSize, iters);
97       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
98 
99       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
100 
101       log->print(gbps);
102       log->print(NEWLINE);
103       log->xmlRecord("float2", gbps);
104     }
105     ///////////////////////////////////////////////////////////////////////////
106 
107     // Vector width 4
108     if (!forceTest || strcmp(specifiedTestName, "float4") == 0)
109     {
110       log->print(TAB TAB TAB "float4  : ");
111 
112       globalSize = (numItems / 4 / FETCH_PER_WI);
113 
114       timed_lo = run_kernel(queue, kernel_v4_lo, globalSize, localSize, iters);
115       timed_go = run_kernel(queue, kernel_v4_go, globalSize, localSize, iters);
116       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
117 
118       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
119 
120       log->print(gbps);
121       log->print(NEWLINE);
122       log->xmlRecord("float4", gbps);
123     }
124     ///////////////////////////////////////////////////////////////////////////
125 
126     // Vector width 8
127     if (!forceTest || strcmp(specifiedTestName, "float8") == 0)
128     {
129       log->print(TAB TAB TAB "float8  : ");
130 
131       globalSize = (numItems / 8 / FETCH_PER_WI);
132 
133       timed_lo = run_kernel(queue, kernel_v8_lo, globalSize, localSize, iters);
134       timed_go = run_kernel(queue, kernel_v8_go, globalSize, localSize, iters);
135       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
136 
137       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
138 
139       log->print(gbps);
140       log->print(NEWLINE);
141       log->xmlRecord("float8", gbps);
142     }
143     ///////////////////////////////////////////////////////////////////////////
144 
145     // Vector width 16
146     if (!forceTest || strcmp(specifiedTestName, "float16") == 0)
147     {
148       log->print(TAB TAB TAB "float16 : ");
149       globalSize = (numItems / 16 / FETCH_PER_WI);
150 
151       timed_lo = run_kernel(queue, kernel_v16_lo, globalSize, localSize, iters);
152       timed_go = run_kernel(queue, kernel_v16_go, globalSize, localSize, iters);
153       timed = (timed_lo < timed_go) ? timed_lo : timed_go;
154 
155       gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
156 
157       log->print(gbps);
158       log->print(NEWLINE);
159       log->xmlRecord("float16", gbps);
160     }
161     ///////////////////////////////////////////////////////////////////////////
162     log->xmlCloseTag(); // global_memory_bandwidth
163 
164     if (arr)
165     {
166       delete[] arr;
167     }
168   }
169   catch (cl::Error &error)
170   {
171     stringstream ss;
172     ss << error.what() << " (" << error.err() << ")" NEWLINE
173        << TAB TAB TAB "Tests skipped" NEWLINE;
174     log->print(ss.str());
175 
176     if (arr)
177     {
178       delete[] arr;
179     }
180     return -1;
181   }
182 
183   return 0;
184 }
185