xref: /aosp_15_r20/external/clpeak/src/kernel_latency.cpp (revision 1cd03ba3888297bc945f2c84574e105e3ced3e34)
1*1cd03ba3SJeremy Kemp #include <clpeak.h>
2*1cd03ba3SJeremy Kemp 
3*1cd03ba3SJeremy Kemp #define FETCH_PER_WI 16
4*1cd03ba3SJeremy Kemp 
runKernelLatency(cl::CommandQueue & queue,cl::Program & prog,device_info_t & devInfo)5*1cd03ba3SJeremy Kemp int clPeak::runKernelLatency(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
6*1cd03ba3SJeremy Kemp {
7*1cd03ba3SJeremy Kemp   if (!isKernelLatency)
8*1cd03ba3SJeremy Kemp     return 0;
9*1cd03ba3SJeremy Kemp 
10*1cd03ba3SJeremy Kemp   cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
11*1cd03ba3SJeremy Kemp   cl_uint numItems = (devInfo.maxWGSize) * (devInfo.numCUs) * FETCH_PER_WI;
12*1cd03ba3SJeremy Kemp   cl::NDRange globalSize = (numItems / FETCH_PER_WI);
13*1cd03ba3SJeremy Kemp   cl::NDRange localSize = devInfo.maxWGSize;
14*1cd03ba3SJeremy Kemp   uint iters = devInfo.kernelLatencyIters;
15*1cd03ba3SJeremy Kemp   float latency;
16*1cd03ba3SJeremy Kemp 
17*1cd03ba3SJeremy Kemp   try
18*1cd03ba3SJeremy Kemp   {
19*1cd03ba3SJeremy Kemp     log->print(NEWLINE TAB TAB "Kernel launch latency : ");
20*1cd03ba3SJeremy Kemp     log->xmlOpenTag("kernel_launch_latency");
21*1cd03ba3SJeremy Kemp     log->xmlAppendAttribs("unit", "us");
22*1cd03ba3SJeremy Kemp 
23*1cd03ba3SJeremy Kemp     cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
24*1cd03ba3SJeremy Kemp     cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));
25*1cd03ba3SJeremy Kemp 
26*1cd03ba3SJeremy Kemp     cl::Kernel kernel_v1(prog, "global_bandwidth_v1_local_offset");
27*1cd03ba3SJeremy Kemp     kernel_v1.setArg(0, inputBuf), kernel_v1.setArg(1, outputBuf);
28*1cd03ba3SJeremy Kemp 
29*1cd03ba3SJeremy Kemp     // Dummy calls
30*1cd03ba3SJeremy Kemp     queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
31*1cd03ba3SJeremy Kemp     queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
32*1cd03ba3SJeremy Kemp     queue.finish();
33*1cd03ba3SJeremy Kemp 
34*1cd03ba3SJeremy Kemp     latency = 0;
35*1cd03ba3SJeremy Kemp     for (uint i = 0; i < iters; i++)
36*1cd03ba3SJeremy Kemp     {
37*1cd03ba3SJeremy Kemp       cl::Event timeEvent;
38*1cd03ba3SJeremy Kemp       queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize, NULL, &timeEvent);
39*1cd03ba3SJeremy Kemp       queue.finish();
40*1cd03ba3SJeremy Kemp       cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() / 1000;
41*1cd03ba3SJeremy Kemp       cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
42*1cd03ba3SJeremy Kemp       latency += (float)((int)end - (int)start);
43*1cd03ba3SJeremy Kemp     }
44*1cd03ba3SJeremy Kemp     latency /= static_cast<float>(iters);
45*1cd03ba3SJeremy Kemp 
46*1cd03ba3SJeremy Kemp     log->print(latency);
47*1cd03ba3SJeremy Kemp     log->print(" us" NEWLINE);
48*1cd03ba3SJeremy Kemp     log->xmlSetContent(latency);
49*1cd03ba3SJeremy Kemp     log->xmlCloseTag();
50*1cd03ba3SJeremy Kemp   }
51*1cd03ba3SJeremy Kemp   catch (cl::Error &error)
52*1cd03ba3SJeremy Kemp   {
53*1cd03ba3SJeremy Kemp     stringstream ss;
54*1cd03ba3SJeremy Kemp     ss << error.what() << " (" << error.err() << ")" NEWLINE
55*1cd03ba3SJeremy Kemp        << TAB TAB TAB "Tests skipped" NEWLINE;
56*1cd03ba3SJeremy Kemp     log->print(ss.str());
57*1cd03ba3SJeremy Kemp     return -1;
58*1cd03ba3SJeremy Kemp   }
59*1cd03ba3SJeremy Kemp 
60*1cd03ba3SJeremy Kemp   return 0;
61*1cd03ba3SJeremy Kemp }
62