1*1cd03ba3SJeremy Kemp #include <clpeak.h>
2*1cd03ba3SJeremy Kemp
// Number of float elements each work-item accounts for: the latency test
// sizes its buffers as maxWGSize * numCUs * FETCH_PER_WI elements and
// launches one work-item per FETCH_PER_WI elements.
3*1cd03ba3SJeremy Kemp #define FETCH_PER_WI 16
4*1cd03ba3SJeremy Kemp
runKernelLatency(cl::CommandQueue & queue,cl::Program & prog,device_info_t & devInfo)5*1cd03ba3SJeremy Kemp int clPeak::runKernelLatency(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
6*1cd03ba3SJeremy Kemp {
7*1cd03ba3SJeremy Kemp if (!isKernelLatency)
8*1cd03ba3SJeremy Kemp return 0;
9*1cd03ba3SJeremy Kemp
10*1cd03ba3SJeremy Kemp cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
11*1cd03ba3SJeremy Kemp cl_uint numItems = (devInfo.maxWGSize) * (devInfo.numCUs) * FETCH_PER_WI;
12*1cd03ba3SJeremy Kemp cl::NDRange globalSize = (numItems / FETCH_PER_WI);
13*1cd03ba3SJeremy Kemp cl::NDRange localSize = devInfo.maxWGSize;
14*1cd03ba3SJeremy Kemp uint iters = devInfo.kernelLatencyIters;
15*1cd03ba3SJeremy Kemp float latency;
16*1cd03ba3SJeremy Kemp
17*1cd03ba3SJeremy Kemp try
18*1cd03ba3SJeremy Kemp {
19*1cd03ba3SJeremy Kemp log->print(NEWLINE TAB TAB "Kernel launch latency : ");
20*1cd03ba3SJeremy Kemp log->xmlOpenTag("kernel_launch_latency");
21*1cd03ba3SJeremy Kemp log->xmlAppendAttribs("unit", "us");
22*1cd03ba3SJeremy Kemp
23*1cd03ba3SJeremy Kemp cl::Buffer inputBuf = cl::Buffer(ctx, CL_MEM_READ_ONLY, (numItems * sizeof(float)));
24*1cd03ba3SJeremy Kemp cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (numItems * sizeof(float)));
25*1cd03ba3SJeremy Kemp
26*1cd03ba3SJeremy Kemp cl::Kernel kernel_v1(prog, "global_bandwidth_v1_local_offset");
27*1cd03ba3SJeremy Kemp kernel_v1.setArg(0, inputBuf), kernel_v1.setArg(1, outputBuf);
28*1cd03ba3SJeremy Kemp
29*1cd03ba3SJeremy Kemp // Dummy calls
30*1cd03ba3SJeremy Kemp queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
31*1cd03ba3SJeremy Kemp queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize);
32*1cd03ba3SJeremy Kemp queue.finish();
33*1cd03ba3SJeremy Kemp
34*1cd03ba3SJeremy Kemp latency = 0;
35*1cd03ba3SJeremy Kemp for (uint i = 0; i < iters; i++)
36*1cd03ba3SJeremy Kemp {
37*1cd03ba3SJeremy Kemp cl::Event timeEvent;
38*1cd03ba3SJeremy Kemp queue.enqueueNDRangeKernel(kernel_v1, cl::NullRange, globalSize, localSize, NULL, &timeEvent);
39*1cd03ba3SJeremy Kemp queue.finish();
40*1cd03ba3SJeremy Kemp cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() / 1000;
41*1cd03ba3SJeremy Kemp cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
42*1cd03ba3SJeremy Kemp latency += (float)((int)end - (int)start);
43*1cd03ba3SJeremy Kemp }
44*1cd03ba3SJeremy Kemp latency /= static_cast<float>(iters);
45*1cd03ba3SJeremy Kemp
46*1cd03ba3SJeremy Kemp log->print(latency);
47*1cd03ba3SJeremy Kemp log->print(" us" NEWLINE);
48*1cd03ba3SJeremy Kemp log->xmlSetContent(latency);
49*1cd03ba3SJeremy Kemp log->xmlCloseTag();
50*1cd03ba3SJeremy Kemp }
51*1cd03ba3SJeremy Kemp catch (cl::Error &error)
52*1cd03ba3SJeremy Kemp {
53*1cd03ba3SJeremy Kemp stringstream ss;
54*1cd03ba3SJeremy Kemp ss << error.what() << " (" << error.err() << ")" NEWLINE
55*1cd03ba3SJeremy Kemp << TAB TAB TAB "Tests skipped" NEWLINE;
56*1cd03ba3SJeremy Kemp log->print(ss.str());
57*1cd03ba3SJeremy Kemp return -1;
58*1cd03ba3SJeremy Kemp }
59*1cd03ba3SJeremy Kemp
60*1cd03ba3SJeremy Kemp return 0;
61*1cd03ba3SJeremy Kemp }
62