1*1cd03ba3SJeremy Kemp #include <clpeak.h>
2*1cd03ba3SJeremy Kemp
runComputeSP(cl::CommandQueue & queue,cl::Program & prog,device_info_t & devInfo)3*1cd03ba3SJeremy Kemp int clPeak::runComputeSP(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
4*1cd03ba3SJeremy Kemp {
5*1cd03ba3SJeremy Kemp float timed, gflops;
6*1cd03ba3SJeremy Kemp cl_uint workPerWI;
7*1cd03ba3SJeremy Kemp cl::NDRange globalSize, localSize;
8*1cd03ba3SJeremy Kemp cl_float A = 1.3f;
9*1cd03ba3SJeremy Kemp uint iters = devInfo.computeIters;
10*1cd03ba3SJeremy Kemp
11*1cd03ba3SJeremy Kemp if (!isComputeSP)
12*1cd03ba3SJeremy Kemp return 0;
13*1cd03ba3SJeremy Kemp
14*1cd03ba3SJeremy Kemp try
15*1cd03ba3SJeremy Kemp {
16*1cd03ba3SJeremy Kemp log->print(NEWLINE TAB TAB "Single-precision compute (GFLOPS)" NEWLINE);
17*1cd03ba3SJeremy Kemp log->xmlOpenTag("single_precision_compute");
18*1cd03ba3SJeremy Kemp log->xmlAppendAttribs("unit", "gflops");
19*1cd03ba3SJeremy Kemp
20*1cd03ba3SJeremy Kemp cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
21*1cd03ba3SJeremy Kemp
22*1cd03ba3SJeremy Kemp uint64_t globalWIs = (devInfo.numCUs) * (devInfo.computeWgsPerCU) * (devInfo.maxWGSize);
23*1cd03ba3SJeremy Kemp uint64_t t = std::min((globalWIs * sizeof(cl_float)), devInfo.maxAllocSize) / sizeof(cl_float);
24*1cd03ba3SJeremy Kemp globalWIs = roundToMultipleOf(t, devInfo.maxWGSize);
25*1cd03ba3SJeremy Kemp
26*1cd03ba3SJeremy Kemp cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (globalWIs * sizeof(cl_float)));
27*1cd03ba3SJeremy Kemp
28*1cd03ba3SJeremy Kemp globalSize = globalWIs;
29*1cd03ba3SJeremy Kemp localSize = devInfo.maxWGSize;
30*1cd03ba3SJeremy Kemp
31*1cd03ba3SJeremy Kemp cl::Kernel kernel_v1(prog, "compute_sp_v1");
32*1cd03ba3SJeremy Kemp kernel_v1.setArg(0, outputBuf), kernel_v1.setArg(1, A);
33*1cd03ba3SJeremy Kemp
34*1cd03ba3SJeremy Kemp cl::Kernel kernel_v2(prog, "compute_sp_v2");
35*1cd03ba3SJeremy Kemp kernel_v2.setArg(0, outputBuf), kernel_v2.setArg(1, A);
36*1cd03ba3SJeremy Kemp
37*1cd03ba3SJeremy Kemp cl::Kernel kernel_v4(prog, "compute_sp_v4");
38*1cd03ba3SJeremy Kemp kernel_v4.setArg(0, outputBuf), kernel_v4.setArg(1, A);
39*1cd03ba3SJeremy Kemp
40*1cd03ba3SJeremy Kemp cl::Kernel kernel_v8(prog, "compute_sp_v8");
41*1cd03ba3SJeremy Kemp kernel_v8.setArg(0, outputBuf), kernel_v8.setArg(1, A);
42*1cd03ba3SJeremy Kemp
43*1cd03ba3SJeremy Kemp cl::Kernel kernel_v16(prog, "compute_sp_v16");
44*1cd03ba3SJeremy Kemp kernel_v16.setArg(0, outputBuf), kernel_v16.setArg(1, A);
45*1cd03ba3SJeremy Kemp
46*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
47*1cd03ba3SJeremy Kemp // Vector width 1
48*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float") == 0)
49*1cd03ba3SJeremy Kemp {
50*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float : ");
51*1cd03ba3SJeremy Kemp
52*1cd03ba3SJeremy Kemp workPerWI = 4096; // Indicates flops executed per work-item
53*1cd03ba3SJeremy Kemp
54*1cd03ba3SJeremy Kemp timed = run_kernel(queue, kernel_v1, globalSize, localSize, iters);
55*1cd03ba3SJeremy Kemp
56*1cd03ba3SJeremy Kemp gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
57*1cd03ba3SJeremy Kemp
58*1cd03ba3SJeremy Kemp log->print(gflops);
59*1cd03ba3SJeremy Kemp log->print(NEWLINE);
60*1cd03ba3SJeremy Kemp log->xmlRecord("float", gflops);
61*1cd03ba3SJeremy Kemp }
62*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
63*1cd03ba3SJeremy Kemp
64*1cd03ba3SJeremy Kemp // Vector width 2
65*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float2") == 0)
66*1cd03ba3SJeremy Kemp {
67*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float2 : ");
68*1cd03ba3SJeremy Kemp
69*1cd03ba3SJeremy Kemp workPerWI = 4096;
70*1cd03ba3SJeremy Kemp
71*1cd03ba3SJeremy Kemp timed = run_kernel(queue, kernel_v2, globalSize, localSize, iters);
72*1cd03ba3SJeremy Kemp
73*1cd03ba3SJeremy Kemp gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
74*1cd03ba3SJeremy Kemp
75*1cd03ba3SJeremy Kemp log->print(gflops);
76*1cd03ba3SJeremy Kemp log->print(NEWLINE);
77*1cd03ba3SJeremy Kemp log->xmlRecord("float2", gflops);
78*1cd03ba3SJeremy Kemp }
79*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
80*1cd03ba3SJeremy Kemp
81*1cd03ba3SJeremy Kemp // Vector width 4
82*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float4") == 0)
83*1cd03ba3SJeremy Kemp {
84*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float4 : ");
85*1cd03ba3SJeremy Kemp
86*1cd03ba3SJeremy Kemp workPerWI = 4096;
87*1cd03ba3SJeremy Kemp
88*1cd03ba3SJeremy Kemp timed = run_kernel(queue, kernel_v4, globalSize, localSize, iters);
89*1cd03ba3SJeremy Kemp
90*1cd03ba3SJeremy Kemp gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
91*1cd03ba3SJeremy Kemp
92*1cd03ba3SJeremy Kemp log->print(gflops);
93*1cd03ba3SJeremy Kemp log->print(NEWLINE);
94*1cd03ba3SJeremy Kemp log->xmlRecord("float4", gflops);
95*1cd03ba3SJeremy Kemp }
96*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
97*1cd03ba3SJeremy Kemp
98*1cd03ba3SJeremy Kemp // Vector width 8
99*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float8") == 0)
100*1cd03ba3SJeremy Kemp {
101*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float8 : ");
102*1cd03ba3SJeremy Kemp
103*1cd03ba3SJeremy Kemp workPerWI = 4096;
104*1cd03ba3SJeremy Kemp
105*1cd03ba3SJeremy Kemp timed = run_kernel(queue, kernel_v8, globalSize, localSize, iters);
106*1cd03ba3SJeremy Kemp
107*1cd03ba3SJeremy Kemp gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
108*1cd03ba3SJeremy Kemp
109*1cd03ba3SJeremy Kemp log->print(gflops);
110*1cd03ba3SJeremy Kemp log->print(NEWLINE);
111*1cd03ba3SJeremy Kemp log->xmlRecord("float8", gflops);
112*1cd03ba3SJeremy Kemp }
113*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
114*1cd03ba3SJeremy Kemp
115*1cd03ba3SJeremy Kemp // Vector width 16
116*1cd03ba3SJeremy Kemp if (!forceTest || strcmp(specifiedTestName, "float16") == 0)
117*1cd03ba3SJeremy Kemp {
118*1cd03ba3SJeremy Kemp log->print(TAB TAB TAB "float16 : ");
119*1cd03ba3SJeremy Kemp
120*1cd03ba3SJeremy Kemp workPerWI = 4096;
121*1cd03ba3SJeremy Kemp
122*1cd03ba3SJeremy Kemp timed = run_kernel(queue, kernel_v16, globalSize, localSize, iters);
123*1cd03ba3SJeremy Kemp
124*1cd03ba3SJeremy Kemp gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
125*1cd03ba3SJeremy Kemp
126*1cd03ba3SJeremy Kemp log->print(gflops);
127*1cd03ba3SJeremy Kemp log->print(NEWLINE);
128*1cd03ba3SJeremy Kemp log->xmlRecord("float16", gflops);
129*1cd03ba3SJeremy Kemp }
130*1cd03ba3SJeremy Kemp ///////////////////////////////////////////////////////////////////////////
131*1cd03ba3SJeremy Kemp log->xmlCloseTag(); // single_precision_compute
132*1cd03ba3SJeremy Kemp }
133*1cd03ba3SJeremy Kemp catch (cl::Error &error)
134*1cd03ba3SJeremy Kemp {
135*1cd03ba3SJeremy Kemp stringstream ss;
136*1cd03ba3SJeremy Kemp ss << error.what() << " (" << error.err() << ")" NEWLINE
137*1cd03ba3SJeremy Kemp << TAB TAB TAB "Tests skipped" NEWLINE;
138*1cd03ba3SJeremy Kemp log->print(ss.str());
139*1cd03ba3SJeremy Kemp return -1;
140*1cd03ba3SJeremy Kemp }
141*1cd03ba3SJeremy Kemp
142*1cd03ba3SJeremy Kemp return 0;
143*1cd03ba3SJeremy Kemp }
144