xref: /aosp_15_r20/external/clpeak/src/compute_dp.cpp (revision 1cd03ba3888297bc945f2c84574e105e3ced3e34)
1 #include <clpeak.h>
2 
runComputeDP(cl::CommandQueue & queue,cl::Program & prog,device_info_t & devInfo)3 int clPeak::runComputeDP(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
4 {
5   float timed, gflops;
6   cl_uint workPerWI;
7   cl::NDRange globalSize, localSize;
8   cl_double A = 1.3f;
9   uint iters = devInfo.computeIters;
10 
11   if (!isComputeDP)
12     return 0;
13 
14   if (!devInfo.doubleSupported)
15   {
16     log->print(NEWLINE TAB TAB "No double precision support! Skipped" NEWLINE);
17     return 0;
18   }
19 
20   try
21   {
22     log->print(NEWLINE TAB TAB "Double-precision compute (GFLOPS)" NEWLINE);
23     log->xmlOpenTag("double_precision_compute");
24     log->xmlAppendAttribs("unit", "gflops");
25 
26     cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
27 
28     uint64_t globalWIs = (devInfo.numCUs) * (devInfo.computeDPWgsPerCU) * (devInfo.maxWGSize);
29     uint64_t t = std::min((globalWIs * sizeof(cl_double)), devInfo.maxAllocSize) / sizeof(cl_double);
30     globalWIs = roundToMultipleOf(t, devInfo.maxWGSize);
31 
32     cl::Buffer outputBuf = cl::Buffer(ctx, CL_MEM_WRITE_ONLY, (globalWIs * sizeof(cl_double)));
33 
34     globalSize = globalWIs;
35     localSize = devInfo.maxWGSize;
36 
37     cl::Kernel kernel_v1(prog, "compute_dp_v1");
38     kernel_v1.setArg(0, outputBuf), kernel_v1.setArg(1, A);
39 
40     cl::Kernel kernel_v2(prog, "compute_dp_v2");
41     kernel_v2.setArg(0, outputBuf), kernel_v2.setArg(1, A);
42 
43     cl::Kernel kernel_v4(prog, "compute_dp_v4");
44     kernel_v4.setArg(0, outputBuf), kernel_v4.setArg(1, A);
45 
46     cl::Kernel kernel_v8(prog, "compute_dp_v8");
47     kernel_v8.setArg(0, outputBuf), kernel_v8.setArg(1, A);
48 
49     cl::Kernel kernel_v16(prog, "compute_dp_v16");
50     kernel_v16.setArg(0, outputBuf), kernel_v16.setArg(1, A);
51 
52     ///////////////////////////////////////////////////////////////////////////
53     // Vector width 1
54     if (!forceTest || strcmp(specifiedTestName, "double") == 0)
55     {
56       log->print(TAB TAB TAB "double   : ");
57 
58       workPerWI = 4096; // Indicates flops executed per work-item
59 
60       timed = run_kernel(queue, kernel_v1, globalSize, localSize, iters);
61 
62       gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
63 
64       log->print(gflops);
65       log->print(NEWLINE);
66       log->xmlRecord("double", gflops);
67     }
68     ///////////////////////////////////////////////////////////////////////////
69 
70     // Vector width 2
71     if (!forceTest || strcmp(specifiedTestName, "double2") == 0)
72     {
73       log->print(TAB TAB TAB "double2  : ");
74 
75       workPerWI = 4096;
76 
77       timed = run_kernel(queue, kernel_v2, globalSize, localSize, iters);
78 
79       gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
80 
81       log->print(gflops);
82       log->print(NEWLINE);
83       log->xmlRecord("double2", gflops);
84     }
85     ///////////////////////////////////////////////////////////////////////////
86 
87     // Vector width 4
88     if (!forceTest || strcmp(specifiedTestName, "double4") == 0)
89     {
90       log->print(TAB TAB TAB "double4  : ");
91 
92       workPerWI = 4096;
93 
94       timed = run_kernel(queue, kernel_v4, globalSize, localSize, iters);
95 
96       gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
97 
98       log->print(gflops);
99       log->print(NEWLINE);
100       log->xmlRecord("double4", gflops);
101     }
102     ///////////////////////////////////////////////////////////////////////////
103 
104     // Vector width 8
105     if (!forceTest || strcmp(specifiedTestName, "double8") == 0)
106     {
107       log->print(TAB TAB TAB "double8  : ");
108       workPerWI = 4096;
109 
110       timed = run_kernel(queue, kernel_v8, globalSize, localSize, iters);
111 
112       gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
113 
114       log->print(gflops);
115       log->print(NEWLINE);
116       log->xmlRecord("double8", gflops);
117     }
118     ///////////////////////////////////////////////////////////////////////////
119 
120     // Vector width 16
121     if (!forceTest || strcmp(specifiedTestName, "double16") == 0)
122     {
123       log->print(TAB TAB TAB "double16 : ");
124 
125       workPerWI = 4096;
126 
127       timed = run_kernel(queue, kernel_v16, globalSize, localSize, iters);
128 
129       gflops = (static_cast<float>(globalWIs) * static_cast<float>(workPerWI)) / timed / 1e3f;
130 
131       log->print(gflops);
132       log->print(NEWLINE);
133       log->xmlRecord("double16", gflops);
134     }
135     ///////////////////////////////////////////////////////////////////////////
136     log->xmlCloseTag(); // double_precision_compute
137   }
138   catch (cl::Error &error)
139   {
140     stringstream ss;
141     ss << error.what() << " (" << error.err() << ")" NEWLINE
142        << TAB TAB TAB "Tests skipped" NEWLINE;
143     log->print(ss.str());
144     return -1;
145   }
146 
147   return 0;
148 }
149