xref: /aosp_15_r20/external/deqp/modules/gles2/performance/es2pShaderOperatorTests.cpp (revision 35238bce31c2a825756842865a792f8cf7f89930)
1 /*-------------------------------------------------------------------------
2  * drawElements Quality Program OpenGL ES 2.0 Module
3  * -------------------------------------------------
4  *
5  * Copyright 2014 The Android Open Source Project
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *      http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  *//*!
20  * \file
21  * \brief Shader operator performance tests.
22  *//*--------------------------------------------------------------------*/
23 
24 #include "es2pShaderOperatorTests.hpp"
25 #include "glsCalibration.hpp"
26 #include "gluShaderUtil.hpp"
27 #include "gluShaderProgram.hpp"
28 #include "gluPixelTransfer.hpp"
29 #include "tcuTestLog.hpp"
30 #include "tcuRenderTarget.hpp"
31 #include "tcuCommandLine.hpp"
32 #include "tcuSurface.hpp"
33 #include "deStringUtil.hpp"
34 #include "deSharedPtr.hpp"
35 #include "deClock.h"
36 #include "deMath.h"
37 
38 #include "glwEnums.hpp"
39 #include "glwFunctions.hpp"
40 
41 #include <map>
42 #include <algorithm>
43 #include <limits>
44 #include <set>
45 
46 namespace deqp
47 {
48 namespace gles2
49 {
50 namespace Performance
51 {
52 
53 using namespace gls;
54 using namespace glu;
55 using de::SharedPtr;
56 using tcu::TestLog;
57 using tcu::Vec2;
58 using tcu::Vec4;
59 
60 using std::string;
61 using std::vector;
62 
//! Aborts the running case by throwing tcu::InternalError tagged with the current file and line.
//! Used when the collected timings cannot be turned into a sensible estimate.
#define MEASUREMENT_FAIL() \
    throw tcu::InternalError("Unable to get sensible measurements for estimation", DE_NULL, __FILE__, __LINE__)

// Number of measurements in OperatorPerformanceCase for each workload size, unless specified otherwise by a command line argument.
static const int DEFAULT_NUM_MEASUREMENTS_PER_WORKLOAD = 3;
// How many different workload sizes are used by OperatorPerformanceCase.
static const int NUM_WORKLOADS = 8;
// Maximum workload size that can be attempted. In a sensible case, this most likely won't be reached.
static const int MAX_WORKLOAD_SIZE = 1 << 29;

// BinaryOpCase-specific constants for shader generation.
static const int BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS = 4;
static const int BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT  = 2;
static const int BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT    = 4;

// FunctionCase-specific constants for shader generation.
static const int FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS = 4;

// Swizzle strings: each row holds a 1-, 2-, 3- and 4-component swizzle (column index = component count - 1).
// NOTE(review): the code consuming this table is outside the visible part of the file.
static const char *const s_swizzles[][4] = {{"x", "yx", "yzx", "wzyx"},
                                            {"y", "zy", "wyz", "xwzy"},
                                            {"z", "wy", "zxy", "yzwx"},
                                            {"w", "xw", "yxw", "zyxw"}};
85 
86 template <int N>
mean(const vector<tcu::Vector<float,N>> & data)87 static tcu::Vector<float, N> mean(const vector<tcu::Vector<float, N>> &data)
88 {
89     tcu::Vector<float, N> sum(0.0f);
90     for (int i = 0; i < (int)data.size(); i++)
91         sum += data[i];
92     return sum / tcu::Vector<float, N>((float)data.size());
93 }
94 
uniformNfv(const glw::Functions & gl,int n,int location,int count,const float * data)95 static void uniformNfv(const glw::Functions &gl, int n, int location, int count, const float *data)
96 {
97     switch (n)
98     {
99     case 1:
100         gl.uniform1fv(location, count, data);
101         break;
102     case 2:
103         gl.uniform2fv(location, count, data);
104         break;
105     case 3:
106         gl.uniform3fv(location, count, data);
107         break;
108     case 4:
109         gl.uniform4fv(location, count, data);
110         break;
111     default:
112         DE_ASSERT(false);
113     }
114 }
115 
uniformNiv(const glw::Functions & gl,int n,int location,int count,const int * data)116 static void uniformNiv(const glw::Functions &gl, int n, int location, int count, const int *data)
117 {
118     switch (n)
119     {
120     case 1:
121         gl.uniform1iv(location, count, data);
122         break;
123     case 2:
124         gl.uniform2iv(location, count, data);
125         break;
126     case 3:
127         gl.uniform3iv(location, count, data);
128         break;
129     case 4:
130         gl.uniform4iv(location, count, data);
131         break;
132     default:
133         DE_ASSERT(false);
134     }
135 }
136 
uniformMatrixNfv(const glw::Functions & gl,int n,int location,int count,const float * data)137 static void uniformMatrixNfv(const glw::Functions &gl, int n, int location, int count, const float *data)
138 {
139     switch (n)
140     {
141     case 2:
142         gl.uniformMatrix2fv(location, count, GL_FALSE, &data[0]);
143         break;
144     case 3:
145         gl.uniformMatrix3fv(location, count, GL_FALSE, &data[0]);
146         break;
147     case 4:
148         gl.uniformMatrix4fv(location, count, GL_FALSE, &data[0]);
149         break;
150     default:
151         DE_ASSERT(false);
152     }
153 }
154 
getDataTypeFloatOrVec(int size)155 static glu::DataType getDataTypeFloatOrVec(int size)
156 {
157     return size == 1 ? glu::TYPE_FLOAT : glu::getDataTypeFloatVec(size);
158 }
159 
getIterationCountOrDefault(const tcu::CommandLine & cmdLine,int def)160 static int getIterationCountOrDefault(const tcu::CommandLine &cmdLine, int def)
161 {
162     const int cmdLineVal = cmdLine.getTestIterationCount();
163     return cmdLineVal > 0 ? cmdLineVal : def;
164 }
165 
lineParamsString(const LineParameters & params)166 static string lineParamsString(const LineParameters &params)
167 {
168     return "y = " + de::toString(params.offset) + " + " + de::toString(params.coefficient) + "*x";
169 }
170 
171 namespace
172 {
173 
174 /*--------------------------------------------------------------------*//*!
175  * \brief Abstract class for measuring shader operator performance.
176  *
177  * This class draws multiple times with different workload sizes (set
178  * via a uniform, by subclass). Time for each frame is measured, and the
179  * slope of the workload size vs frame time data is estimated. This slope
180  * tells us the estimated increase in frame time caused by a workload
181  * increase of 1 unit (what 1 workload unit means is up to subclass).
182  *
183  * Generally, the shaders contain not just the operation we're interested
184  * in (e.g. addition) but also some other stuff (e.g. loop overhead). To
185  * eliminate this cost, we actually do the stuff described in the above
186  * paragraph with multiple programs (usually two), which contain different
187  * kinds of workload (e.g. different loop contents). Then we can (in
188  * theory) compute the cost of just one operation in a subclass-dependent
189  * manner.
190  *
191  * At this point, the result tells us the increase in frame time caused
192  * by the addition of one operation. Dividing this by the amount of
193  * draw calls in a frame, and further by the amount of vertices or
194  * fragments in a draw call, we get the time cost of one operation.
195  *
196  * In reality, there sometimes isn't just a trivial linear dependence
197  * between workload size and frame time. Instead, there tends to be some
198  * amount of initial "free" operations. That is, it may be that all
199  * workload sizes below some positive integer C yield the same frame time,
200  * and only workload sizes beyond C increase the frame time in a supposedly
201  * linear manner. Graphically, this means that there graph consists of two
202  * parts: a horizontal left part, and a linearly increasing right part; the
203  * right part starts where the left parts ends. The principal task of these
204  * tests is to look at the slope of the increasing right part. Additionally
205  * an estimate for the amount of initial free operations is calculated.
206  * Note that it is also normal to get graphs where the horizontal left part
207  * is of zero width, i.e. there are no free operations.
208  *//*--------------------------------------------------------------------*/
209 class OperatorPerformanceCase : public tcu::TestCase
210 {
211 public:
212     enum CaseType
213     {
214         CASETYPE_VERTEX = 0,
215         CASETYPE_FRAGMENT,
216 
217         CASETYPE_LAST
218     };
219 
220     struct InitialCalibration
221     {
222         int initialNumCalls;
InitialCalibrationdeqp::gles2::Performance::__anon145888f60111::OperatorPerformanceCase::InitialCalibration223         InitialCalibration(void) : initialNumCalls(1)
224         {
225         }
226     };
227 
228     typedef SharedPtr<InitialCalibration> InitialCalibrationStorage;
229 
230     OperatorPerformanceCase(tcu::TestContext &testCtx, glu::RenderContext &renderCtx, const char *name,
231                             const char *description, CaseType caseType, int numWorkloads,
232                             const InitialCalibrationStorage &initialCalibrationStorage);
233     ~OperatorPerformanceCase(void);
234 
235     void init(void);
236     void deinit(void);
237 
238     IterateResult iterate(void);
239 
240     struct AttribSpec
241     {
AttribSpecdeqp::gles2::Performance::__anon145888f60111::OperatorPerformanceCase::AttribSpec242         AttribSpec(const char *name_, const tcu::Vec4 &p00_, const tcu::Vec4 &p01_, const tcu::Vec4 &p10_,
243                    const tcu::Vec4 &p11_)
244             : name(name_)
245             , p00(p00_)
246             , p01(p01_)
247             , p10(p10_)
248             , p11(p11_)
249         {
250         }
251 
AttribSpecdeqp::gles2::Performance::__anon145888f60111::OperatorPerformanceCase::AttribSpec252         AttribSpec(void)
253         {
254         }
255 
256         std::string name;
257         tcu::Vec4 p00; //!< Bottom left.
258         tcu::Vec4 p01; //!< Bottom right.
259         tcu::Vec4 p10; //!< Top left.
260         tcu::Vec4 p11; //!< Top right.
261     };
262 
263 protected:
264     struct ProgramContext
265     {
266         string vertShaderSource;
267         string fragShaderSource;
268         vector<AttribSpec> attributes;
269 
270         string description;
271 
ProgramContextdeqp::gles2::Performance::__anon145888f60111::OperatorPerformanceCase::ProgramContext272         ProgramContext(void)
273         {
274         }
ProgramContextdeqp::gles2::Performance::__anon145888f60111::OperatorPerformanceCase::ProgramContext275         ProgramContext(const string &vs, const string &fs, const vector<AttribSpec> &attrs, const string &desc)
276             : vertShaderSource(vs)
277             , fragShaderSource(fs)
278             , attributes(attrs)
279             , description(desc)
280         {
281         }
282     };
283 
284     virtual vector<ProgramContext> generateProgramData(void) const = 0;
285     //! Sets program-specific uniforms that don't depend on the workload size.
286     virtual void setGeneralUniforms(uint32_t program) const = 0;
287     //! Sets the uniform(s) that specifies the workload size in the shader.
288     virtual void setWorkloadSizeUniform(uint32_t program, int workload) const = 0;
289     //! Computes the cost of a single operation, given the workload costs per program.
290     virtual float computeSingleOperationTime(const vector<float> &perProgramWorkloadCosts) const = 0;
291     //! Logs a human-readable description of what computeSingleOperationTime does.
292     virtual void logSingleOperationCalculationInfo(void) const = 0;
293 
294     glu::RenderContext &m_renderCtx;
295 
296     CaseType m_caseType;
297 
298 private:
299     enum State
300     {
301         STATE_CALIBRATING = 0, //!< Calibrate draw call count, using first program in m_programs, with workload size 1.
302         STATE_FIND_HIGH_WORKLOAD, //!< Find an appropriate lower bound for the highest workload size we intend to use (one with high-enough frame time compared to workload size 1) for each program.
303         STATE_MEASURING,          //!< Do actual measurements, for each program in m_programs.
304         STATE_REPORTING,          //!< Measurements are done; calculate results and log.
305         STATE_FINISHED,           //!< All done.
306 
307         STATE_LAST
308     };
309 
310     struct WorkloadRecord
311     {
312         int workloadSize;
313         vector<float> frameTimes; //!< In microseconds.
314 
WorkloadRecorddeqp::gles2::Performance::__anon145888f60111::OperatorPerformanceCase::WorkloadRecord315         WorkloadRecord(int workloadSize_) : workloadSize(workloadSize_)
316         {
317         }
operator <deqp::gles2::Performance::__anon145888f60111::OperatorPerformanceCase::WorkloadRecord318         bool operator<(const WorkloadRecord &other) const
319         {
320             return this->workloadSize < other.workloadSize;
321         }
addFrameTimedeqp::gles2::Performance::__anon145888f60111::OperatorPerformanceCase::WorkloadRecord322         void addFrameTime(float time)
323         {
324             frameTimes.push_back(time);
325         }
getMedianTimedeqp::gles2::Performance::__anon145888f60111::OperatorPerformanceCase::WorkloadRecord326         float getMedianTime(void) const
327         {
328             vector<float> times = frameTimes;
329             std::sort(times.begin(), times.end());
330             return times.size() % 2 == 0 ? (times[times.size() / 2 - 1] + times[times.size() / 2]) * 0.5f :
331                                            times[times.size() / 2];
332         }
333     };
334 
335     void prepareProgram(int progNdx); //!< Sets attributes and uniforms for m_programs[progNdx].
336     void prepareWorkload(
337         int progNdx,
338         int workload); //!< Calls setWorkloadSizeUniform and draws, in case the implementation does some draw-time compilation.
339     void prepareNextRound(void); //!< Increases workload and/or updates m_state.
340     void render(int numDrawCalls);
341     uint64_t renderAndMeasure(int numDrawCalls);
342     void adjustAndLogGridAndViewport(
343         void); //!< Log grid and viewport sizes, after possibly reducing them to reduce draw time.
344 
345     vector<Vec2> getWorkloadMedianDataPoints(
346         int progNdx) const; //!< [ Vec2(r.workloadSize, r.getMedianTime()) for r in m_workloadRecords[progNdx] ]
347 
348     const int m_numMeasurementsPerWorkload;
349     const int m_numWorkloads; //!< How many different workload sizes are used for measurement for each program.
350 
351     int m_workloadNdx; //!< Runs from 0 to m_numWorkloads-1.
352 
353     int m_workloadMeasurementNdx;
354     vector<vector<WorkloadRecord>>
355         m_workloadRecordsFindHigh; //!< The measurements done during STATE_FIND_HIGH_WORKLOAD.
356     vector<vector<WorkloadRecord>>
357         m_workloadRecords; //!< The measurements of each program in m_programs. Generated during STATE_MEASURING, into index specified by m_measureProgramNdx.
358 
359     State m_state;
360     int m_measureProgramNdx; //!< When m_state is STATE_FIND_HIGH_WORKLOAD or STATE_MEASURING, this tells which program in m_programs is being measured.
361 
362     vector<int>
363         m_highWorkloadSizes; //!< The first workload size encountered during STATE_FIND_HIGH_WORKLOAD that was determined suitable, for each program.
364 
365     TheilSenCalibrator m_calibrator;
366     InitialCalibrationStorage m_initialCalibrationStorage;
367 
368     int m_viewportWidth;
369     int m_viewportHeight;
370     int m_gridSizeX;
371     int m_gridSizeY;
372 
373     vector<ProgramContext> m_programData;
374     vector<SharedPtr<ShaderProgram>> m_programs;
375 
376     std::vector<uint32_t> m_attribBuffers;
377 };
378 
//! Linearly interpolates inside a triangle: v0 is the value at (0,0), v2 the value at (1,0)
//! and v1 the value at (0,1).
static inline float triangleInterpolate(float v0, float v1, float v2, float x, float y)
{
    const float alongX = (v2 - v0) * x;
    const float alongY = (v1 - v0) * y;
    return v0 + alongX + alongY;
}
383 
triQuadInterpolate(float x,float y,const tcu::Vec4 & quad)384 static inline float triQuadInterpolate(float x, float y, const tcu::Vec4 &quad)
385 {
386     // \note Top left fill rule.
387     if (x + y < 1.0f)
388         return triangleInterpolate(quad.x(), quad.y(), quad.z(), x, y);
389     else
390         return triangleInterpolate(quad.w(), quad.z(), quad.y(), 1.0f - x, 1.0f - y);
391 }
392 
//! Number of vertices needed for a grid where every cell is drawn as two independent triangles.
static inline int getNumVertices(int gridSizeX, int gridSizeY)
{
    const int numCells = gridSizeX * gridSizeY;
    return numCells * 2 * 3; // Two triangles per cell, three vertices per triangle.
}
397 
generateVertices(std::vector<float> & dst,int gridSizeX,int gridSizeY,const OperatorPerformanceCase::AttribSpec & spec)398 static void generateVertices(std::vector<float> &dst, int gridSizeX, int gridSizeY,
399                              const OperatorPerformanceCase::AttribSpec &spec)
400 {
401     const int numComponents = 4;
402 
403     DE_ASSERT(gridSizeX >= 1 && gridSizeY >= 1);
404     dst.resize(getNumVertices(gridSizeX, gridSizeY) * numComponents);
405 
406     {
407         int dstNdx = 0;
408 
409         for (int baseY = 0; baseY < gridSizeY; baseY++)
410             for (int baseX = 0; baseX < gridSizeX; baseX++)
411             {
412                 const float xf0 = (float)(baseX + 0) / (float)gridSizeX;
413                 const float yf0 = (float)(baseY + 0) / (float)gridSizeY;
414                 const float xf1 = (float)(baseX + 1) / (float)gridSizeX;
415                 const float yf1 = (float)(baseY + 1) / (float)gridSizeY;
416 
417 #define ADD_VERTEX(XF, YF)                                    \
418     for (int compNdx = 0; compNdx < numComponents; compNdx++) \
419     dst[dstNdx++] = triQuadInterpolate(                       \
420         (XF), (YF), tcu::Vec4(spec.p00[compNdx], spec.p01[compNdx], spec.p10[compNdx], spec.p11[compNdx]))
421 
422                 ADD_VERTEX(xf0, yf0);
423                 ADD_VERTEX(xf1, yf0);
424                 ADD_VERTEX(xf0, yf1);
425 
426                 ADD_VERTEX(xf1, yf0);
427                 ADD_VERTEX(xf1, yf1);
428                 ADD_VERTEX(xf0, yf1);
429 
430 #undef ADD_VERTEX
431             }
432     }
433 }
434 
intersectionX(const gls::LineParameters & a,const gls::LineParameters & b)435 static float intersectionX(const gls::LineParameters &a, const gls::LineParameters &b)
436 {
437     return (a.offset - b.offset) / (b.coefficient - a.coefficient);
438 }
439 
numDistinctX(const vector<Vec2> & data)440 static int numDistinctX(const vector<Vec2> &data)
441 {
442     std::set<float> xs;
443     for (int i = 0; i < (int)data.size(); i++)
444         xs.insert(data[i].x());
445     return (int)xs.size();
446 }
447 
simpleLinearRegression(const vector<Vec2> & data)448 static gls::LineParameters simpleLinearRegression(const vector<Vec2> &data)
449 {
450     const Vec2 mid = mean(data);
451 
452     float slopeNumerator   = 0.0f;
453     float slopeDenominator = 0.0f;
454 
455     for (int i = 0; i < (int)data.size(); i++)
456     {
457         const Vec2 diff = data[i] - mid;
458 
459         slopeNumerator += diff.x() * diff.y();
460         slopeDenominator += diff.x() * diff.x();
461     }
462 
463     const float slope  = slopeNumerator / slopeDenominator;
464     const float offset = mid.y() - slope * mid.x();
465 
466     return gls::LineParameters(offset, slope);
467 }
468 
simpleLinearRegressionError(const vector<Vec2> & data)469 static float simpleLinearRegressionError(const vector<Vec2> &data)
470 {
471     if (numDistinctX(data) <= 2)
472         return 0.0f;
473     else
474     {
475         const gls::LineParameters estimator = simpleLinearRegression(data);
476         float error                         = 0.0f;
477 
478         for (int i = 0; i < (int)data.size(); i++)
479         {
480             const float estY = estimator.offset + estimator.coefficient * data[i].x();
481             const float diff = estY - data[i].y();
482             error += diff * diff;
483         }
484 
485         return error / (float)data.size();
486     }
487 }
488 
verticalVariance(const vector<Vec2> & data)489 static float verticalVariance(const vector<Vec2> &data)
490 {
491     if (numDistinctX(data) <= 2)
492         return 0.0f;
493     else
494     {
495         const float meanY = mean(data).y();
496         float error       = 0.0f;
497 
498         for (int i = 0; i < (int)data.size(); i++)
499         {
500             const float diff = meanY - data[i].y();
501             error += diff * diff;
502         }
503 
504         return error / (float)data.size();
505     }
506 }
507 
508 /*--------------------------------------------------------------------*//*!
509  * \brief Find the x coord that divides the input data into two slopes.
510  *
511  * The operator performance measurements tend to produce results where
512  * we get small operation counts "for free" (e.g. because the operations
513  * are performed during some memory transfer overhead or something),
514  * resulting in a curve with two parts: an initial horizontal line segment,
515  * and a rising line.
516  *
517  * This function finds the x coordinate that divides the input data into
518  * two parts such that the sum of the mean square errors for the
519  * least-squares estimated lines for the two parts is minimized, under the
520  * additional condition that the left line is horizontal.
521  *
522  * This function returns a number X s.t. { pt | pt is in data, pt.x >= X }
523  * is the right line, and the rest of data is the left line.
524  *//*--------------------------------------------------------------------*/
findSlopePivotX(const vector<Vec2> & data)525 static float findSlopePivotX(const vector<Vec2> &data)
526 {
527     std::set<float> xCoords;
528     for (int i = 0; i < (int)data.size(); i++)
529         xCoords.insert(data[i].x());
530 
531     float lowestError = std::numeric_limits<float>::infinity();
532     float bestPivotX  = -std::numeric_limits<float>::infinity();
533 
534     for (std::set<float>::const_iterator pivotX = xCoords.begin(); pivotX != xCoords.end(); ++pivotX)
535     {
536         vector<Vec2> leftData;
537         vector<Vec2> rightData;
538         for (int i = 0; i < (int)data.size(); i++)
539         {
540             if (data[i].x() < *pivotX)
541                 leftData.push_back(data[i]);
542             else
543                 rightData.push_back(data[i]);
544         }
545 
546         if (numDistinctX(rightData) < 3) // We don't trust the right data if there's too little of it.
547             break;
548 
549         {
550             const float totalError = verticalVariance(leftData) + simpleLinearRegressionError(rightData);
551 
552             if (totalError < lowestError)
553             {
554                 lowestError = totalError;
555                 bestPivotX  = *pivotX;
556             }
557         }
558     }
559 
560     DE_ASSERT(lowestError < std::numeric_limits<float>::infinity());
561 
562     return bestPivotX;
563 }
564 
565 struct SegmentedEstimator
566 {
567     float pivotX; //!< Value returned by findSlopePivotX, or -infinity if only single line.
568     gls::LineParameters left;
569     gls::LineParameters right;
SegmentedEstimatordeqp::gles2::Performance::__anon145888f60111::SegmentedEstimator570     SegmentedEstimator(const gls::LineParameters &l, const gls::LineParameters &r, float pivotX_)
571         : pivotX(pivotX_)
572         , left(l)
573         , right(r)
574     {
575     }
576 };
577 
578 /*--------------------------------------------------------------------*//*!
579  * \brief Compute line estimators for (potentially) two-segment data.
580  *
581  * Splits the given data into left and right parts (using findSlopePivotX)
582  * and returns the line estimates for them.
583  *
584  * Sometimes, however (especially in fragment shader cases) the data is
585  * in fact not segmented, but a straight line. This function attempts to
586  * detect if this the case, and if so, sets left.offset = right.offset and
587  * left.slope = 0, meaning essentially that the initial "flat" part of the
588  * data has zero width.
589  *//*--------------------------------------------------------------------*/
computeSegmentedEstimator(const vector<Vec2> & data)590 static SegmentedEstimator computeSegmentedEstimator(const vector<Vec2> &data)
591 {
592     const float pivotX = findSlopePivotX(data);
593     vector<Vec2> leftData;
594     vector<Vec2> rightData;
595 
596     for (int i = 0; i < (int)data.size(); i++)
597     {
598         if (data[i].x() < pivotX)
599             leftData.push_back(data[i]);
600         else
601             rightData.push_back(data[i]);
602     }
603 
604     {
605         const gls::LineParameters leftLine  = gls::theilSenLinearRegression(leftData);
606         const gls::LineParameters rightLine = gls::theilSenLinearRegression(rightData);
607 
608         if (numDistinctX(leftData) < 2 || leftLine.coefficient > rightLine.coefficient * 0.5f)
609         {
610             // Left data doesn't seem credible; assume the data is just a single line.
611             const gls::LineParameters entireLine = gls::theilSenLinearRegression(data);
612             return SegmentedEstimator(gls::LineParameters(entireLine.offset, 0.0f), entireLine,
613                                       -std::numeric_limits<float>::infinity());
614         }
615         else
616             return SegmentedEstimator(leftLine, rightLine, pivotX);
617     }
618 }
619 
OperatorPerformanceCase(tcu::TestContext & testCtx,glu::RenderContext & renderCtx,const char * name,const char * description,CaseType caseType,int numWorkloads,const InitialCalibrationStorage & initialCalibrationStorage)620 OperatorPerformanceCase::OperatorPerformanceCase(tcu::TestContext &testCtx, glu::RenderContext &renderCtx,
621                                                  const char *name, const char *description, CaseType caseType,
622                                                  int numWorkloads,
623                                                  const InitialCalibrationStorage &initialCalibrationStorage)
624     : tcu::TestCase(testCtx, tcu::NODETYPE_PERFORMANCE, name, description)
625     , m_renderCtx(renderCtx)
626     , m_caseType(caseType)
627     , m_numMeasurementsPerWorkload(
628           getIterationCountOrDefault(m_testCtx.getCommandLine(), DEFAULT_NUM_MEASUREMENTS_PER_WORKLOAD))
629     , m_numWorkloads(numWorkloads)
630     , m_workloadNdx(-1)
631     , m_workloadMeasurementNdx(-1)
632     , m_state(STATE_LAST)
633     , m_measureProgramNdx(-1)
634     , m_initialCalibrationStorage(initialCalibrationStorage)
635     , m_viewportWidth(caseType == CASETYPE_VERTEX ? 32 : renderCtx.getRenderTarget().getWidth())
636     , m_viewportHeight(caseType == CASETYPE_VERTEX ? 32 : renderCtx.getRenderTarget().getHeight())
637     , m_gridSizeX(caseType == CASETYPE_FRAGMENT ? 1 : 100)
638     , m_gridSizeY(caseType == CASETYPE_FRAGMENT ? 1 : 100)
639 {
640     DE_ASSERT(m_numWorkloads > 0);
641 }
642 
~OperatorPerformanceCase(void)643 OperatorPerformanceCase::~OperatorPerformanceCase(void)
644 {
645     if (!m_attribBuffers.empty())
646     {
647         m_renderCtx.getFunctions().deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
648         m_attribBuffers.clear();
649     }
650 }
651 
logRenderTargetInfo(TestLog & log,const tcu::RenderTarget & renderTarget)652 static void logRenderTargetInfo(TestLog &log, const tcu::RenderTarget &renderTarget)
653 {
654     log << TestLog::Section("RenderTarget", "Render target") << TestLog::Message << "size: " << renderTarget.getWidth()
655         << "x" << renderTarget.getHeight() << TestLog::EndMessage << TestLog::Message << "bits:"
656         << " R" << renderTarget.getPixelFormat().redBits << " G" << renderTarget.getPixelFormat().greenBits << " B"
657         << renderTarget.getPixelFormat().blueBits << " A" << renderTarget.getPixelFormat().alphaBits << " D"
658         << renderTarget.getDepthBits() << " S" << renderTarget.getStencilBits() << TestLog::EndMessage;
659 
660     if (renderTarget.getNumSamples() != 0)
661         log << TestLog::Message << renderTarget.getNumSamples() << "x MSAA" << TestLog::EndMessage;
662     else
663         log << TestLog::Message << "No MSAA" << TestLog::EndMessage;
664 
665     log << TestLog::EndSection;
666 }
667 
getWorkloadMedianDataPoints(int progNdx) const668 vector<Vec2> OperatorPerformanceCase::getWorkloadMedianDataPoints(int progNdx) const
669 {
670     const vector<WorkloadRecord> &records = m_workloadRecords[progNdx];
671     vector<Vec2> result;
672 
673     for (int i = 0; i < (int)records.size(); i++)
674         result.push_back(Vec2((float)records[i].workloadSize, records[i].getMedianTime()));
675 
676     return result;
677 }
678 
prepareProgram(int progNdx)679 void OperatorPerformanceCase::prepareProgram(int progNdx)
680 {
681     DE_ASSERT(progNdx < (int)m_programs.size());
682     DE_ASSERT(m_programData.size() == m_programs.size());
683 
684     const glw::Functions &gl     = m_renderCtx.getFunctions();
685     const ShaderProgram &program = *m_programs[progNdx];
686 
687     vector<AttribSpec> attributes = m_programData[progNdx].attributes;
688 
689     attributes.push_back(AttribSpec("a_position", Vec4(-1.0f, -1.0f, 0.0f, 1.0f), Vec4(1.0f, -1.0f, 0.0f, 1.0f),
690                                     Vec4(-1.0f, 1.0f, 0.0f, 1.0f), Vec4(1.0f, 1.0f, 0.0f, 1.0f)));
691 
692     DE_ASSERT(program.isOk());
693 
694     // Generate vertices.
695     if (!m_attribBuffers.empty())
696         gl.deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
697     m_attribBuffers.resize(attributes.size(), 0);
698     gl.genBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
699     GLU_EXPECT_NO_ERROR(gl.getError(), "glGenBuffers()");
700 
701     for (int attribNdx = 0; attribNdx < (int)attributes.size(); attribNdx++)
702     {
703         std::vector<float> vertices;
704         generateVertices(vertices, m_gridSizeX, m_gridSizeY, attributes[attribNdx]);
705 
706         gl.bindBuffer(GL_ARRAY_BUFFER, m_attribBuffers[attribNdx]);
707         gl.bufferData(GL_ARRAY_BUFFER, (glw::GLsizeiptr)(vertices.size() * sizeof(float)), &vertices[0],
708                       GL_STATIC_DRAW);
709         GLU_EXPECT_NO_ERROR(gl.getError(), "Upload buffer data");
710     }
711 
712     // Setup attribute bindings.
713     for (int attribNdx = 0; attribNdx < (int)attributes.size(); attribNdx++)
714     {
715         int location = gl.getAttribLocation(program.getProgram(), attributes[attribNdx].name.c_str());
716 
717         if (location >= 0)
718         {
719             gl.enableVertexAttribArray(location);
720             gl.bindBuffer(GL_ARRAY_BUFFER, m_attribBuffers[attribNdx]);
721             gl.vertexAttribPointer(location, 4, GL_FLOAT, GL_FALSE, 0, DE_NULL);
722         }
723     }
724     GLU_EXPECT_NO_ERROR(gl.getError(), "Setup vertex input state");
725 
726     gl.useProgram(program.getProgram());
727     setGeneralUniforms(program.getProgram());
728     gl.viewport(0, 0, m_viewportWidth, m_viewportHeight);
729 }
730 
prepareWorkload(int progNdx,int workload)731 void OperatorPerformanceCase::prepareWorkload(int progNdx, int workload)
732 {
733     setWorkloadSizeUniform(m_programs[progNdx]->getProgram(), workload);
734     render(m_calibrator.getCallCount());
735 }
736 
prepareNextRound(void)737 void OperatorPerformanceCase::prepareNextRound(void)
738 {
739     DE_ASSERT(m_state == STATE_CALIBRATING || m_state == STATE_FIND_HIGH_WORKLOAD || m_state == STATE_MEASURING);
740 
741     TestLog &log = m_testCtx.getLog();
742 
743     if (m_state == STATE_CALIBRATING && m_calibrator.getState() == TheilSenCalibrator::STATE_FINISHED)
744     {
745         m_measureProgramNdx = 0;
746         m_state             = STATE_FIND_HIGH_WORKLOAD;
747     }
748 
749     if (m_state == STATE_CALIBRATING)
750         prepareWorkload(0, 1);
751     else if (m_state == STATE_FIND_HIGH_WORKLOAD)
752     {
753         vector<WorkloadRecord> &records = m_workloadRecordsFindHigh[m_measureProgramNdx];
754 
755         if (records.empty() || records.back().getMedianTime() < 2.0f * records[0].getMedianTime())
756         {
757             int workloadSize;
758 
759             if (records.empty())
760                 workloadSize = 1;
761             else
762             {
763                 workloadSize = records.back().workloadSize * 2;
764 
765                 if (workloadSize > MAX_WORKLOAD_SIZE)
766                 {
767                     log << TestLog::Message << "Even workload size " << records.back().workloadSize
768                         << " doesn't give high enough frame time for program " << m_measureProgramNdx
769                         << ". Can't get sensible result." << TestLog::EndMessage;
770                     MEASUREMENT_FAIL();
771                 }
772             }
773 
774             records.push_back(WorkloadRecord(workloadSize));
775             prepareWorkload(0, workloadSize);
776             m_workloadMeasurementNdx = 0;
777         }
778         else
779         {
780             m_highWorkloadSizes[m_measureProgramNdx] = records.back().workloadSize;
781             m_measureProgramNdx++;
782 
783             if (m_measureProgramNdx >= (int)m_programs.size())
784             {
785                 m_state             = STATE_MEASURING;
786                 m_workloadNdx       = -1;
787                 m_measureProgramNdx = 0;
788             }
789 
790             prepareProgram(m_measureProgramNdx);
791             prepareNextRound();
792         }
793     }
794     else
795     {
796         m_workloadNdx++;
797 
798         if (m_workloadNdx < m_numWorkloads)
799         {
800             DE_ASSERT(m_numWorkloads > 1);
801             const int highWorkload = m_highWorkloadSizes[m_measureProgramNdx];
802             const int workload     = highWorkload > m_numWorkloads ?
803                                          1 + m_workloadNdx * (highWorkload - 1) / (m_numWorkloads - 1) :
804                                          1 + m_workloadNdx;
805 
806             prepareWorkload(m_measureProgramNdx, workload);
807 
808             m_workloadMeasurementNdx = 0;
809 
810             m_workloadRecords[m_measureProgramNdx].push_back(WorkloadRecord(workload));
811         }
812         else
813         {
814             m_measureProgramNdx++;
815 
816             if (m_measureProgramNdx < (int)m_programs.size())
817             {
818                 m_workloadNdx            = -1;
819                 m_workloadMeasurementNdx = 0;
820                 prepareProgram(m_measureProgramNdx);
821                 prepareNextRound();
822             }
823             else
824                 m_state = STATE_REPORTING;
825         }
826     }
827 }
828 
init(void)829 void OperatorPerformanceCase::init(void)
830 {
831     TestLog &log             = m_testCtx.getLog();
832     const glw::Functions &gl = m_renderCtx.getFunctions();
833 
834     // Validate that we have sane grid and viewport setup.
835     DE_ASSERT(de::inBounds(m_gridSizeX, 1, 256) && de::inBounds(m_gridSizeY, 1, 256));
836     TCU_CHECK(de::inRange(m_viewportWidth, 1, m_renderCtx.getRenderTarget().getWidth()) &&
837               de::inRange(m_viewportHeight, 1, m_renderCtx.getRenderTarget().getHeight()));
838 
839     logRenderTargetInfo(log, m_renderCtx.getRenderTarget());
840 
841     log << TestLog::Message << "Using additive blending." << TestLog::EndMessage;
842     gl.enable(GL_BLEND);
843     gl.blendEquation(GL_FUNC_ADD);
844     gl.blendFunc(GL_ONE, GL_ONE);
845 
846     // Generate programs.
847     DE_ASSERT(m_programs.empty());
848     m_programData = generateProgramData();
849     DE_ASSERT(!m_programData.empty());
850 
851     for (int progNdx = 0; progNdx < (int)m_programData.size(); progNdx++)
852     {
853         const string &vert = m_programData[progNdx].vertShaderSource;
854         const string &frag = m_programData[progNdx].fragShaderSource;
855 
856         m_programs.push_back(
857             SharedPtr<ShaderProgram>(new ShaderProgram(m_renderCtx, glu::makeVtxFragSources(vert, frag))));
858 
859         if (!m_programs.back()->isOk())
860         {
861             log << *m_programs.back();
862             TCU_FAIL("Compile failed");
863         }
864     }
865 
866     // Log all programs.
867     for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
868         log << TestLog::Section("Program" + de::toString(progNdx), "Program " + de::toString(progNdx))
869             << TestLog::Message << m_programData[progNdx].description << TestLog::EndMessage << *m_programs[progNdx]
870             << TestLog::EndSection;
871 
872     m_highWorkloadSizes.resize(m_programData.size());
873     m_workloadRecordsFindHigh.resize(m_programData.size());
874     m_workloadRecords.resize(m_programData.size());
875 
876     m_calibrator.clear(
877         CalibratorParameters(m_initialCalibrationStorage->initialNumCalls, 10 /* calibrate iteration frames */,
878                              2000.0f /* calibrate iteration shortcut threshold (ms) */,
879                              16 /* max calibrate iterations */, 1000.0f / 30.0f /* frame time (ms) */,
880                              1000.0f / 60.0f /* frame time cap (ms) */, 1000.0f /* target measure duration (ms) */));
881     m_state = STATE_CALIBRATING;
882 
883     prepareProgram(0);
884     prepareNextRound();
885 }
886 
deinit(void)887 void OperatorPerformanceCase::deinit(void)
888 {
889     if (!m_attribBuffers.empty())
890     {
891         m_renderCtx.getFunctions().deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
892         m_attribBuffers.clear();
893     }
894 
895     m_programs.clear();
896 }
897 
render(int numDrawCalls)898 void OperatorPerformanceCase::render(int numDrawCalls)
899 {
900     const glw::Functions &gl = m_renderCtx.getFunctions();
901     const int numVertices    = getNumVertices(m_gridSizeX, m_gridSizeY);
902 
903     for (int callNdx = 0; callNdx < numDrawCalls; callNdx++)
904         gl.drawArrays(GL_TRIANGLES, 0, numVertices);
905 
906     glu::readPixels(m_renderCtx, 0, 0,
907                     tcu::Surface(1, 1).getAccess()); // \note Serves as a more reliable replacement for glFinish().
908 }
909 
renderAndMeasure(int numDrawCalls)910 uint64_t OperatorPerformanceCase::renderAndMeasure(int numDrawCalls)
911 {
912     const uint64_t startTime = deGetMicroseconds();
913     render(numDrawCalls);
914     return deGetMicroseconds() - startTime;
915 }
916 
adjustAndLogGridAndViewport(void)917 void OperatorPerformanceCase::adjustAndLogGridAndViewport(void)
918 {
919     TestLog &log = m_testCtx.getLog();
920 
921     // If call count is just 1, and the target frame time still wasn't reached, reduce grid or viewport size.
922     if (m_calibrator.getCallCount() == 1)
923     {
924         const gls::MeasureState &calibratorMeasure = m_calibrator.getMeasureState();
925         const float drawCallTime = (float)calibratorMeasure.getTotalTime() / (float)calibratorMeasure.frameTimes.size();
926         const float targetDrawCallTime = m_calibrator.getParameters().targetFrameTimeUs;
927         const float targetRatio        = targetDrawCallTime / drawCallTime;
928 
929         if (targetRatio < 0.95f)
930         {
931             // Reduce grid or viewport size assuming draw call time scales proportionally.
932             if (m_caseType == CASETYPE_VERTEX)
933             {
934                 const float targetRatioSqrt = deFloatSqrt(targetRatio);
935                 m_gridSizeX                 = (int)(targetRatioSqrt * (float)m_gridSizeX);
936                 m_gridSizeY                 = (int)(targetRatioSqrt * (float)m_gridSizeY);
937                 TCU_CHECK_MSG(m_gridSizeX >= 1 && m_gridSizeY >= 1,
938                               "Can't decrease grid size enough to achieve low-enough draw times");
939                 log << TestLog::Message
940                     << "Note: triangle grid size reduced from original; it's now smaller than during calibration."
941                     << TestLog::EndMessage;
942             }
943             else
944             {
945                 const float targetRatioSqrt = deFloatSqrt(targetRatio);
946                 m_viewportWidth             = (int)(targetRatioSqrt * (float)m_viewportWidth);
947                 m_viewportHeight            = (int)(targetRatioSqrt * (float)m_viewportHeight);
948                 TCU_CHECK_MSG(m_viewportWidth >= 1 && m_viewportHeight >= 1,
949                               "Can't decrease viewport size enough to achieve low-enough draw times");
950                 log << TestLog::Message
951                     << "Note: viewport size reduced from original; it's now smaller than during calibration."
952                     << TestLog::EndMessage;
953             }
954         }
955     }
956 
957     prepareProgram(0);
958 
959     // Log grid and viewport sizes.
960     log << TestLog::Message << "Grid size: " << m_gridSizeX << "x" << m_gridSizeY << TestLog::EndMessage;
961     log << TestLog::Message << "Viewport: " << m_viewportWidth << "x" << m_viewportHeight << TestLog::EndMessage;
962 }
963 
iterate(void)964 OperatorPerformanceCase::IterateResult OperatorPerformanceCase::iterate(void)
965 {
966     const TheilSenCalibrator::State calibratorState = m_calibrator.getState();
967 
968     if (calibratorState != TheilSenCalibrator::STATE_FINISHED)
969     {
970         if (calibratorState == TheilSenCalibrator::STATE_RECOMPUTE_PARAMS)
971             m_calibrator.recomputeParameters();
972         else if (calibratorState == TheilSenCalibrator::STATE_MEASURE)
973             m_calibrator.recordIteration(renderAndMeasure(m_calibrator.getCallCount()));
974         else
975             DE_ASSERT(false);
976 
977         if (m_calibrator.getState() == TheilSenCalibrator::STATE_FINISHED)
978         {
979             logCalibrationInfo(m_testCtx.getLog(), m_calibrator);
980             adjustAndLogGridAndViewport();
981             prepareNextRound();
982             m_initialCalibrationStorage->initialNumCalls = m_calibrator.getCallCount();
983         }
984     }
985     else if (m_state == STATE_FIND_HIGH_WORKLOAD || m_state == STATE_MEASURING)
986     {
987         if (m_workloadMeasurementNdx < m_numMeasurementsPerWorkload)
988         {
989             vector<WorkloadRecord> &records = m_state == STATE_FIND_HIGH_WORKLOAD ?
990                                                   m_workloadRecordsFindHigh[m_measureProgramNdx] :
991                                                   m_workloadRecords[m_measureProgramNdx];
992             records.back().addFrameTime((float)renderAndMeasure(m_calibrator.getCallCount()));
993             m_workloadMeasurementNdx++;
994         }
995         else
996             prepareNextRound();
997     }
998     else
999     {
1000         DE_ASSERT(m_state == STATE_REPORTING);
1001 
1002         TestLog &log            = m_testCtx.getLog();
1003         const int drawCallCount = m_calibrator.getCallCount();
1004 
1005         {
1006             // Compute per-program estimators for measurements.
1007             vector<SegmentedEstimator> estimators;
1008             for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
1009                 estimators.push_back(computeSegmentedEstimator(getWorkloadMedianDataPoints(progNdx)));
1010 
1011             // Log measurements and their estimators for all programs.
1012             for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
1013             {
1014                 const SegmentedEstimator &estimator = estimators[progNdx];
1015                 const string progNdxStr             = de::toString(progNdx);
1016                 vector<WorkloadRecord> records      = m_workloadRecords[progNdx];
1017                 std::sort(records.begin(), records.end());
1018 
1019                 {
1020                     const tcu::ScopedLogSection section(log, "Program" + progNdxStr + "Measurements",
1021                                                         "Measurements for program " + progNdxStr);
1022 
1023                     // Sample list of individual frame times.
1024 
1025                     log << TestLog::SampleList("Program" + progNdxStr + "IndividualFrameTimes",
1026                                                "Individual frame times")
1027                         << TestLog::SampleInfo
1028                         << TestLog::ValueInfo("Workload", "Workload", "", QP_SAMPLE_VALUE_TAG_PREDICTOR)
1029                         << TestLog::ValueInfo("FrameTime", "Frame time", "us", QP_SAMPLE_VALUE_TAG_RESPONSE)
1030                         << TestLog::EndSampleInfo;
1031 
1032                     for (int i = 0; i < (int)records.size(); i++)
1033                         for (int j = 0; j < (int)records[i].frameTimes.size(); j++)
1034                             log << TestLog::Sample << records[i].workloadSize << records[i].frameTimes[j]
1035                                 << TestLog::EndSample;
1036 
1037                     log << TestLog::EndSampleList;
1038 
1039                     // Sample list of median frame times.
1040 
1041                     log << TestLog::SampleList("Program" + progNdxStr + "MedianFrameTimes", "Median frame times")
1042                         << TestLog::SampleInfo
1043                         << TestLog::ValueInfo("Workload", "Workload", "", QP_SAMPLE_VALUE_TAG_PREDICTOR)
1044                         << TestLog::ValueInfo("MedianFrameTime", "Median frame time", "us",
1045                                               QP_SAMPLE_VALUE_TAG_RESPONSE)
1046                         << TestLog::EndSampleInfo;
1047 
1048                     for (int i = 0; i < (int)records.size(); i++)
1049                         log << TestLog::Sample << records[i].workloadSize << records[i].getMedianTime()
1050                             << TestLog::EndSample;
1051 
1052                     log << TestLog::EndSampleList;
1053 
1054                     log << TestLog::Float("Program" + progNdxStr + "WorkloadCostEstimate", "Workload cost estimate",
1055                                           "us / workload", QP_KEY_TAG_TIME, estimator.right.coefficient);
1056 
1057                     if (estimator.pivotX > -std::numeric_limits<float>::infinity())
1058                         log << TestLog::Message << "Note: the data points with x coordinate greater than or equal to "
1059                             << estimator.pivotX
1060                             << " seem to form a rising line, and the rest of data points seem to form a "
1061                                "near-horizontal line"
1062                             << TestLog::EndMessage << TestLog::Message << "Note: the left line is estimated to be "
1063                             << lineParamsString(estimator.left) << " and the right line "
1064                             << lineParamsString(estimator.right) << TestLog::EndMessage;
1065                     else
1066                         log << TestLog::Message
1067                             << "Note: the data seem to form a single line: " << lineParamsString(estimator.right)
1068                             << TestLog::EndMessage;
1069                 }
1070             }
1071 
1072             for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
1073             {
1074                 if (estimators[progNdx].right.coefficient <= 0.0f)
1075                 {
1076                     log << TestLog::Message << "Slope of measurements for program " << progNdx
1077                         << " isn't positive. Can't get sensible result." << TestLog::EndMessage;
1078                     MEASUREMENT_FAIL();
1079                 }
1080             }
1081 
1082             // \note For each estimator, .right.coefficient is the increase in draw time (in microseconds) when
1083             // incrementing shader workload size by 1, when D draw calls are done, with a vertex/fragment count
1084             // of R.
1085             //
1086             // The measurements of any single program can't tell us the final result (time of single operation),
1087             // so we use computeSingleOperationTime to compute it from multiple programs' measurements in a
1088             // subclass-defined manner.
1089             //
1090             // After that, microseconds per operation can be calculated as singleOperationTime / (D * R).
1091 
1092             {
1093                 vector<float> perProgramSlopes;
1094                 for (int i = 0; i < (int)m_programs.size(); i++)
1095                     perProgramSlopes.push_back(estimators[i].right.coefficient);
1096 
1097                 logSingleOperationCalculationInfo();
1098 
1099                 const float maxSlope            = *std::max_element(perProgramSlopes.begin(), perProgramSlopes.end());
1100                 const float usecsPerFramePerOp  = computeSingleOperationTime(perProgramSlopes);
1101                 const int vertexOrFragmentCount = m_caseType == CASETYPE_VERTEX ?
1102                                                       getNumVertices(m_gridSizeX, m_gridSizeY) :
1103                                                       m_viewportWidth * m_viewportHeight;
1104                 const double usecsPerDrawCallPerOp = usecsPerFramePerOp / (double)drawCallCount;
1105                 const double usecsPerSingleOp      = usecsPerDrawCallPerOp / (double)vertexOrFragmentCount;
1106                 const double megaOpsPerSecond = (double)(drawCallCount * vertexOrFragmentCount) / usecsPerFramePerOp;
1107                 const int numFreeOps          = de::max(
1108                     0, (int)deFloatFloor(intersectionX(
1109                            estimators[0].left, LineParameters(estimators[0].right.offset, usecsPerFramePerOp))));
1110 
1111                 log << TestLog::Integer("VertexOrFragmentCount",
1112                                         "R = " + string(m_caseType == CASETYPE_VERTEX ? "Vertex" : "Fragment") +
1113                                             " count",
1114                                         "", QP_KEY_TAG_NONE, vertexOrFragmentCount)
1115 
1116                     << TestLog::Integer("DrawCallsPerFrame", "D = Draw calls per frame", "", QP_KEY_TAG_NONE,
1117                                         drawCallCount)
1118 
1119                     << TestLog::Integer("VerticesOrFragmentsPerFrame",
1120                                         "R*D = " + string(m_caseType == CASETYPE_VERTEX ? "Vertices" : "Fragments") +
1121                                             " per frame",
1122                                         "", QP_KEY_TAG_NONE, vertexOrFragmentCount * drawCallCount)
1123 
1124                     << TestLog::Float("TimePerFramePerOp",
1125                                       "Estimated cost of R*D " +
1126                                           string(m_caseType == CASETYPE_VERTEX ? "vertices" : "fragments") +
1127                                           " (i.e. one frame) with one shader operation",
1128                                       "us", QP_KEY_TAG_TIME, (float)usecsPerFramePerOp)
1129 
1130                     << TestLog::Float("TimePerDrawcallPerOp",
1131                                       "Estimated cost of one draw call with one shader operation", "us",
1132                                       QP_KEY_TAG_TIME, (float)usecsPerDrawCallPerOp)
1133 
1134                     << TestLog::Float("TimePerSingleOp", "Estimated cost of a single shader operation", "us",
1135                                       QP_KEY_TAG_TIME, (float)usecsPerSingleOp);
1136 
1137                 // \note Sometimes, when the operation is free or very cheap, it can happen that the shader with the operation runs,
1138                 //         for some reason, a bit faster than the shader without the operation, and thus we get a negative result. The
1139                 //         following threshold values for accepting a negative or almost-zero result are rather quick and dirty.
1140                 if (usecsPerFramePerOp <= -0.1f * maxSlope)
1141                 {
1142                     log << TestLog::Message << "Got strongly negative result." << TestLog::EndMessage;
1143                     MEASUREMENT_FAIL();
1144                 }
1145                 else if (usecsPerFramePerOp <= 0.001 * maxSlope)
1146                 {
1147                     log << TestLog::Message << "Cost of operation seems to be approximately zero."
1148                         << TestLog::EndMessage;
1149                     m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
1150                 }
1151                 else
1152                 {
1153                     log << TestLog::Float("OpsPerSecond", "Operations per second", "Million/s", QP_KEY_TAG_PERFORMANCE,
1154                                           (float)megaOpsPerSecond)
1155 
1156                         << TestLog::Integer("NumFreeOps", "Estimated number of \"free\" operations", "",
1157                                             QP_KEY_TAG_PERFORMANCE, numFreeOps);
1158 
1159                     m_testCtx.setTestResult(QP_TEST_RESULT_PASS, de::floatToString((float)megaOpsPerSecond, 2).c_str());
1160                 }
1161 
1162                 m_state = STATE_FINISHED;
1163             }
1164         }
1165 
1166         return STOP;
1167     }
1168 
1169     return CONTINUE;
1170 }
1171 
1172 // Binary operator case.
1173 class BinaryOpCase : public OperatorPerformanceCase
1174 {
1175 public:
1176     BinaryOpCase(Context &context, const char *name, const char *description, const char *op, glu::DataType type,
1177                  glu::Precision precision, bool useSwizzle, bool isVertex,
1178                  const InitialCalibrationStorage &initialCalibration);
1179 
1180 protected:
1181     vector<ProgramContext> generateProgramData(void) const;
1182     void setGeneralUniforms(uint32_t program) const;
1183     void setWorkloadSizeUniform(uint32_t program, int numOperations) const;
1184     float computeSingleOperationTime(const vector<float> &perProgramOperationCosts) const;
1185     void logSingleOperationCalculationInfo(void) const;
1186 
1187 private:
1188     enum ProgramID
1189     {
1190         // \note 0-based sequential numbering is relevant, because these are also used as vector indices.
1191         // \note The first program should be the heaviest, because OperatorPerformanceCase uses it to reduce grid/viewport size when going too slow.
1192         PROGRAM_WITH_BIGGER_LOOP = 0,
1193         PROGRAM_WITH_SMALLER_LOOP,
1194 
1195         PROGRAM_LAST
1196     };
1197 
1198     ProgramContext generateSingleProgramData(ProgramID) const;
1199 
1200     const string m_op;
1201     const glu::DataType m_type;
1202     const glu::Precision m_precision;
1203     const bool m_useSwizzle;
1204 };
1205 
BinaryOpCase(Context & context,const char * name,const char * description,const char * op,glu::DataType type,glu::Precision precision,bool useSwizzle,bool isVertex,const InitialCalibrationStorage & initialCalibration)1206 BinaryOpCase::BinaryOpCase(Context &context, const char *name, const char *description, const char *op,
1207                            glu::DataType type, glu::Precision precision, bool useSwizzle, bool isVertex,
1208                            const InitialCalibrationStorage &initialCalibration)
1209     : OperatorPerformanceCase(context.getTestContext(), context.getRenderContext(), name, description,
1210                               isVertex ? CASETYPE_VERTEX : CASETYPE_FRAGMENT, NUM_WORKLOADS, initialCalibration)
1211     , m_op(op)
1212     , m_type(type)
1213     , m_precision(precision)
1214     , m_useSwizzle(useSwizzle)
1215 {
1216 }
1217 
generateSingleProgramData(ProgramID programID) const1218 BinaryOpCase::ProgramContext BinaryOpCase::generateSingleProgramData(ProgramID programID) const
1219 {
1220     DE_ASSERT(glu::isDataTypeFloatOrVec(m_type) || glu::isDataTypeIntOrIVec(m_type));
1221 
1222     const bool isVertexCase     = m_caseType == CASETYPE_VERTEX;
1223     const char *const precision = glu::getPrecisionName(m_precision);
1224     const char *const inputPrecision =
1225         glu::isDataTypeIntOrIVec(m_type) && m_precision == glu::PRECISION_LOWP ? "mediump" : precision;
1226     const char *const typeName = getDataTypeName(m_type);
1227 
1228     std::ostringstream vtx;
1229     std::ostringstream frag;
1230     std::ostringstream &op = isVertexCase ? vtx : frag;
1231 
1232     // Attributes.
1233     vtx << "attribute highp vec4 a_position;\n";
1234     for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS + 1; i++)
1235         vtx << "attribute " << inputPrecision << " vec4 a_in" << i << ";\n";
1236 
1237     if (isVertexCase)
1238     {
1239         vtx << "varying mediump vec4 v_color;\n";
1240         frag << "varying mediump vec4 v_color;\n";
1241     }
1242     else
1243     {
1244         for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS + 1; i++)
1245         {
1246             vtx << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
1247             frag << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
1248         }
1249     }
1250 
1251     op << "uniform mediump int u_numLoopIterations;\n";
1252     if (isVertexCase)
1253         op << "uniform mediump float u_zero;\n";
1254 
1255     vtx << "\n";
1256     vtx << "void main()\n";
1257     vtx << "{\n";
1258 
1259     if (!isVertexCase)
1260         vtx << "\tgl_Position = a_position;\n";
1261 
1262     frag << "\n";
1263     frag << "void main()\n";
1264     frag << "{\n";
1265 
1266     // Expression inputs.
1267     const char *const prefix = isVertexCase ? "a_" : "v_";
1268     for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS + 1; i++)
1269     {
1270         const int inSize = getDataTypeScalarSize(m_type);
1271         const bool isInt = de::inRange<int>(m_type, TYPE_INT, TYPE_INT_VEC4);
1272         const bool cast  = isInt || (!m_useSwizzle && m_type != TYPE_FLOAT_VEC4);
1273 
1274         op << "\t" << precision << " " << typeName << " in" << i << " = ";
1275 
1276         if (cast)
1277             op << typeName << "(";
1278 
1279         op << prefix << "in" << i;
1280 
1281         if (m_useSwizzle)
1282             op << "." << s_swizzles[i % DE_LENGTH_OF_ARRAY(s_swizzles)][inSize - 1];
1283 
1284         if (cast)
1285             op << ")";
1286 
1287         op << ";\n";
1288     }
1289 
1290     // Operation accumulation variables.
1291     for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1292     {
1293         op << "\t" << precision << " " << typeName << " acc" << i << "a"
1294            << " = in" << i + 0 << ";\n";
1295         op << "\t" << precision << " " << typeName << " acc" << i << "b"
1296            << " = in" << i + 1 << ";\n";
1297     }
1298 
1299     // Loop, with expressions in it.
1300     op << "\tfor (int i = 0; i < u_numLoopIterations; i++)\n";
1301     op << "\t{\n";
1302     {
1303         const int unrollAmount = programID == PROGRAM_WITH_SMALLER_LOOP ?
1304                                      BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT :
1305                                      BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
1306         for (int unrollNdx = 0; unrollNdx < unrollAmount; unrollNdx++)
1307         {
1308             for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1309             {
1310                 if (i > 0 || unrollNdx > 0)
1311                     op << "\n";
1312                 op << "\t\tacc" << i << "a = acc" << i << "b " << m_op << " acc" << i << "a"
1313                    << ";\n";
1314                 op << "\t\tacc" << i << "b = acc" << i << "a " << m_op << " acc" << i << "b"
1315                    << ";\n";
1316             }
1317         }
1318     }
1319     op << "\t}\n";
1320     op << "\n";
1321 
1322     // Result variable (sum of accumulation variables).
1323     op << "\t" << precision << " " << typeName << " res =";
1324     for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1325         op << (i > 0 ? " " + m_op : "") << " acc" << i << "b";
1326     op << ";\n";
1327 
1328     // Convert to color.
1329     op << "\tmediump vec4 color = ";
1330     if (m_type == TYPE_FLOAT_VEC4)
1331         op << "res";
1332     else
1333     {
1334         int size = getDataTypeScalarSize(m_type);
1335         op << "vec4(res";
1336 
1337         for (int i = size; i < 4; i++)
1338             op << ", " << (i == 3 ? "1.0" : "0.0");
1339 
1340         op << ")";
1341     }
1342     op << ";\n";
1343     op << "\t" << (isVertexCase ? "v_color" : "gl_FragColor") << " = color;\n";
1344 
1345     if (isVertexCase)
1346     {
1347         vtx << "    gl_Position = a_position + u_zero*color;\n";
1348         frag << "    gl_FragColor = v_color;\n";
1349     }
1350     else
1351     {
1352         for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS + 1; i++)
1353             vtx << "    v_in" << i << " = a_in" << i << ";\n";
1354     }
1355 
1356     vtx << "}\n";
1357     frag << "}\n";
1358 
1359     {
1360         vector<AttribSpec> attributes;
1361         for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS + 1; i++)
1362             attributes.push_back(
1363                 AttribSpec(("a_in" + de::toString(i)).c_str(),
1364                            Vec4(2.0f, 2.0f, 2.0f, 1.0f).swizzle((i + 0) % 4, (i + 1) % 4, (i + 2) % 4, (i + 3) % 4),
1365                            Vec4(1.0f, 2.0f, 1.0f, 2.0f).swizzle((i + 0) % 4, (i + 1) % 4, (i + 2) % 4, (i + 3) % 4),
1366                            Vec4(2.0f, 1.0f, 2.0f, 2.0f).swizzle((i + 0) % 4, (i + 1) % 4, (i + 2) % 4, (i + 3) % 4),
1367                            Vec4(1.0f, 1.0f, 2.0f, 1.0f).swizzle((i + 0) % 4, (i + 1) % 4, (i + 2) % 4, (i + 3) % 4)));
1368 
1369         {
1370             string description = "This is the program with the ";
1371 
1372             description += programID == PROGRAM_WITH_SMALLER_LOOP ? "smaller" :
1373                            programID == PROGRAM_WITH_BIGGER_LOOP  ? "bigger" :
1374                                                                     DE_NULL;
1375 
1376             description += " loop.\n"
1377                            "Note: workload size for this program means the number of loop iterations.";
1378 
1379             return ProgramContext(vtx.str(), frag.str(), attributes, description);
1380         }
1381     }
1382 }
1383 
generateProgramData(void) const1384 vector<BinaryOpCase::ProgramContext> BinaryOpCase::generateProgramData(void) const
1385 {
1386     vector<ProgramContext> progData;
1387     for (int i = 0; i < PROGRAM_LAST; i++)
1388         progData.push_back(generateSingleProgramData((ProgramID)i));
1389     return progData;
1390 }
1391 
setGeneralUniforms(uint32_t program) const1392 void BinaryOpCase::setGeneralUniforms(uint32_t program) const
1393 {
1394     const glw::Functions &gl = m_renderCtx.getFunctions();
1395     gl.uniform1f(gl.getUniformLocation(program, "u_zero"), 0.0f);
1396 }
1397 
setWorkloadSizeUniform(uint32_t program,int numLoopIterations) const1398 void BinaryOpCase::setWorkloadSizeUniform(uint32_t program, int numLoopIterations) const
1399 {
1400     const glw::Functions &gl = m_renderCtx.getFunctions();
1401     gl.uniform1i(gl.getUniformLocation(program, "u_numLoopIterations"), numLoopIterations);
1402 }
1403 
computeSingleOperationTime(const vector<float> & perProgramOperationCosts) const1404 float BinaryOpCase::computeSingleOperationTime(const vector<float> &perProgramOperationCosts) const
1405 {
1406     DE_ASSERT(perProgramOperationCosts.size() == PROGRAM_LAST);
1407 
1408     const int baseNumOpsInsideLoop           = 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS;
1409     const int numOpsInsideLoopInSmallProgram = baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT;
1410     const int numOpsInsideLoopInBigProgram   = baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
1411     DE_STATIC_ASSERT(numOpsInsideLoopInBigProgram > numOpsInsideLoopInSmallProgram);
1412     const int opDiff = numOpsInsideLoopInBigProgram - numOpsInsideLoopInSmallProgram;
1413     const float programOperationCostDiff =
1414         perProgramOperationCosts[PROGRAM_WITH_BIGGER_LOOP] - perProgramOperationCosts[PROGRAM_WITH_SMALLER_LOOP];
1415 
1416     return programOperationCostDiff / (float)opDiff;
1417 }
1418 
logSingleOperationCalculationInfo(void) const1419 void BinaryOpCase::logSingleOperationCalculationInfo(void) const
1420 {
1421     const int baseNumOpsInsideLoop           = 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS;
1422     const int numOpsInsideLoopInSmallProgram = baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT;
1423     const int numOpsInsideLoopInBigProgram   = baseNumOpsInsideLoop * BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
1424     const int opDiff                         = numOpsInsideLoopInBigProgram - numOpsInsideLoopInSmallProgram;
1425     const char *const opName                 = m_op == "+" ? "addition" :
1426                                                m_op == "-" ? "subtraction" :
1427                                                m_op == "*" ? "multiplication" :
1428                                                m_op == "/" ? "division" :
1429                                                              DE_NULL;
1430     DE_ASSERT(opName != DE_NULL);
1431 
1432     m_testCtx.getLog()
1433         << TestLog::Message << "Note: the bigger program contains " << opDiff << " more " << opName
1434         << " operations in one loop iteration than the small program; "
1435         << "cost of one operation is calculated as (cost_of_bigger_workload - cost_of_smaller_workload) / " << opDiff
1436         << TestLog::EndMessage;
1437 }
1438 
1439 // Built-in function case.
1440 class FunctionCase : public OperatorPerformanceCase
1441 {
1442 public:
1443     enum
1444     {
1445         MAX_PARAMS = 3
1446     };
1447 
1448     FunctionCase(
1449         Context &context, const char *name, const char *description, const char *func, glu::DataType returnType,
1450         const glu::DataType paramTypes[MAX_PARAMS], const Vec4 &attribute,
1451         int modifyParamNdx, //!< Add a compile-time constant (2.0) to the parameter at this index. This is ignored if negative.
1452         bool useNearlyConstantINputs, //!< Function inputs shouldn't be much bigger than 'attribute'.
1453         glu::Precision precision, bool isVertex, const InitialCalibrationStorage &initialCalibration);
1454 
1455 protected:
1456     vector<ProgramContext> generateProgramData(void) const;
1457     void setGeneralUniforms(uint32_t program) const;
1458     void setWorkloadSizeUniform(uint32_t program, int numOperations) const;
1459     float computeSingleOperationTime(const vector<float> &perProgramOperationCosts) const;
1460     void logSingleOperationCalculationInfo(void) const;
1461 
1462 private:
1463     enum ProgramID
1464     {
1465         // \note 0-based sequential numbering is relevant, because these are also used as vector indices.
1466         // \note The first program should be the heaviest, because OperatorPerformanceCase uses it to reduce grid/viewport size when going too slow.
1467         PROGRAM_WITH_FUNCTION_CALLS = 0,
1468         PROGRAM_WITHOUT_FUNCTION_CALLS,
1469 
1470         PROGRAM_LAST
1471     };
1472 
1473     //! Forms a "sum" expression from aExpr and bExpr; for booleans, this is "equal(a,b)", otherwise actual sum.
1474     static string sumExpr(const string &aExpr, const string &bExpr, glu::DataType type);
1475     //! Forms an expression used to increment an input value in the shader. If type is boolean, this is just
1476     //! baseExpr; otherwise, baseExpr is modified by multiplication or division by a loop index,
1477     //! to prevent simple compiler optimizations. See m_useNearlyConstantInputs for more explanation.
1478     static string incrementExpr(const string &baseExpr, glu::DataType type, bool divide);
1479 
1480     ProgramContext generateSingleProgramData(ProgramID) const;
1481 
1482     const string m_func;
1483     const glu::DataType m_returnType;
1484     glu::DataType m_paramTypes[MAX_PARAMS];
1485     // \note m_modifyParamNdx, if not negative, specifies the index of the parameter to which a
1486     //         compile-time constant (2.0) is added. This is a quick and dirty way to deal with
1487     //         functions like clamp or smoothstep that require that a certain parameter is
1488     //         greater than a certain other parameter.
1489     const int m_modifyParamNdx;
1490     // \note m_useNearlyConstantInputs determines whether the inputs given to the function
1491     //         should increase (w.r.t m_attribute) only by very small amounts. This is relevant
1492     //         for functions like asin, which requires its inputs to be in a specific range.
1493     //         In practice, this affects whether expressions used to increment the input
1494     //         variables use division instead of multiplication; normally, multiplication is used,
1495     //         but it's hard to keep the increments very small that way, and division shouldn't
1496     //         be the default, since for many functions (probably not asin, luckily), division
1497     //         is too heavy and dominates time-wise.
1498     const bool m_useNearlyConstantInputs;
1499     const Vec4 m_attribute;
1500     const glu::Precision m_precision;
1501 };
1502 
FunctionCase(Context & context,const char * name,const char * description,const char * func,glu::DataType returnType,const glu::DataType paramTypes[MAX_PARAMS],const Vec4 & attribute,int modifyParamNdx,bool useNearlyConstantInputs,glu::Precision precision,bool isVertex,const InitialCalibrationStorage & initialCalibration)1503 FunctionCase::FunctionCase(Context &context, const char *name, const char *description, const char *func,
1504                            glu::DataType returnType, const glu::DataType paramTypes[MAX_PARAMS], const Vec4 &attribute,
1505                            int modifyParamNdx, bool useNearlyConstantInputs, glu::Precision precision, bool isVertex,
1506                            const InitialCalibrationStorage &initialCalibration)
1507     : OperatorPerformanceCase(context.getTestContext(), context.getRenderContext(), name, description,
1508                               isVertex ? CASETYPE_VERTEX : CASETYPE_FRAGMENT, NUM_WORKLOADS, initialCalibration)
1509     , m_func(func)
1510     , m_returnType(returnType)
1511     , m_modifyParamNdx(modifyParamNdx)
1512     , m_useNearlyConstantInputs(useNearlyConstantInputs)
1513     , m_attribute(attribute)
1514     , m_precision(precision)
1515 {
1516     for (int i = 0; i < MAX_PARAMS; i++)
1517         m_paramTypes[i] = paramTypes[i];
1518 }
1519 
//! Builds a GLSL expression that combines aExpr and bExpr.
//!  - non-boolean types: "(a + b)"
//!  - scalar bool:       "(a == b)"
//!  - boolean vectors:   "equal(a, b)"
string FunctionCase::sumExpr(const string &aExpr, const string &bExpr, glu::DataType type)
{
    if (!glu::isDataTypeBoolOrBVec(type))
        return "(" + aExpr + " + " + bExpr + ")";

    if (type == glu::TYPE_BOOL)
        return "(" + aExpr + " == " + bExpr + ")";

    return "equal(" + aExpr + ", " + bExpr + ")";
}
1532 
//! Builds the GLSL expression used as a per-iteration increment.
//! Boolean types get baseExpr unchanged; other types get baseExpr multiplied
//! (or divided, if 'divide' is set) by the loop counter plus one, converted
//! to float for non-integer types.
string FunctionCase::incrementExpr(const string &baseExpr, glu::DataType type, bool divide)
{
    if (glu::isDataTypeBoolOrBVec(type))
        return baseExpr;

    const char op                = divide ? '/' : '*';
    const char *const loopFactor = glu::isDataTypeIntOrIVec(type) ? "(i+1))" : "float(i+1))";

    return "(" + baseExpr + op + loopFactor;
}
1541 
generateSingleProgramData(ProgramID programID) const1542 FunctionCase::ProgramContext FunctionCase::generateSingleProgramData(ProgramID programID) const
1543 {
1544     const bool isVertexCase           = m_caseType == CASETYPE_VERTEX;
1545     const char *const precision       = glu::getPrecisionName(m_precision);
1546     const char *const returnTypeName  = getDataTypeName(m_returnType);
1547     const string returnPrecisionMaybe = glu::isDataTypeBoolOrBVec(m_returnType) ? "" : string() + precision + " ";
1548     const char *inputPrecision        = DE_NULL;
1549     const bool isMatrixReturn         = isDataTypeMatrix(m_returnType);
1550     int numParams                     = 0;
1551     const char *paramTypeNames[MAX_PARAMS];
1552     string paramPrecisionsMaybe[MAX_PARAMS];
1553 
1554     for (int i = 0; i < MAX_PARAMS; i++)
1555     {
1556         paramTypeNames[i]       = getDataTypeName(m_paramTypes[i]);
1557         paramPrecisionsMaybe[i] = glu::isDataTypeBoolOrBVec(m_paramTypes[i]) ? "" : string() + precision + " ";
1558 
1559         if (inputPrecision == DE_NULL && isDataTypeIntOrIVec(m_paramTypes[i]) && m_precision == glu::PRECISION_LOWP)
1560             inputPrecision = "mediump";
1561 
1562         if (m_paramTypes[i] != TYPE_INVALID)
1563             numParams = i + 1;
1564     }
1565 
1566     DE_ASSERT(numParams > 0);
1567 
1568     if (inputPrecision == DE_NULL)
1569         inputPrecision = precision;
1570 
1571     int numAttributes = FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS + numParams - 1;
1572     std::ostringstream vtx;
1573     std::ostringstream frag;
1574     std::ostringstream &op = isVertexCase ? vtx : frag;
1575 
1576     // Attributes.
1577     vtx << "attribute highp vec4 a_position;\n";
1578     for (int i = 0; i < numAttributes; i++)
1579         vtx << "attribute " << inputPrecision << " vec4 a_in" << i << ";\n";
1580 
1581     if (isVertexCase)
1582     {
1583         vtx << "varying mediump vec4 v_color;\n";
1584         frag << "varying mediump vec4 v_color;\n";
1585     }
1586     else
1587     {
1588         for (int i = 0; i < numAttributes; i++)
1589         {
1590             vtx << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
1591             frag << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
1592         }
1593     }
1594 
1595     op << "uniform mediump int u_numLoopIterations;\n";
1596     if (isVertexCase)
1597         op << "uniform mediump float u_zero;\n";
1598 
1599     for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
1600         op << "uniform " << paramPrecisionsMaybe[paramNdx] << paramTypeNames[paramNdx] << " u_inc"
1601            << (char)('A' + paramNdx) << ";\n";
1602 
1603     vtx << "\n";
1604     vtx << "void main()\n";
1605     vtx << "{\n";
1606 
1607     if (!isVertexCase)
1608         vtx << "\tgl_Position = a_position;\n";
1609 
1610     frag << "\n";
1611     frag << "void main()\n";
1612     frag << "{\n";
1613 
1614     // Function call input and return value accumulation variables.
1615     {
1616         const char *const inPrefix = isVertexCase ? "a_" : "v_";
1617 
1618         for (int calcNdx = 0; calcNdx < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; calcNdx++)
1619         {
1620             for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
1621             {
1622                 const glu::DataType paramType = m_paramTypes[paramNdx];
1623                 const bool mustCast           = paramType != glu::TYPE_FLOAT_VEC4;
1624 
1625                 op << "\t" << paramPrecisionsMaybe[paramNdx] << paramTypeNames[paramNdx] << " in" << calcNdx
1626                    << (char)('a' + paramNdx) << " = ";
1627 
1628                 if (mustCast)
1629                     op << paramTypeNames[paramNdx] << "(";
1630 
1631                 if (glu::isDataTypeMatrix(paramType))
1632                 {
1633                     static const char *const swizzles[3] = {"x", "xy", "xyz"};
1634                     const int numRows                    = glu::getDataTypeMatrixNumRows(paramType);
1635                     const int numCols                    = glu::getDataTypeMatrixNumColumns(paramType);
1636                     const string swizzle                 = numRows < 4 ? string() + "." + swizzles[numRows - 1] : "";
1637 
1638                     for (int i = 0; i < numCols; i++)
1639                         op << (i > 0 ? ", " : "") << inPrefix << "in" << calcNdx + paramNdx << swizzle;
1640                 }
1641                 else
1642                 {
1643                     op << inPrefix << "in" << calcNdx + paramNdx;
1644 
1645                     if (paramNdx == m_modifyParamNdx)
1646                     {
1647                         DE_ASSERT(glu::isDataTypeFloatOrVec(paramType));
1648                         op << " + 2.0";
1649                     }
1650                 }
1651 
1652                 if (mustCast)
1653                     op << ")";
1654 
1655                 op << ";\n";
1656             }
1657 
1658             op << "\t" << returnPrecisionMaybe << returnTypeName << " res" << calcNdx << " = " << returnTypeName
1659                << "(0);\n";
1660         }
1661     }
1662 
1663     // Loop with expressions in it.
1664     op << "\tfor (int i = 0; i < u_numLoopIterations; i++)\n";
1665     op << "\t{\n";
1666     for (int calcNdx = 0; calcNdx < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; calcNdx++)
1667     {
1668         if (calcNdx > 0)
1669             op << "\n";
1670 
1671         op << "\t\t{\n";
1672 
1673         for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1674         {
1675             const string inputName = "in" + de::toString(calcNdx) + (char)('a' + inputNdx);
1676             const string incName   = string() + "u_inc" + (char)('A' + inputNdx);
1677             const string incExpr   = incrementExpr(incName, m_paramTypes[inputNdx], m_useNearlyConstantInputs);
1678 
1679             op << "\t\t\t" << inputName << " = " << sumExpr(inputName, incExpr, m_paramTypes[inputNdx]) << ";\n";
1680         }
1681 
1682         op << "\t\t\t" << returnPrecisionMaybe << returnTypeName << " eval" << calcNdx << " = ";
1683 
1684         if (programID == PROGRAM_WITH_FUNCTION_CALLS)
1685         {
1686             op << m_func << "(";
1687 
1688             for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
1689             {
1690                 if (paramNdx > 0)
1691                     op << ", ";
1692 
1693                 op << "in" << calcNdx << (char)('a' + paramNdx);
1694             }
1695 
1696             op << ")";
1697         }
1698         else
1699         {
1700             DE_ASSERT(programID == PROGRAM_WITHOUT_FUNCTION_CALLS);
1701             op << returnTypeName << "(1)";
1702         }
1703 
1704         op << ";\n";
1705 
1706         {
1707             const string resName  = "res" + de::toString(calcNdx);
1708             const string evalName = "eval" + de::toString(calcNdx);
1709             const string incExpr  = incrementExpr(evalName, m_returnType, m_useNearlyConstantInputs);
1710 
1711             op << "\t\t\tres" << calcNdx << " = " << sumExpr(resName, incExpr, m_returnType) << ";\n";
1712         }
1713 
1714         op << "\t\t}\n";
1715     }
1716     op << "\t}\n";
1717     op << "\n";
1718 
1719     // Result variables.
1720     for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1721     {
1722         op << "\t" << paramPrecisionsMaybe[inputNdx] << paramTypeNames[inputNdx] << " sumIn" << (char)('A' + inputNdx)
1723            << " = ";
1724         {
1725             string expr = string() + "in0" + (char)('a' + inputNdx);
1726             for (int i = 1; i < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1727                 expr =
1728                     sumExpr(expr, string() + "in" + de::toString(i) + (char)('a' + inputNdx), m_paramTypes[inputNdx]);
1729             op << expr;
1730         }
1731         op << ";\n";
1732     }
1733 
1734     op << "\t" << returnPrecisionMaybe << returnTypeName << " sumRes = ";
1735     {
1736         string expr = "res0";
1737         for (int i = 1; i < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
1738             expr = sumExpr(expr, "res" + de::toString(i), m_returnType);
1739         op << expr;
1740     }
1741     op << ";\n";
1742 
1743     {
1744         glu::DataType finalResultDataType = glu::TYPE_LAST;
1745 
1746         if (glu::isDataTypeMatrix(m_returnType))
1747         {
1748             finalResultDataType = m_returnType;
1749 
1750             op << "\t" << precision << " " << returnTypeName << " finalRes = ";
1751 
1752             for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1753             {
1754                 DE_ASSERT(m_paramTypes[inputNdx] == m_returnType);
1755                 op << "sumIn" << (char)('A' + inputNdx) << " + ";
1756             }
1757             op << "sumRes;\n";
1758         }
1759         else
1760         {
1761             int numFinalResComponents = glu::getDataTypeScalarSize(m_returnType);
1762             for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1763                 numFinalResComponents =
1764                     de::max(numFinalResComponents, glu::getDataTypeScalarSize(m_paramTypes[inputNdx]));
1765 
1766             finalResultDataType = getDataTypeFloatOrVec(numFinalResComponents);
1767 
1768             {
1769                 const string finalResType = glu::getDataTypeName(finalResultDataType);
1770                 op << "\t" << precision << " " << finalResType << " finalRes = ";
1771                 for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
1772                     op << finalResType << "(sumIn" << (char)('A' + inputNdx) << ") + ";
1773                 op << finalResType << "(sumRes);\n";
1774             }
1775         }
1776 
1777         // Convert to color.
1778         op << "\tmediump vec4 color = ";
1779         if (finalResultDataType == TYPE_FLOAT_VEC4)
1780             op << "finalRes";
1781         else
1782         {
1783             int size = isMatrixReturn ? getDataTypeMatrixNumRows(finalResultDataType) :
1784                                         getDataTypeScalarSize(finalResultDataType);
1785 
1786             op << "vec4(";
1787 
1788             if (isMatrixReturn)
1789             {
1790                 for (int i = 0; i < getDataTypeMatrixNumColumns(finalResultDataType); i++)
1791                 {
1792                     if (i > 0)
1793                         op << " + ";
1794                     op << "finalRes[" << i << "]";
1795                 }
1796             }
1797             else
1798                 op << "finalRes";
1799 
1800             for (int i = size; i < 4; i++)
1801                 op << ", " << (i == 3 ? "1.0" : "0.0");
1802 
1803             op << ")";
1804         }
1805         op << ";\n";
1806         op << "\t" << (isVertexCase ? "v_color" : "gl_FragColor") << " = color;\n";
1807 
1808         if (isVertexCase)
1809         {
1810             vtx << "    gl_Position = a_position + u_zero*color;\n";
1811             frag << "    gl_FragColor = v_color;\n";
1812         }
1813         else
1814         {
1815             for (int i = 0; i < numAttributes; i++)
1816                 vtx << "    v_in" << i << " = a_in" << i << ";\n";
1817         }
1818 
1819         vtx << "}\n";
1820         frag << "}\n";
1821     }
1822 
1823     {
1824         vector<AttribSpec> attributes;
1825         for (int i = 0; i < numAttributes; i++)
1826             attributes.push_back(AttribSpec(("a_in" + de::toString(i)).c_str(),
1827                                             m_attribute.swizzle((i + 0) % 4, (i + 1) % 4, (i + 2) % 4, (i + 3) % 4),
1828                                             m_attribute.swizzle((i + 1) % 4, (i + 2) % 4, (i + 3) % 4, (i + 0) % 4),
1829                                             m_attribute.swizzle((i + 2) % 4, (i + 3) % 4, (i + 0) % 4, (i + 1) % 4),
1830                                             m_attribute.swizzle((i + 3) % 4, (i + 0) % 4, (i + 1) % 4, (i + 2) % 4)));
1831 
1832         {
1833             string description = "This is the program ";
1834 
1835             description += programID == PROGRAM_WITHOUT_FUNCTION_CALLS ? "without" :
1836                            programID == PROGRAM_WITH_FUNCTION_CALLS    ? "with" :
1837                                                                          DE_NULL;
1838 
1839             description += " '" + m_func +
1840                            "' function calls.\n"
1841                            "Note: workload size for this program means the number of loop iterations.";
1842 
1843             return ProgramContext(vtx.str(), frag.str(), attributes, description);
1844         }
1845     }
1846 }
1847 
generateProgramData(void) const1848 vector<FunctionCase::ProgramContext> FunctionCase::generateProgramData(void) const
1849 {
1850     vector<ProgramContext> progData;
1851     for (int i = 0; i < PROGRAM_LAST; i++)
1852         progData.push_back(generateSingleProgramData((ProgramID)i));
1853     return progData;
1854 }
1855 
setGeneralUniforms(uint32_t program) const1856 void FunctionCase::setGeneralUniforms(uint32_t program) const
1857 {
1858     const glw::Functions &gl = m_renderCtx.getFunctions();
1859 
1860     gl.uniform1f(gl.getUniformLocation(program, "u_zero"), 0.0f);
1861 
1862     for (int paramNdx = 0; paramNdx < MAX_PARAMS; paramNdx++)
1863     {
1864         if (m_paramTypes[paramNdx] != glu::TYPE_INVALID)
1865         {
1866             const glu::DataType paramType = m_paramTypes[paramNdx];
1867             const int scalarSize          = glu::getDataTypeScalarSize(paramType);
1868             const int location = gl.getUniformLocation(program, (string() + "u_inc" + (char)('A' + paramNdx)).c_str());
1869 
1870             if (glu::isDataTypeFloatOrVec(paramType))
1871             {
1872                 float values[4];
1873                 for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1874                     values[i] = (float)paramNdx * 0.01f + (float)i * 0.001f; // Arbitrary small values.
1875                 uniformNfv(gl, scalarSize, location, 1, &values[0]);
1876             }
1877             else if (glu::isDataTypeIntOrIVec(paramType))
1878             {
1879                 int values[4];
1880                 for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1881                     values[i] = paramNdx * 100 + i; // Arbitrary values.
1882                 uniformNiv(gl, scalarSize, location, 1, &values[0]);
1883             }
1884             else if (glu::isDataTypeBoolOrBVec(paramType))
1885             {
1886                 int values[4];
1887                 for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1888                     values[i] = (paramNdx >> i) & 1; // Arbitrary values.
1889                 uniformNiv(gl, scalarSize, location, 1, &values[0]);
1890             }
1891             else if (glu::isDataTypeMatrix(paramType))
1892             {
1893                 const int size = glu::getDataTypeMatrixNumRows(paramType);
1894                 DE_ASSERT(size == glu::getDataTypeMatrixNumColumns(paramType));
1895                 float values[4 * 4];
1896                 for (int i = 0; i < DE_LENGTH_OF_ARRAY(values); i++)
1897                     values[i] = (float)paramNdx * 0.01f + (float)i * 0.001f; // Arbitrary values.
1898                 uniformMatrixNfv(gl, size, location, 1, &values[0]);
1899             }
1900             else
1901                 DE_ASSERT(false);
1902         }
1903     }
1904 }
1905 
setWorkloadSizeUniform(uint32_t program,int numLoopIterations) const1906 void FunctionCase::setWorkloadSizeUniform(uint32_t program, int numLoopIterations) const
1907 {
1908     const glw::Functions &gl = m_renderCtx.getFunctions();
1909     const int loc            = gl.getUniformLocation(program, "u_numLoopIterations");
1910 
1911     gl.uniform1i(loc, numLoopIterations);
1912 }
1913 
computeSingleOperationTime(const vector<float> & perProgramOperationCosts) const1914 float FunctionCase::computeSingleOperationTime(const vector<float> &perProgramOperationCosts) const
1915 {
1916     DE_ASSERT(perProgramOperationCosts.size() == PROGRAM_LAST);
1917     const int numFunctionCalls           = FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS;
1918     const float programOperationCostDiff = perProgramOperationCosts[PROGRAM_WITH_FUNCTION_CALLS] -
1919                                            perProgramOperationCosts[PROGRAM_WITHOUT_FUNCTION_CALLS];
1920 
1921     return programOperationCostDiff / (float)numFunctionCalls;
1922 }
1923 
logSingleOperationCalculationInfo(void) const1924 void FunctionCase::logSingleOperationCalculationInfo(void) const
1925 {
1926     const int numFunctionCalls = FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS;
1927 
1928     m_testCtx.getLog() << TestLog::Message << "Note: program " << (int)PROGRAM_WITH_FUNCTION_CALLS << " contains "
1929                        << numFunctionCalls << " calls to '" << m_func << "' in one loop iteration; "
1930                        << "cost of one operation is calculated as "
1931                        << "(cost_of_workload_with_calls - cost_of_workload_without_calls) / " << numFunctionCalls
1932                        << TestLog::EndMessage;
1933 }
1934 
1935 } // namespace
1936 
ShaderOperatorTests(Context & context)1937 ShaderOperatorTests::ShaderOperatorTests(Context &context)
1938     : TestCaseGroup(context, "operator", "Operator Performance Tests")
1939 {
1940 }
1941 
~ShaderOperatorTests(void)1942 ShaderOperatorTests::~ShaderOperatorTests(void)
1943 {
1944 }
1945 
init(void)1946 void ShaderOperatorTests::init(void)
1947 {
1948     // Binary operator cases
1949 
1950     static const DataType binaryOpTypes[] = {
1951         TYPE_FLOAT, TYPE_FLOAT_VEC2, TYPE_FLOAT_VEC3, TYPE_FLOAT_VEC4,
1952         TYPE_INT,   TYPE_INT_VEC2,   TYPE_INT_VEC3,   TYPE_INT_VEC4,
1953     };
1954     static const Precision precisions[] = {PRECISION_LOWP, PRECISION_MEDIUMP, PRECISION_HIGHP};
1955     static const struct
1956     {
1957         const char *name;
1958         const char *op;
1959         bool swizzle;
1960     } binaryOps[] = {{"add", "+", false}, {"sub", "-", true}, {"mul", "*", false}, {"div", "/", true}};
1961 
1962     tcu::TestCaseGroup *const binaryOpsGroup =
1963         new tcu::TestCaseGroup(m_testCtx, "binary_operator", "Binary Operator Performance Tests");
1964     addChild(binaryOpsGroup);
1965 
1966     for (int opNdx = 0; opNdx < DE_LENGTH_OF_ARRAY(binaryOps); opNdx++)
1967     {
1968         tcu::TestCaseGroup *const opGroup = new tcu::TestCaseGroup(m_testCtx, binaryOps[opNdx].name, "");
1969         binaryOpsGroup->addChild(opGroup);
1970 
1971         for (int isFrag = 0; isFrag <= 1; isFrag++)
1972         {
1973             const BinaryOpCase::InitialCalibrationStorage shaderGroupCalibrationStorage(
1974                 new BinaryOpCase::InitialCalibration);
1975             const bool isVertex = isFrag == 0;
1976             tcu::TestCaseGroup *const shaderGroup =
1977                 new tcu::TestCaseGroup(m_testCtx, isVertex ? "vertex" : "fragment", "");
1978             opGroup->addChild(shaderGroup);
1979 
1980             for (int typeNdx = 0; typeNdx < DE_LENGTH_OF_ARRAY(binaryOpTypes); typeNdx++)
1981             {
1982                 for (int precNdx = 0; precNdx < DE_LENGTH_OF_ARRAY(precisions); precNdx++)
1983                 {
1984                     const DataType type       = binaryOpTypes[typeNdx];
1985                     const Precision precision = precisions[precNdx];
1986                     const char *const op      = binaryOps[opNdx].op;
1987                     const bool useSwizzle     = binaryOps[opNdx].swizzle;
1988                     std::ostringstream name;
1989 
1990                     name << getPrecisionName(precision) << "_" << getDataTypeName(type);
1991 
1992                     shaderGroup->addChild(new BinaryOpCase(m_context, name.str().c_str(), "", op, type, precision,
1993                                                            useSwizzle, isVertex, shaderGroupCalibrationStorage));
1994                 }
1995             }
1996         }
1997     }
1998 
1999     // Built-in function cases.
2000 
2001     // Non-specific (i.e. includes gentypes) parameter types for the functions.
2002     enum ValueType
2003     {
2004         VALUE_NONE          = 0,
2005         VALUE_FLOAT         = (1 << 0),  // float scalar
2006         VALUE_FLOAT_VEC     = (1 << 1),  // float vector
2007         VALUE_FLOAT_VEC34   = (1 << 2),  // float vector of size 3 or 4
2008         VALUE_FLOAT_GENTYPE = (1 << 3),  // float scalar/vector
2009         VALUE_VEC3          = (1 << 4),  // vec3 only
2010         VALUE_VEC4          = (1 << 5),  // vec4 only
2011         VALUE_MATRIX        = (1 << 6),  // matrix
2012         VALUE_BOOL          = (1 << 7),  // boolean scalar
2013         VALUE_BOOL_VEC      = (1 << 8),  // boolean vector
2014         VALUE_BOOL_GENTYPE  = (1 << 9),  // boolean scalar/vector
2015         VALUE_INT           = (1 << 10), // int scalar
2016         VALUE_INT_VEC       = (1 << 11), // int vector
2017         VALUE_INT_GENTYPE   = (1 << 12), // int scalar/vector
2018 
2019         // Shorthands.
2020         N   = VALUE_NONE,
2021         F   = VALUE_FLOAT,
2022         FV  = VALUE_FLOAT_VEC,
2023         VL  = VALUE_FLOAT_VEC34, // L for "large"
2024         GT  = VALUE_FLOAT_GENTYPE,
2025         V3  = VALUE_VEC3,
2026         V4  = VALUE_VEC4,
2027         M   = VALUE_MATRIX,
2028         B   = VALUE_BOOL,
2029         BV  = VALUE_BOOL_VEC,
2030         BGT = VALUE_BOOL_GENTYPE,
2031         I   = VALUE_INT,
2032         IV  = VALUE_INT_VEC,
2033         IGT = VALUE_INT_GENTYPE,
2034 
2035         VALUE_ANY_FLOAT =
2036             VALUE_FLOAT | VALUE_FLOAT_VEC | VALUE_FLOAT_GENTYPE | VALUE_VEC3 | VALUE_VEC4 | VALUE_FLOAT_VEC34,
2037         VALUE_ANY_INT  = VALUE_INT | VALUE_INT_VEC | VALUE_INT_GENTYPE,
2038         VALUE_ANY_BOOL = VALUE_BOOL | VALUE_BOOL_VEC | VALUE_BOOL_GENTYPE,
2039 
2040         VALUE_ANY_GENTYPE = VALUE_FLOAT_VEC | VALUE_FLOAT_GENTYPE | VALUE_FLOAT_VEC34 | VALUE_BOOL_VEC |
2041                             VALUE_BOOL_GENTYPE | VALUE_INT_VEC | VALUE_INT_GENTYPE | VALUE_MATRIX
2042     };
2043     enum PrecisionMask
2044     {
2045         PRECMASK_NA      = 0, //!< Precision not applicable (booleans)
2046         PRECMASK_LOWP    = (1 << PRECISION_LOWP),
2047         PRECMASK_MEDIUMP = (1 << PRECISION_MEDIUMP),
2048         PRECMASK_HIGHP   = (1 << PRECISION_HIGHP),
2049 
2050         PRECMASK_MEDIUMP_HIGHP = (1 << PRECISION_MEDIUMP) | (1 << PRECISION_HIGHP),
2051         PRECMASK_ALL           = (1 << PRECISION_LOWP) | (1 << PRECISION_MEDIUMP) | (1 << PRECISION_HIGHP)
2052     };
2053 
2054     static const DataType floatTypes[]  = {TYPE_FLOAT, TYPE_FLOAT_VEC2, TYPE_FLOAT_VEC3, TYPE_FLOAT_VEC4};
2055     static const DataType intTypes[]    = {TYPE_INT, TYPE_INT_VEC2, TYPE_INT_VEC3, TYPE_INT_VEC4};
2056     static const DataType boolTypes[]   = {TYPE_BOOL, TYPE_BOOL_VEC2, TYPE_BOOL_VEC3, TYPE_BOOL_VEC4};
2057     static const DataType matrixTypes[] = {TYPE_FLOAT_MAT2, TYPE_FLOAT_MAT3, TYPE_FLOAT_MAT4};
2058 
2059     tcu::TestCaseGroup *const angleAndTrigonometryGroup = new tcu::TestCaseGroup(
2060         m_testCtx, "angle_and_trigonometry", "Built-In Angle and Trigonometry Function Performance Tests");
2061     tcu::TestCaseGroup *const exponentialGroup =
2062         new tcu::TestCaseGroup(m_testCtx, "exponential", "Built-In Exponential Function Performance Tests");
2063     tcu::TestCaseGroup *const commonFunctionsGroup =
2064         new tcu::TestCaseGroup(m_testCtx, "common_functions", "Built-In Common Function Performance Tests");
2065     tcu::TestCaseGroup *const geometricFunctionsGroup =
2066         new tcu::TestCaseGroup(m_testCtx, "geometric", "Built-In Geometric Function Performance Tests");
2067     tcu::TestCaseGroup *const matrixFunctionsGroup =
2068         new tcu::TestCaseGroup(m_testCtx, "matrix", "Built-In Matrix Function Performance Tests");
2069     tcu::TestCaseGroup *const floatCompareGroup = new tcu::TestCaseGroup(
2070         m_testCtx, "float_compare", "Built-In Floating Point Comparison Function Performance Tests");
2071     tcu::TestCaseGroup *const intCompareGroup =
2072         new tcu::TestCaseGroup(m_testCtx, "int_compare", "Built-In Integer Comparison Function Performance Tests");
2073     tcu::TestCaseGroup *const boolCompareGroup =
2074         new tcu::TestCaseGroup(m_testCtx, "bool_compare", "Built-In Boolean Comparison Function Performance Tests");
2075 
2076     addChild(angleAndTrigonometryGroup);
2077     addChild(exponentialGroup);
2078     addChild(commonFunctionsGroup);
2079     addChild(geometricFunctionsGroup);
2080     addChild(matrixFunctionsGroup);
2081     addChild(floatCompareGroup);
2082     addChild(intCompareGroup);
2083     addChild(boolCompareGroup);
2084 
2085     // Some attributes to be used as parameters for the functions.
2086     const Vec4 attrPos    = Vec4(2.3f, 1.9f, 0.8f, 0.7f);
2087     const Vec4 attrNegPos = Vec4(-1.3f, 2.5f, -3.5f, 4.3f);
2088     const Vec4 attrSmall  = Vec4(-0.9f, 0.8f, -0.4f, 0.2f);
2089 
2090     // Function name, return type and parameter type information; also, what attribute should be used in the test.
2091     // \note Different versions of the same function (i.e. with the same group name) can be defined by putting them successively in this array.
2092     // \note In order to reduce case count and thus total execution time, we don't test all input type combinations for every function.
2093     static const struct
2094     {
2095         tcu::TestCaseGroup *parentGroup;
2096         const char *groupName;
2097         const char *func;
2098         const ValueType types[FunctionCase::MAX_PARAMS + 1]; // Return type and parameter types, in that order.
2099         const Vec4 &attribute;
2100         int modifyParamNdx;
2101         bool useNearlyConstantInputs;
2102         bool booleanCase;
2103         PrecisionMask precMask;
2104     } functionCaseGroups[] = {
2105         {angleAndTrigonometryGroup, "radians", "radians", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2106         {angleAndTrigonometryGroup, "degrees", "degrees", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2107         {angleAndTrigonometryGroup, "sin", "sin", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2108         {angleAndTrigonometryGroup, "cos", "cos", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2109         {angleAndTrigonometryGroup, "tan", "tan", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2110         {angleAndTrigonometryGroup, "asin", "asin", {F, F, N, N}, attrSmall, -1, true, false, PRECMASK_ALL},
2111         {angleAndTrigonometryGroup, "acos", "acos", {F, F, N, N}, attrSmall, -1, true, false, PRECMASK_ALL},
2112         {angleAndTrigonometryGroup, "atan2", "atan", {F, F, F, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2113         {angleAndTrigonometryGroup, "atan", "atan", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2114 
2115         {exponentialGroup, "pow", "pow", {F, F, F, N}, attrPos, -1, false, false, PRECMASK_ALL},
2116         {exponentialGroup, "exp", "exp", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2117         {exponentialGroup, "log", "log", {F, F, N, N}, attrPos, -1, false, false, PRECMASK_ALL},
2118         {exponentialGroup, "exp2", "exp2", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2119         {exponentialGroup, "log2", "log2", {F, F, N, N}, attrPos, -1, false, false, PRECMASK_ALL},
2120         {exponentialGroup, "sqrt", "sqrt", {F, F, N, N}, attrPos, -1, false, false, PRECMASK_ALL},
2121         {exponentialGroup, "inversesqrt", "inversesqrt", {F, F, N, N}, attrPos, -1, false, false, PRECMASK_ALL},
2122 
2123         {commonFunctionsGroup, "abs", "abs", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_MEDIUMP_HIGHP},
2124         {commonFunctionsGroup, "abs", "abs", {V4, V4, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2125         {commonFunctionsGroup, "sign", "sign", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_MEDIUMP_HIGHP},
2126         {commonFunctionsGroup, "sign", "sign", {V4, V4, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2127         {commonFunctionsGroup, "floor", "floor", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_MEDIUMP_HIGHP},
2128         {commonFunctionsGroup, "floor", "floor", {V4, V4, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2129         {commonFunctionsGroup, "ceil", "ceil", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_MEDIUMP_HIGHP},
2130         {commonFunctionsGroup, "ceil", "ceil", {V4, V4, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2131         {commonFunctionsGroup, "fract", "fract", {F, F, N, N}, attrNegPos, -1, false, false, PRECMASK_MEDIUMP_HIGHP},
2132         {commonFunctionsGroup, "fract", "fract", {V4, V4, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2133         {commonFunctionsGroup, "mod", "mod", {GT, GT, GT, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2134         {commonFunctionsGroup, "min", "min", {F, F, F, N}, attrNegPos, -1, false, false, PRECMASK_MEDIUMP_HIGHP},
2135         {commonFunctionsGroup, "min", "min", {V4, V4, V4, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2136         {commonFunctionsGroup, "max", "max", {F, F, F, N}, attrNegPos, -1, false, false, PRECMASK_MEDIUMP_HIGHP},
2137         {commonFunctionsGroup, "max", "max", {V4, V4, V4, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2138         {commonFunctionsGroup, "clamp", "clamp", {F, F, F, F}, attrSmall, 2, false, false, PRECMASK_MEDIUMP_HIGHP},
2139         {commonFunctionsGroup, "clamp", "clamp", {V4, V4, V4, V4}, attrSmall, 2, false, false, PRECMASK_ALL},
2140         {commonFunctionsGroup, "mix", "mix", {F, F, F, F}, attrNegPos, -1, false, false, PRECMASK_MEDIUMP_HIGHP},
2141         {commonFunctionsGroup, "mix", "mix", {V4, V4, V4, V4}, attrNegPos, -1, false, false, PRECMASK_ALL},
2142         {commonFunctionsGroup, "step", "step", {F, F, F, N}, attrNegPos, -1, false, false, PRECMASK_MEDIUMP_HIGHP},
2143         {commonFunctionsGroup, "step", "step", {V4, V4, V4, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2144         {commonFunctionsGroup,
2145          "smoothstep",
2146          "smoothstep",
2147          {F, F, F, F},
2148          attrSmall,
2149          1,
2150          false,
2151          false,
2152          PRECMASK_MEDIUMP_HIGHP},
2153         {commonFunctionsGroup, "smoothstep", "smoothstep", {V4, V4, V4, V4}, attrSmall, 1, false, false, PRECMASK_ALL},
2154 
2155         {geometricFunctionsGroup, "length", "length", {F, VL, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2156         {geometricFunctionsGroup, "distance", "distance", {F, VL, VL, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2157         {geometricFunctionsGroup, "dot", "dot", {F, VL, VL, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2158         {geometricFunctionsGroup, "cross", "cross", {V3, V3, V3, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2159         {geometricFunctionsGroup, "normalize", "normalize", {VL, VL, N, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2160         {geometricFunctionsGroup,
2161          "faceforward",
2162          "faceforward",
2163          {VL, VL, VL, VL},
2164          attrNegPos,
2165          -1,
2166          false,
2167          false,
2168          PRECMASK_ALL},
2169         {geometricFunctionsGroup, "reflect", "reflect", {VL, VL, VL, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2170         {geometricFunctionsGroup, "refract", "refract", {VL, VL, VL, F}, attrNegPos, -1, false, false, PRECMASK_ALL},
2171 
2172         {matrixFunctionsGroup,
2173          "matrixCompMult",
2174          "matrixCompMult",
2175          {M, M, M, N},
2176          attrNegPos,
2177          -1,
2178          false,
2179          false,
2180          PRECMASK_ALL},
2181 
2182         {floatCompareGroup, "lessThan", "lessThan", {BV, FV, FV, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2183         {floatCompareGroup,
2184          "lessThanEqual",
2185          "lessThanEqual",
2186          {BV, FV, FV, N},
2187          attrNegPos,
2188          -1,
2189          false,
2190          false,
2191          PRECMASK_ALL},
2192         {floatCompareGroup, "greaterThan", "greaterThan", {BV, FV, FV, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2193         {floatCompareGroup,
2194          "greaterThanEqual",
2195          "greaterThanEqual",
2196          {BV, FV, FV, N},
2197          attrNegPos,
2198          -1,
2199          false,
2200          false,
2201          PRECMASK_ALL},
2202         {floatCompareGroup, "equal", "equal", {BV, FV, FV, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2203         {floatCompareGroup, "notEqual", "notEqual", {BV, FV, FV, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2204 
2205         {intCompareGroup, "lessThan", "lessThan", {BV, IV, IV, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2206         {intCompareGroup,
2207          "lessThanEqual",
2208          "lessThanEqual",
2209          {BV, IV, IV, N},
2210          attrNegPos,
2211          -1,
2212          false,
2213          false,
2214          PRECMASK_ALL},
2215         {intCompareGroup, "greaterThan", "greaterThan", {BV, IV, IV, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2216         {intCompareGroup,
2217          "greaterThanEqual",
2218          "greaterThanEqual",
2219          {BV, IV, IV, N},
2220          attrNegPos,
2221          -1,
2222          false,
2223          false,
2224          PRECMASK_ALL},
2225         {intCompareGroup, "equal", "equal", {BV, IV, IV, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2226         {intCompareGroup, "notEqual", "notEqual", {BV, IV, IV, N}, attrNegPos, -1, false, false, PRECMASK_ALL},
2227 
2228         {boolCompareGroup, "equal", "equal", {BV, BV, BV, N}, attrNegPos, -1, false, true, PRECMASK_MEDIUMP},
2229         {boolCompareGroup, "notEqual", "notEqual", {BV, BV, BV, N}, attrNegPos, -1, false, true, PRECMASK_MEDIUMP},
2230         {boolCompareGroup, "any", "any", {B, BV, N, N}, attrNegPos, -1, false, true, PRECMASK_MEDIUMP},
2231         {boolCompareGroup, "all", "all", {B, BV, N, N}, attrNegPos, -1, false, true, PRECMASK_MEDIUMP},
2232         {boolCompareGroup, "not", "not", {BV, BV, N, N}, attrNegPos, -1, false, true, PRECMASK_MEDIUMP}};
2233 
2234     // vertexSubGroup and fragmentSubGroup are the groups where the various vertex/fragment cases of a single function are added.
2235     // \note These are defined here so that different versions (different entries in the functionCaseGroups array) of the same function can be put in the same group.
2236     tcu::TestCaseGroup *vertexSubGroup   = DE_NULL;
2237     tcu::TestCaseGroup *fragmentSubGroup = DE_NULL;
2238     FunctionCase::InitialCalibrationStorage vertexSubGroupCalibrationStorage;
2239     FunctionCase::InitialCalibrationStorage fragmentSubGroupCalibrationStorage;
2240     for (int funcNdx = 0; funcNdx < DE_LENGTH_OF_ARRAY(functionCaseGroups); funcNdx++)
2241     {
2242         tcu::TestCaseGroup *const parentGroup = functionCaseGroups[funcNdx].parentGroup;
2243         const char *const groupName           = functionCaseGroups[funcNdx].groupName;
2244         const char *const groupFunc           = functionCaseGroups[funcNdx].func;
2245         const ValueType *const funcTypes      = functionCaseGroups[funcNdx].types;
2246         const Vec4 &groupAttribute            = functionCaseGroups[funcNdx].attribute;
2247         const int modifyParamNdx              = functionCaseGroups[funcNdx].modifyParamNdx;
2248         const bool useNearlyConstantInputs    = functionCaseGroups[funcNdx].useNearlyConstantInputs;
2249         const bool booleanCase                = functionCaseGroups[funcNdx].booleanCase;
2250         const PrecisionMask precMask          = functionCaseGroups[funcNdx].precMask;
2251 
2252         // If this is a new function and not just a different version of the previously defined function, create a new group.
2253         if (funcNdx == 0 || parentGroup != functionCaseGroups[funcNdx - 1].parentGroup ||
2254             string(groupName) != functionCaseGroups[funcNdx - 1].groupName)
2255         {
2256             tcu::TestCaseGroup *const funcGroup = new tcu::TestCaseGroup(m_testCtx, groupName, "");
2257             functionCaseGroups[funcNdx].parentGroup->addChild(funcGroup);
2258 
2259             vertexSubGroup   = new tcu::TestCaseGroup(m_testCtx, "vertex", "");
2260             fragmentSubGroup = new tcu::TestCaseGroup(m_testCtx, "fragment", "");
2261 
2262             funcGroup->addChild(vertexSubGroup);
2263             funcGroup->addChild(fragmentSubGroup);
2264 
2265             vertexSubGroupCalibrationStorage =
2266                 FunctionCase::InitialCalibrationStorage(new FunctionCase::InitialCalibration);
2267             fragmentSubGroupCalibrationStorage =
2268                 FunctionCase::InitialCalibrationStorage(new FunctionCase::InitialCalibration);
2269         }
2270 
2271         DE_ASSERT(vertexSubGroup != DE_NULL);
2272         DE_ASSERT(fragmentSubGroup != DE_NULL);
2273 
2274         // Find the type size range of parameters (e.g. from 2 to 4 in case of vectors).
2275         int genTypeFirstSize = 1;
2276         int genTypeLastSize  = 1;
2277 
2278         // Find the first return value or parameter with a gentype (if any) and set sizes accordingly.
2279         // \note Assumes all gentypes of one function share the same size range (the first one found decides it), e.g. no "genType func (vec param)"
2280         for (int i = 0; i < FunctionCase::MAX_PARAMS + 1 && genTypeLastSize == 1; i++)
2281         {
2282             switch (funcTypes[i])
2283             {
2284             case VALUE_FLOAT_VEC:
2285             case VALUE_BOOL_VEC:
2286             case VALUE_INT_VEC: // \note Fall-through.
2287                 genTypeFirstSize = 2;
2288                 genTypeLastSize  = 4;
2289                 break;
2290             case VALUE_FLOAT_VEC34:
2291                 genTypeFirstSize = 3;
2292                 genTypeLastSize  = 4;
2293                 break;
2294             case VALUE_FLOAT_GENTYPE:
2295             case VALUE_BOOL_GENTYPE:
2296             case VALUE_INT_GENTYPE: // \note Fall-through.
2297                 genTypeFirstSize = 1;
2298                 genTypeLastSize  = 4;
2299                 break;
2300             case VALUE_MATRIX:
2301                 genTypeFirstSize = 2;
2302                 genTypeLastSize  = 4;
2303                 break;
2304             // If none of the above, keep looping.
2305             default:
2306                 break;
2307             }
2308         }
2309 
2310         // Create a case for each possible size of the gentype.
2311         for (int curSize = genTypeFirstSize; curSize <= genTypeLastSize; curSize++)
2312         {
2313             // Determine specific types for return value and the parameters, according to curSize. Non-gentypes not affected by curSize.
2314             DataType types[FunctionCase::MAX_PARAMS + 1];
2315             for (int i = 0; i < FunctionCase::MAX_PARAMS + 1; i++)
2316             {
2317                 if (funcTypes[i] == VALUE_NONE)
2318                     types[i] = TYPE_INVALID;
2319                 else
2320                 {
2321                     int isFloat      = funcTypes[i] & VALUE_ANY_FLOAT;
2322                     int isBool       = funcTypes[i] & VALUE_ANY_BOOL;
2323                     int isInt        = funcTypes[i] & VALUE_ANY_INT;
2324                     int isMat        = funcTypes[i] == VALUE_MATRIX;
2325                     int inSize       = (funcTypes[i] & VALUE_ANY_GENTYPE) ? curSize :
2326                                        funcTypes[i] == VALUE_VEC3         ? 3 :
2327                                        funcTypes[i] == VALUE_VEC4         ? 4 :
2328                                                                             1;
2329                     int typeArrayNdx = isMat ? inSize - 2 : inSize - 1; // \note No matrices of size 1.
2330 
2331                     types[i] = isFloat ? floatTypes[typeArrayNdx] :
2332                                isBool  ? boolTypes[typeArrayNdx] :
2333                                isInt   ? intTypes[typeArrayNdx] :
2334                                isMat   ? matrixTypes[typeArrayNdx] :
2335                                          TYPE_LAST;
2336                 }
2337 
2338                 DE_ASSERT(types[i] != TYPE_LAST);
2339             }
2340 
2341             // Array for just the parameter types.
2342             DataType paramTypes[FunctionCase::MAX_PARAMS];
2343             for (int i = 0; i < FunctionCase::MAX_PARAMS; i++)
2344                 paramTypes[i] = types[i + 1];
2345 
2346             for (int prec = (int)PRECISION_LOWP; prec < (int)PRECISION_LAST; prec++)
2347             {
2348                 if ((precMask & (1 << prec)) == 0)
2349                     continue;
2350 
2351                 const string precisionPrefix = booleanCase ? "" : (string(getPrecisionName((Precision)prec)) + "_");
2352                 std::ostringstream caseName;
2353 
2354                 caseName << precisionPrefix;
2355 
2356                 // Write the name of each distinct parameter data type into the test case name.
2357                 for (int i = 1; i < FunctionCase::MAX_PARAMS + 1 && types[i] != TYPE_INVALID; i++)
2358                 {
2359                     if (i == 1 || types[i] != types[i - 1])
2360                     {
2361                         if (i > 1)
2362                             caseName << "_";
2363 
2364                         caseName << getDataTypeName(types[i]);
2365                     }
2366                 }
2367 
2368                 for (int fragI = 0; fragI <= 1; fragI++)
2369                 {
2370                     const bool vert                 = fragI == 0;
2371                     tcu::TestCaseGroup *const group = vert ? vertexSubGroup : fragmentSubGroup;
2372                     group->addChild(
2373                         new FunctionCase(m_context, caseName.str().c_str(), "", groupFunc, types[0], paramTypes,
2374                                          groupAttribute, modifyParamNdx, useNearlyConstantInputs, (Precision)prec, vert,
2375                                          vert ? vertexSubGroupCalibrationStorage : fragmentSubGroupCalibrationStorage));
2376                 }
2377             }
2378         }
2379     }
2380 }
2381 
2382 } // namespace Performance
2383 } // namespace gles2
2384 } // namespace deqp
2385