/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include "app.h"
#include "stats.h"
#include "utils.h"

namespace gpuinfo {

// Textures are drastically different from buffers in terms of data layout.
// While buffers are a contiguous range of memory, textures are opaque objects
// defined by the vendor, and nearby data points are not necessarily
// neighboring in memory. Likewise, data points are accessed in
// multi-dimensional patches instead of simple lines. This makes the stride
// method for figuring out the cache line size inapplicable. To work around
// this, this experiment runs an increasing number of threads accessing
// different data points in the texture and measures latency. If the cache
// line is big enough to contain all the data requested by the threads,
// latency will be low. When there are more threads, and hence more data, than
// a single cache line can serve, a second line must be fetched, increasing
// latency in a measurable way.
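//
// For illustration (hypothetical numbers): with 16 B texels, if the latency
// jump is first detected at nthread = 5, the last latency-flat thread count
// is 4, so the reported optimal concurrency is 4 * 16 B = 64 B of texel data
// served by a single cache line.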
void tex_cacheline_concurr(const App& app) {
  if (!app.enabled("tex_cacheline_concurr")) {
    std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl;
    return;
  }

  const uint32_t TEXEL_WIDTH = 4;
  const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH;
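  // Each texel holds TEXEL_WIDTH floats: 4 * 4 B = 16 B.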

  const double COMPENSATE =
      app.get_config("tex_cacheline_concurr", "compensate");
  const double THRESHOLD = app.get_config("tex_cacheline_concurr", "threshold");

  for (int dim = 0; dim < 3; ++dim) {
    std::cout << std::endl;
    std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim
              << ") ------" << std::endl;

    uint32_t NITER;

    const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width
        : dim == 1                           ? app.max_tex_height
                                             : app.max_tex_depth;

    const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE);
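    // Each thread reads its own texel along the probed dimension, so the
    // thread count is capped by both the logical thread count and the texture
    // extent in that dimension.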

    auto bench = [&](uint32_t nthread) {
      std::vector<int64_t> sizes_whd = {
          app.max_tex_width, app.max_tex_height, app.max_tex_depth};

      auto sizes_nchw = whd_to_nchw(sizes_whd);

      vTensor in_tensor =
          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

      StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);

      vkapi::PipelineBarrier pipeline_barrier{};

      auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);

      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
        context()->submit_compute_job(
            VK_KERNEL_FROM_STR(shader_name),
            pipeline_barrier,
            {nthread, 1, 1},
            {nthread, 1, 1},
            {SV(NITER)},
            VK_NULL_HANDLE,
            0,
            in_tensor.image(),
            out_buf.buffer());
      });
      return time;
    };

    ensure_min_niter(1000, NITER, [&]() { return bench(1); });
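    // Calibrate NITER (declared above) so that a single-thread run takes long
    // enough to measure reliably; see ensure_min_niter in utils.h.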

    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
    uint32_t nthread = 1;
    for (; nthread <= MAX_NTHREAD; ++nthread) {
      double time = bench(nthread);
      std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
                << std::endl;

      if (dj.push(time)) {
        auto max_concurrency = nthread - 1;
        std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
                  << max_concurrency * TEXEL_SIZE << std::endl;
        break;
      }
    }
    if (nthread >= MAX_NTHREAD) {
      std::cout
          << "Unable to conclude an optimal texture cacheline concurrency for dim "
          << dim << std::endl;
    }
  }

  // TODO: Use concurrency information to obtain the cache line size for
  // textures as done in https://fburl.com/98xiou3g
}

void tex_bandwidth(const App& app) {
  if (!app.enabled("tex_bandwidth")) {
    std::cout << "Skipped Texture Bandwidth" << std::endl;
    return;
  }

  for (int dim = 0; dim < 3; dim++) {
    std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
              << std::endl;
    const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width
        : dim == 1                     ? app.max_tex_height
                                       : app.max_tex_depth;

    // rgba, float
    const uint32_t VEC_WIDTH = 4;
    const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
    const uint32_t NVEC = MAX_SIZE;

    const uint32_t RANGE = NVEC * VEC_SIZE;
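    // Total footprint along this dimension in bytes; access sizes are swept
    // from a single texel (VEC_SIZE) up to this range, doubling each step.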

    // Cache lines flushed
    const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush");
    // Number of loop unrolls. Changing this value requires an equal change in
    // tex_bandwidth.yaml
    const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll");
    // Number of iterations. Increasing this value reduces noise in exchange
    // for higher latency.
    const uint32_t NITER = app.get_config("tex_bandwidth", "niter");
    // Number of memory reads per thread
    const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
    // Number of threads needed to read all texels
    const uint32_t NTHREAD = NVEC;
    // Occupy all threads
    const uint32_t local_x = app.nthread_logic;
    // Ensure that global is a multiple of local, and distribute across all
    // SMs
    const uint32_t global_x =
        (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH;
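    // For example (hypothetical values): with NTHREAD = 4096, local_x = 512,
    // sm_count = 4, and NFLUSH = 4, global_x = (4096 / 512 * 512) * 4 * 4 =
    // 65536 invocations.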

    auto shader_name = "tex_bandwidth_" + std::to_string(dim);

    std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
    if (dim == 1) {
      sizes_whd = {1, MAX_SIZE, 1};
    } else if (dim == 2) {
      sizes_whd = {1, 1, MAX_SIZE};
    }
    auto sizes_nchw = whd_to_nchw(sizes_whd);

    vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

    auto bench = [&](uint32_t access_size, uint32_t dim) {
      // Number of texels that fit in this iteration
      const uint32_t ntexel_access = access_size / VEC_SIZE;

      // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
      // This limits the accesses to a specific set of unique addresses,
      // depending on the access size we want to measure.
      const uint32_t addr_mask = ntexel_access - 1;
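      // For example, an access_size of 128 B spans 128 / 16 = 8 texels, so
      // addr_mask = 0b111 and every computed index is folded onto those 8
      // unique texels.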

      // This is to distribute the accesses to unique addresses across the
      // workgroups, once the size of the access exceeds the workgroup width.
      const uint32_t workgroup_width = local_x * NITER * NUNROLL;

      StagingBuffer out_buf(
          context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
      vkapi::PipelineBarrier pipeline_barrier{};

      auto time = benchmark_on_gpu(shader_name, 10, [&]() {
        context()->submit_compute_job(
            VK_KERNEL_FROM_STR(shader_name),
            pipeline_barrier,
            {global_x, 1, 1},
            {local_x, 1, 1},
            {SV(NITER),
             SV(ntexel_access),
             SV(local_x),
             SV(addr_mask),
             SV(workgroup_width)},
            VK_NULL_HANDLE,
            0,
            in_tensor.image(),
            out_buf.buffer());
      });

      const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
      double gbps = SIZE_TRANS * 1e-3 / time;
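      // Unit check: SIZE_TRANS is in bytes and time is in microseconds, so
      // bytes * 1e-3 / us == KB/us == GB/s.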
      std::cout << "Texture bandwidth accessing \t" << access_size
                << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
                << "\tus)" << std::endl;
      return gbps;
    };

    double max_bandwidth = 0;
    double min_bandwidth = DBL_MAX;
    for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
         access_size *= 2) {
      double gbps = bench(access_size, dim);
      max_bandwidth = std::max(gbps, max_bandwidth);
      min_bandwidth = std::min(gbps, min_bandwidth);
    }

    std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
              << std::endl;
    std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
              << std::endl;
  }
}
} // namespace gpuinfo