/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include "app.h"
#include "stats.h"
#include "utils.h"

namespace gpuinfo {

// Textures are drastically different from buffers in terms of data layout.
// While buffers are a contiguous range of memory, textures are opaque objects
// defined by the vendor, and nearby data points are not necessarily
// neighboring in memory. Likewise, data points are accessed in
// multi-dimensional patches instead of simple lines, which makes the stride
// method for determining the cache line size inapplicable. To work around
// this, this experiment runs an increasing number of threads accessing
// different data points in the texture and measures latency. If the cache
// line is big enough to contain all the data requested by those threads,
// latency stays low. Once there are more threads, and hence more data, than a
// single cache line can serve, a second line must be fetched, increasing
// latency in a measurable way.
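//
// As an illustration (hypothetical numbers, not measurements): if each texel
// is 16 B (rgba32f) and a cache line holds 64 B, threads 1 through 4 hit the
// same line and report similar latency; at 5 threads a second line must be
// fetched, the jump detector below fires, and the reported optimal
// concurrency would be 4 texels, i.e. 64 B.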
void tex_cacheline_concurr(const App& app) {
  if (!app.enabled("tex_cacheline_concurr")) {
    std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl;
    return;
  }

  const uint32_t TEXEL_WIDTH = 4;
  const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH;

  const double COMPENSATE =
      app.get_config("tex_cacheline_concurr", "compensate");
  const double THRESHOLD = app.get_config("tex_cacheline_concurr", "threshold");

  for (int dim = 0; dim < 3; ++dim) {
    std::cout << std::endl;
    std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim
              << ") ------" << std::endl;

    uint32_t NITER;

    const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width
                                  : dim == 1 ? app.max_tex_height
                                             : app.max_tex_depth;

    const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE);

    auto bench = [&](uint32_t nthread) {
      std::vector<int64_t> sizes_whd = {
          app.max_tex_width, app.max_tex_height, app.max_tex_depth};

      auto sizes_nchw = whd_to_nchw(sizes_whd);

      vTensor in_tensor =
          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

      StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);

      vkapi::PipelineBarrier pipeline_barrier{};

      auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);

      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
        context()->submit_compute_job(
            VK_KERNEL_FROM_STR(shader_name),
            pipeline_barrier,
            {nthread, 1, 1},
            {nthread, 1, 1},
            {SV(NITER)},
            VK_NULL_HANDLE,
            0,
            in_tensor.image(),
            out_buf.buffer());
      });
      return time;
    };

    ensure_min_niter(1000, NITER, [&]() { return bench(1); });

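    // DtJumpFinder (presumably defined in stats.h) acts as the jump detector:
    // dj.push(time) returns true once the latest timing jumps beyond what the
    // COMPENSATE/THRESHOLD settings allow, which is taken as the point where a
    // second cache line starts being fetched.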
    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
    uint32_t nthread = 1;
    for (; nthread <= MAX_NTHREAD; ++nthread) {
      double time = bench(nthread);
      std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
                << std::endl;

      if (dj.push(time)) {
        auto max_concurrency = nthread - 1;
        std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
                  << max_concurrency * TEXEL_SIZE << std::endl;
        break;
      }
    }
    if (nthread >= MAX_NTHREAD) {
      std::cout
          << "Unable to conclude an optimal texture cacheline concurrency for dim "
          << dim << std::endl;
    }
  }

  // TODO: Use concurrency information to obtain the cache line size for
  // textures as done in https://fburl.com/98xiou3g
}

void tex_bandwidth(const App& app) {
  if (!app.enabled("tex_bandwidth")) {
    std::cout << "Skipped Texture Bandwidth" << std::endl;
    return;
  }

  for (int dim = 0; dim < 3; dim++) {
    std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
              << std::endl;
    const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width
                            : dim == 1 ? app.max_tex_height
                                       : app.max_tex_depth;

    // rgba, float
    const uint32_t VEC_WIDTH = 4;
    const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
    const uint32_t NVEC = MAX_SIZE;

    const uint32_t RANGE = NVEC * VEC_SIZE;

    // Cache lines flushed
    const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush");
    // Number of loop unrolls. Changing this value requires an equal change in
    // tex_bandwidth.yaml
    const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll");
    // Number of iterations. Increasing this value reduces noise in exchange
    // for higher latency.
    const uint32_t NITER = app.get_config("tex_bandwidth", "niter");
    // Number of memory reads per thread
    const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
    // Number of threads needed to read all texels
    const uint32_t NTHREAD = NVEC;
    // Occupy all threads
    const uint32_t local_x = app.nthread_logic;
    // Ensure that global is a multiple of local, and distribute across all
    // SMs
    const uint32_t global_x =
        (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH;
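    // For instance (illustrative numbers, not from any particular device):
    // with NTHREAD = 2048, local_x = 256, sm_count = 4 and NFLUSH = 4, the
    // rounding yields 2048 / 256 * 256 = 2048, so
    // global_x = 2048 * 4 * 4 = 32768 invocations.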

    auto shader_name = "tex_bandwidth_" + std::to_string(dim);

    std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
    if (dim == 1) {
      sizes_whd = {1, MAX_SIZE, 1};
    } else if (dim == 2) {
      sizes_whd = {1, 1, MAX_SIZE};
    }
    auto sizes_nchw = whd_to_nchw(sizes_whd);

    vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

    auto bench = [&](uint32_t access_size, uint32_t dim) {
      // Number of texels that fit in this iteration
      const uint32_t ntexel_access = access_size / VEC_SIZE;

      // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
      // It restricts the accesses to a specific set of unique addresses whose
      // size depends on the access size we want to measure.
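      //
      // For example (assuming access_size = 128 B and VEC_SIZE = 16 B):
      // ntexel_access = 8, addr_mask = 7 = 0b111, and every texel index is
      // wrapped to index & 0b111, so only 8 unique texels are ever touched.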
      const uint32_t addr_mask = ntexel_access - 1;

      // This distributes the accesses to unique addresses across the
      // workgroups once the size of the access exceeds the workgroup width.
      const uint32_t workgroup_width = local_x * NITER * NUNROLL;

      StagingBuffer out_buf(
          context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
      vkapi::PipelineBarrier pipeline_barrier{};

      auto time = benchmark_on_gpu(shader_name, 10, [&]() {
        context()->submit_compute_job(
            VK_KERNEL_FROM_STR(shader_name),
            pipeline_barrier,
            {global_x, 1, 1},
            {local_x, 1, 1},
            {SV(NITER),
             SV(ntexel_access),
             SV(local_x),
             SV(addr_mask),
             SV(workgroup_width)},
            VK_NULL_HANDLE,
            0,
            in_tensor.image(),
            out_buf.buffer());
      });

      const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
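      // SIZE_TRANS is in bytes and time is in microseconds, and
      // 1 byte/us = 1e-3 GB/s, hence the 1e-3 factor in the conversion below.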
      double gbps = SIZE_TRANS * 1e-3 / time;
      std::cout << "Texture bandwidth accessing \t" << access_size
                << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
                << "\tus)" << std::endl;
      return gbps;
    };

    double max_bandwidth = 0;
    double min_bandwidth = DBL_MAX;
    for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
         access_size *= 2) {
      double gbps = bench(access_size, dim);
      max_bandwidth = std::max(gbps, max_bandwidth);
      min_bandwidth = std::min(gbps, min_bandwidth);
    }

    std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
              << std::endl;
    std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
              << std::endl;
  }
}
} // namespace gpuinfo