/*
 * Copyright 2012 Nouveau Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Christoph Bumiller
 */

#include "nvc0/nvc0_context.h"
#include "nvc0/nve4_compute.h"

#include "nv50_ir_driver.h"

#include "drf.h"
#include "qmd.h"
#include "cla0c0qmd.h"
#include "clc0c0qmd.h"
#include "clc3c0qmd.h"
#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)

int
nve4_screen_compute_setup(struct nvc0_screen *screen,
                          struct nouveau_pushbuf *push)
{
   int i;
   uint32_t obj_class = screen->compute->oclass;
   uint64_t address;

   BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
   PUSH_DATA (push, screen->compute->oclass);

   BEGIN_NVC0(push, NVE4_CP(TEMP_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->offset);
   /* No idea why there are 2. Divide size by 2 to be safe.
    * Actually this might be per-MP TEMP size and looks like I'm only using
    * 2 MPs instead of all 8.
    */
   BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(0)), 3);
   PUSH_DATAh(push, screen->tls->size / screen->mp_count);
   PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
   PUSH_DATA (push, 0xff);
   if (obj_class < GV100_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3);
      PUSH_DATAh(push, screen->tls->size / screen->mp_count);
      PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
      PUSH_DATA (push, 0xff);
   }

   /* Unified address space ? Who needs that ? Certainly not OpenCL.
    *
    * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
    * accessible. We cannot prevent that at the moment, so expect failure.
    */
   if (obj_class < GV100_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
      PUSH_DATA (push, 0xff << 24);
      BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
      PUSH_DATA (push, 0xfe << 24);

      BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, screen->text->offset);
      PUSH_DATA (push, screen->text->offset);
   } else {
      BEGIN_NVC0(push, SUBC_CP(0x2a0), 2);
      PUSH_DATAh(push, 0xfeULL << 24);
      PUSH_DATA (push, 0xfeULL << 24);
      BEGIN_NVC0(push, SUBC_CP(0x7b0), 2);
      PUSH_DATAh(push, 0xffULL << 24);
      PUSH_DATA (push, 0xffULL << 24);
   }

   BEGIN_NVC0(push, SUBC_CP(0x0310), 1);
   PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);

   /* NOTE: these do not affect the state used by the 3D object */
   BEGIN_NVC0(push, NVE4_CP(TIC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset);
   PUSH_DATA (push, screen->txc->offset);
   PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
   BEGIN_NVC0(push, NVE4_CP(TSC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset + 65536);
   PUSH_DATA (push, screen->txc->offset + 65536);
   PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);

   if (obj_class >= NVF0_COMPUTE_CLASS) {
      /* The blob calls GK110_COMPUTE.FIRMWARE[0x6], along with the args (0x1)
       * passed with GK110_COMPUTE.GRAPH.SCRATCH[0x2]. This is currently
       * disabled because our firmware doesn't support these commands and the
       * GPU hangs if they are used. */
      BEGIN_NIC0(push, SUBC_CP(0x0248), 64);
      for (i = 63; i >= 0; i--)
         PUSH_DATA(push, 0x38000 | i);
      IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
   }

   BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
   PUSH_DATA (push, 7); /* does not interfere with 3D */

   /* Disabling this UNK command avoids a read fault when using texelFetch()
    * from a compute shader for weird reasons.
   if (obj_class == NVF0_COMPUTE_CLASS)
      IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
   */

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);

   /* MS sample coordinate offsets: these do not work with _ALT modes ! */
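   /* The eight (x, y) pairs uploaded below appear to be the per-sample pixel
    * offsets exposed for up to 8x MSAA; each pair is two 32-bit words, hence
    * the 64-byte upload. */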
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
   PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 64);
   PUSH_DATA (push, 1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATA (push, 0); /* 0 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 1); /* 1 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0); /* 2 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 1); /* 3 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 2); /* 4 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 3); /* 5 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 2); /* 6 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 3); /* 7 */
   PUSH_DATA (push, 1);

#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 28);
   PUSH_DATA (push, 1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 8);
   PUSH_DATA (push, 1);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATA (push, screen->tls->offset);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->size / 2); /* MP TEMP block size */
   PUSH_DATA (push, screen->tls->size / 2 / 64); /* warp TEMP block size */
   PUSH_DATA (push, 0); /* warp cfstack size */
#endif

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   return 0;
}

static void
gm107_compute_validate_surfaces(struct nvc0_context *nvc0,
                                struct pipe_image_view *view, int slot)
{
   struct nv04_resource *res = nv04_resource(view->resource);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_bo *txc = nvc0->screen->txc;
   struct nv50_tic_entry *tic;
   uint64_t address;
   const int s = 5;

   tic = nv50_tic_entry(nvc0->images_tic[s][slot]);

   res = nv04_resource(tic->pipe.texture);
   nvc0_update_tic(nvc0, tic, res);

   if (tic->id < 0) {
      tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);

      /* upload the texture view */
      PUSH_SPACE(push, 16);
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, txc->offset + (tic->id * 32));
      PUSH_DATA (push, txc->offset + (tic->id * 32));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 32);
      PUSH_DATA (push, 1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, &tic->tic[0], 8);

      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), 1);
      PUSH_DATA (push, (tic->id << 4) | 1);
   } else
   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), 1);
      PUSH_DATA (push, (tic->id << 4) | 1);
   }
   nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);

   res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
   res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;

   BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   /* upload the texture handle */
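   /* Image views get their TIC id written into the auxiliary constant buffer
    * at TEX_INFO slot (slot + 32), i.e. after the regular texture handles
    * (assumption based on the offset used below). */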
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 2);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATA (push, tic->id);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static void
nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint64_t address;
   const int s = 5;
   int i, j;

   if (!nvc0->images_dirty[s])
      return;

   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   for (i = 0; i < NVC0_MAX_IMAGES; ++i) {
      struct pipe_image_view *view = &nvc0->images[s][i];

      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, address + NVC0_CB_AUX_SU_INFO(i));
      PUSH_DATA (push, address + NVC0_CB_AUX_SU_INFO(i));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 16 * 4);
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 16);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

      if (view->resource) {
         struct nv04_resource *res = nv04_resource(view->resource);

         if (res->base.target == PIPE_BUFFER) {
            if (view->access & PIPE_IMAGE_ACCESS_WRITE)
               nvc0_mark_image_range_valid(view);
         }

         nve4_set_surface_info(push, view, nvc0);
         BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);

         if (nvc0->screen->base.class_3d >= GM107_3D_CLASS)
            gm107_compute_validate_surfaces(nvc0, view, i);
      } else {
         for (j = 0; j < 16; j++)
            PUSH_DATA(push, 0);
      }
   }
}

/* Thankfully, textures with samplers follow the normal rules. */
static void
nve4_compute_validate_samplers(struct nvc0_context *nvc0)
{
   bool need_flush = nve4_validate_tsc(nvc0, 5);
   if (need_flush) {
      BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1);
      PUSH_DATA (nvc0->base.pushbuf, 0);
   }

   /* Invalidate all 3D samplers because they are aliased. */
   for (int s = 0; s < 5; s++)
      nvc0->samplers_dirty[s] = ~0;
   nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS;
}

/* (Code duplicated at bottom for various non-convincing reasons.
 * E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
 * entries to avoid a subchannel switch.
 * Same for texture cache flushes.
 * Also, the bufctx differs, and more IFs in the 3D version looks ugly.)
 */
static void nve4_compute_validate_textures(struct nvc0_context *);

static void
nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   uint64_t address;
   const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
   unsigned i, n;
   uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];

   if (!dirty)
      return;
   i = ffs(dirty) - 1;
   n = util_logbase2(dirty) + 1 - i;
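   /* i is the lowest dirty slot and n the number of slots up to and including
    * the highest dirty one, so a single linear upload covers the whole dirty
    * range. */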
   assert(n);

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, n * 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + n);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATAp(push, &nvc0->tex_handles[s][i], n);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   nvc0->textures_dirty[s] = 0;
   nvc0->samplers_dirty[s] = 0;
}

static void
nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const int s = 5;

   while (nvc0->constbuf_dirty[s]) {
      int i = ffs(nvc0->constbuf_dirty[s]) - 1;
      nvc0->constbuf_dirty[s] &= ~(1 << i);

      if (nvc0->constbuf[s][i].user) {
         struct nouveau_bo *bo = nvc0->screen->uniform_bo;
         const unsigned base = NVC0_CB_USR_INFO(s);
         const unsigned size = nvc0->constbuf[s][0].size;
         assert(i == 0); /* we really only want OpenGL uniforms here */
         assert(nvc0->constbuf[s][0].u.data);

         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
         PUSH_DATAh(push, bo->offset + base);
         PUSH_DATA (push, bo->offset + base);
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
         PUSH_DATA (push, size);
         PUSH_DATA (push, 0x1);
         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
         PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
      }
      else {
         struct nv04_resource *res =
            nv04_resource(nvc0->constbuf[s][i].u.buf);
         if (res) {
            uint64_t address
               = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

            /* constbufs above 0 are fetched via ubo info in the shader */
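            /* The four words written below (address low/high, size, padding)
             * presumably match the per-UBO record the shader reads from the
             * auxiliary constant buffer. */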
            if (i > 0) {
               BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
               PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
               PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
               BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
               PUSH_DATA (push, 4 * 4);
               PUSH_DATA (push, 0x1);
               BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
               PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

               PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
               PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
               PUSH_DATA (push, nvc0->constbuf[s][i].size);
               PUSH_DATA (push, 0);
            }

            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
            res->cb_bindings[s] |= 1 << i;
         }
      }
   }

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static void
nve4_compute_validate_buffers(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint64_t address;
   const int s = 5;
   int i;

   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
   PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

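   /* Each buffer slot takes four words in the aux constant buffer:
    * address low, address high, size, and a zero pad word. */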
   for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
      if (nvc0->buffers[s][i].buffer) {
         struct nv04_resource *res =
            nv04_resource(nvc0->buffers[s][i].buffer);
         PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
         PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
         PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
         PUSH_DATA (push, 0);
         BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
         util_range_add(&res->base, &res->valid_buffer_range,
                        nvc0->buffers[s][i].buffer_offset,
                        nvc0->buffers[s][i].buffer_offset +
                        nvc0->buffers[s][i].buffer_size);
      } else {
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
      }
   }
}

static struct nvc0_state_validate
validate_list_cp[] = {
   { nvc0_compprog_validate,           NVC0_NEW_CP_PROGRAM  },
   { nve4_compute_validate_textures,   NVC0_NEW_CP_TEXTURES },
   { nve4_compute_validate_samplers,   NVC0_NEW_CP_SAMPLERS },
   { nve4_compute_set_tex_handles,     NVC0_NEW_CP_TEXTURES |
                                       NVC0_NEW_CP_SAMPLERS },
   { nve4_compute_validate_surfaces,   NVC0_NEW_CP_SURFACES },
   { nvc0_compute_validate_globals,    NVC0_NEW_CP_GLOBALS  },
   { nve4_compute_validate_buffers,    NVC0_NEW_CP_BUFFERS  },
   { nve4_compute_validate_constbufs,  NVC0_NEW_CP_CONSTBUF },
};

static bool
nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
{
   bool ret;

   ret = nvc0_state_validate(nvc0, mask, validate_list_cp,
                             ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp,
                             nvc0->bufctx_cp);

   if (unlikely(nvc0->state.flushed))
      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
   return ret;
}

static void
nve4_compute_upload_input(struct nvc0_context *nvc0,
                          const struct pipe_grid_info *info)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_program *cp = nvc0->compprog;
   uint64_t address;

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);

   if (cp->parm_size) {
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, cp->parm_size);
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + DIV_ROUND_UP(cp->parm_size, 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAb(push, info->input, cp->parm_size);
   }
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO(0));
   PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO(0));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 8 * 4);
   PUSH_DATA (push, 0x1);

   if (unlikely(info->indirect)) {
      struct nv04_resource *res = nv04_resource(info->indirect);
      uint32_t offset = res->offset + info->indirect_offset;

      PUSH_SPACE_EX(push, 32, 0, 1);
      PUSH_REF1(push, res->bo, NOUVEAU_BO_RD | res->domain);
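      /* The block dimensions still come from the CPU-side info below, but the
       * three grid dimension words are copied straight from the indirect
       * buffer via a pushbuf IB entry (NO_PREFETCH presumably keeps that copy
       * ordered behind earlier GPU writes to the buffer). */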

      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, info->block, 3);
      nouveau_pushbuf_data(push, res->bo, offset,
                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
   } else {
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, info->block, 3);
      PUSH_DATAp(push, info->grid, 3);
   }
   PUSH_DATA (push, 0);
   PUSH_DATA (push, info->work_dim);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static inline void
gp100_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                            struct nouveau_bo *bo, uint32_t base, uint32_t size)
{
   uint64_t address = bo->offset + base;

   assert(index < 8);
   assert(!(base & 0xff));

   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
                            DIV_ROUND_UP(size, 16));
   NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static inline void
nve4_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, struct nouveau_bo *bo,
                           uint32_t base, uint32_t size)
{
   uint64_t address = bo->offset + base;

   assert(index < 8);
   assert(!(base & 0xff));

   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_SIZE, index, size);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static void
nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
{
   // only user constant buffers 0-6 can be put in the descriptor, the rest are
   // loaded through global memory
   for (int i = 0; i <= 6; i++) {
      if (nvc0->constbuf[5][i].user || !nvc0->constbuf[5][i].u.buf)
         continue;

      struct nv04_resource *res =
         nv04_resource(nvc0->constbuf[5][i].u.buf);

      uint32_t base = res->offset + nvc0->constbuf[5][i].offset;
      uint32_t size = nvc0->constbuf[5][i].size;
      if (gp100)
         gp100_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
      else
         nve4_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
   }

   // there is no need to do FLUSH(NVE4_COMPUTE_FLUSH_CB) because
   // nve4_compute_upload_input() does it later
}

static void
nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
                               const struct pipe_grid_info *info)
{
   const struct nvc0_screen *screen = nvc0->screen;
   const struct nvc0_program *cp = nvc0->compprog;
   uint32_t shared_size = cp->cp.smem_size + info->variable_shared_mem;

   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_DATA_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_DATA_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
   NVA0C0_QMDV00_06_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SASS_VERSION, 0x30);

   NVA0C0_QMDV00_06_VAL_SET(qmd, PROGRAM_OFFSET, cp->code_base);

   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);

   NVA0C0_QMDV00_06_VAL_SET(qmd, SHARED_MEMORY_SIZE, align(shared_size, 0x100));
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, cp->hdr[1] & 0xfffff0);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);

   if (shared_size > (32 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                               DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
   else
   if (shared_size > (16 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                               DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB);
   else
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                               DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB);

   NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
   NVA0C0_QMDV00_06_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor because UBOs are routed through the driver cb to avoid
   // the limitation of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      nve4_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                 NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
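   // CB 7 always points at the driver's auxiliary constants
   // (NVC0_CB_AUX_INFO); the 2 KiB size presumably matches the aux layout
   // written by the upload helpers above (assumption based on 1 << 11).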
   nve4_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                              NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, false, qmd);
}

static void
gp100_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
                                const struct pipe_grid_info *info)
{
   const struct nvc0_screen *screen = nvc0->screen;
   const struct nvc0_program *cp = nvc0->compprog;
   uint32_t shared_size = cp->cp.smem_size + info->variable_shared_mem;

   NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC0C0_QMDV02_01_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
   NVC0C0_QMDV02_01_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
   NVC0C0_QMDV02_01_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);

   NVC0C0_QMDV02_01_VAL_SET(qmd, PROGRAM_OFFSET, cp->code_base);

   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);

   NVC0C0_QMDV02_01_VAL_SET(qmd, SHARED_MEMORY_SIZE, align(shared_size, 0x100));
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, cp->hdr[1] & 0xfffff0);
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);

   NVC0C0_QMDV02_01_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
   NVC0C0_QMDV02_01_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor because UBOs are routed through the driver cb to avoid
   // the limitation of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                  NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, true, qmd);
}

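/* Round a shared memory requirement up to the next SM configuration Volta
 * supports (8/16/32/64/96 KiB) and encode it the way the QMD expects,
 * apparently as (size / 4 KiB) + 1. */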
static int
gv100_sm_config_smem_size(u32 size)
{
   if (size > 64 * 1024) size = 96 * 1024;
   else if (size > 32 * 1024) size = 64 * 1024;
   else if (size > 16 * 1024) size = 32 * 1024;
   else if (size > 8 * 1024) size = 16 * 1024;
   else size = 8 * 1024;
   return (size / 4096) + 1;
}

static void
gv100_compute_setup_launch_desc(struct nvc0_context *nvc0, u32 *qmd,
                                const struct pipe_grid_info *info)
{
   struct nvc0_program *cp = nvc0->compprog;
   struct nvc0_screen *screen = nvc0->screen;
   uint64_t entry = screen->text->offset + cp->code_base;
   uint32_t shared_size = cp->cp.smem_size + info->variable_shared_mem;

   NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
   NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY);
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE, align(shared_size, 0x100));
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, cp->hdr[1] & 0xfffff0);
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(8 * 1024));
   NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(96 * 1024));
   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2);
   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2);
   NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(shared_size));

   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, cp->num_gprs);
   NVC3C0_QMDV02_02_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor because UBOs are routed through the driver cb to avoid
   // the limitation of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                  NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, true, qmd);

   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, entry & 0xffffffff);
   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, entry >> 32);
}

static inline void *
nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
                               struct nouveau_bo **pbo, uint64_t *pgpuaddr)
{
   uint8_t *ptr = nouveau_scratch_get(nv, 512, pgpuaddr, pbo);
   if (!ptr)
      return NULL;
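   /* 512 bytes were requested above so there is always room to bump the
    * returned pointer up to the next 256-byte boundary, which the hardware
    * appears to require for the QMD (it is programmed as addr >> 8 later). */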
   if (*pgpuaddr & 255) {
      unsigned adj = 256 - (*pgpuaddr & 255);
      ptr += adj;
      *pgpuaddr += adj;
   }
   memset(ptr, 0x00, 256);
   return ptr;
}

static void
nve4_upload_indirect_desc(struct nouveau_pushbuf *push,
                          struct nv04_resource *res, uint64_t gpuaddr,
                          uint32_t length, uint32_t bo_offset)
{
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, gpuaddr);
   PUSH_DATA (push, gpuaddr);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, length);
   PUSH_DATA (push, 1);

   PUSH_SPACE_EX(push, 32, 0, 1);
   PUSH_REF1(push, res->bo, NOUVEAU_BO_RD | res->domain);

   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (length / 4));
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
   nouveau_pushbuf_data(push, res->bo, bo_offset,
                        NVC0_IB_ENTRY_1_NO_PREFETCH | length);
}

void
nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   void *desc;
   uint64_t desc_gpuaddr;
   struct nouveau_bo *desc_bo;
   int ret;

   desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr);
   if (!desc) {
      ret = -1;
      goto out;
   }
   BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
                desc_bo);

   list_for_each_entry(struct nvc0_resident, resident, &nvc0->tex_head, list) {
      nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS, resident->buf,
                        resident->flags);
   }

   list_for_each_entry(struct nvc0_resident, resident, &nvc0->img_head, list) {
      nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS, resident->buf,
                        resident->flags);
   }

   simple_mtx_lock(&screen->state_lock);
   ret = !nve4_state_validate_cp(nvc0, ~0);
   if (ret)
      goto out_unlock;

   if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
      gv100_compute_setup_launch_desc(nvc0, desc, info);
   else
   if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
      gp100_compute_setup_launch_desc(nvc0, desc, info);
   else
      nve4_compute_setup_launch_desc(nvc0, desc, info);

   nve4_compute_upload_input(nvc0, info);

#ifndef NDEBUG
   if (debug_get_num_option("NV50_PROG_DEBUG", 0)) {
      debug_printf("Queue Meta Data:\n");
      if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
         NVC3C0QmdDump_V02_02(desc);
      else
      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
         NVC0C0QmdDump_V02_01(desc);
      else
         NVA0C0QmdDump_V00_06(desc);
   }
#endif

   if (unlikely(info->indirect)) {
      struct nv04_resource *res = nv04_resource(info->indirect);
      uint32_t offset = res->offset + info->indirect_offset;

      /* upload the descriptor */
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, desc_gpuaddr);
      PUSH_DATA (push, desc_gpuaddr);
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 256);
      PUSH_DATA (push, 1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);

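      /* Then patch the grid dimensions in the copy just uploaded with the
       * values from the indirect buffer; byte offset 48 is presumably where
       * the CTA raster (grid) size words sit in both QMD layouts below. */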
      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS) {
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 12, offset);
      } else {
         /* overwrite griddim_x and griddim_y as two 32-bit integers even
          * if griddim_y must be a 16-bit integer */
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 8, offset);

         /* overwrite the 16 high bits of griddim_y with griddim_z because
          * we need (z << 16) | y */
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 54, 4, offset + 8);
      }
   }

   /* upload descriptor and flush */
   PUSH_SPACE_EX(push, 32, 1, 0);
   PUSH_REF1(push, screen->text, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
   PUSH_DATA (push, desc_gpuaddr >> 8);
   if (screen->compute->oclass < GA102_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
      PUSH_DATA (push, 0x3);
   } else {
      BEGIN_NIC0(push, SUBC_CP(0x02c0), 2);
      PUSH_DATA (push, 1);
      PUSH_DATA (push, 2);
   }
   BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
   PUSH_DATA (push, 0);

   nvc0_update_compute_invocations_counter(nvc0, info);

out_unlock:
   PUSH_KICK(push);
   simple_mtx_unlock(&screen->state_lock);

out:
   if (ret)
      NOUVEAU_ERR("Failed to launch grid !\n");
   nouveau_scratch_done(&nvc0->base);
   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_DESC);
   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS);
}


#define NVE4_TIC_ENTRY_INVALID 0x000fffff

static void
nve4_compute_validate_textures(struct nvc0_context *nvc0)
{
   struct nouveau_bo *txc = nvc0->screen->txc;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const unsigned s = 5;
   unsigned i;
   uint32_t commands[2][32];
   unsigned n[2] = { 0, 0 };
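   /* commands[0] collects TIC_FLUSH words for newly uploaded TICs,
    * commands[1] collects TEX_CACHE_CTL words for TICs whose backing
    * storage was written by the GPU. */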
   for (i = 0; i < nvc0->num_textures[s]; ++i) {
      struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
      struct nv04_resource *res;
      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));

      if (!tic) {
         nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
         continue;
      }
      res = nv04_resource(tic->pipe.texture);
      nvc0_update_tic(nvc0, tic, res);

      if (tic->id < 0) {
         tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);

         PUSH_SPACE(push, 16);
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
         PUSH_DATAh(push, txc->offset + (tic->id * 32));
         PUSH_DATA (push, txc->offset + (tic->id * 32));
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
         PUSH_DATA (push, 32);
         PUSH_DATA (push, 1);
         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
         PUSH_DATAp(push, &tic->tic[0], 8);

         commands[0][n[0]++] = (tic->id << 4) | 1;
      } else
      if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
         commands[1][n[1]++] = (tic->id << 4) | 1;
      }
      nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);

      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
      res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;

      nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID;
      nvc0->tex_handles[s][i] |= tic->id;
      if (dirty)
         BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
   }
   for (; i < nvc0->state.num_textures[s]; ++i) {
      nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
      nvc0->textures_dirty[s] |= 1 << i;
   }

   if (n[0]) {
      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), n[0]);
      PUSH_DATAp(push, commands[0], n[0]);
   }
   if (n[1]) {
      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), n[1]);
      PUSH_DATAp(push, commands[1], n[1]);
   }

   nvc0->state.num_textures[s] = nvc0->num_textures[s];

   /* Invalidate all 3D textures because they are aliased. */
   for (int s = 0; s < 5; s++) {
      for (int i = 0; i < nvc0->num_textures[s]; i++)
         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i));
      nvc0->textures_dirty[s] = ~0;
   }
   nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
}

#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
static void
nve4_compute_trap_info(struct nvc0_context *nvc0)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_bo *bo = screen->parm;
   int ret, i;
   volatile struct nve4_mp_trap_info *info;
   uint8_t *map;

   ret = BO_MAP(&screen->base, bo, NOUVEAU_BO_RDWR, nvc0->base.client);
   if (ret)
      return;
   map = (uint8_t *)bo->map;
   info = (volatile struct nve4_mp_trap_info *)(map + NVE4_CP_PARAM_TRAP_INFO);

   if (info->lock) {
      debug_printf("trapstat = %08x\n", info->trapstat);
      debug_printf("warperr = %08x\n", info->warperr);
      debug_printf("PC = %x\n", info->pc);
      debug_printf("tid = %u %u %u\n",
                   info->tid[0], info->tid[1], info->tid[2]);
      debug_printf("ctaid = %u %u %u\n",
                   info->ctaid[0], info->ctaid[1], info->ctaid[2]);
      for (i = 0; i <= 63; ++i)
         debug_printf("$r%i = %08x\n", i, info->r[i]);
      for (i = 0; i <= 6; ++i)
         debug_printf("$p%i = %i\n", i, (info->flags >> i) & 1);
      debug_printf("$c = %x\n", info->flags >> 12);
   }
   info->lock = 0;
}
#endif