/*
 * Copyright 2012 Nouveau Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Christoph Bumiller
 */

#include "nvc0/nvc0_context.h"
#include "nvc0/nve4_compute.h"

#include "nv50_ir_driver.h"

#include "drf.h"
#include "qmd.h"
#include "cla0c0qmd.h"
#include "clc0c0qmd.h"
#include "clc3c0qmd.h"

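/* Shorthands for setting MW (presumably multi-word) bitfields in the QMD
 * ("Queue Meta Data", the compute launch descriptor), one pair of macros per
 * QMD layout version used below. */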
#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)

int
nve4_screen_compute_setup(struct nvc0_screen *screen,
                          struct nouveau_pushbuf *push)
{
   int i;
   uint32_t obj_class = screen->compute->oclass;
   uint64_t address;

   BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
   PUSH_DATA (push, screen->compute->oclass);

   BEGIN_NVC0(push, NVE4_CP(TEMP_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->offset);
   /* No idea why there are 2. Divide size by 2 to be safe.
    * Actually this might be per-MP TEMP size and looks like I'm only using
    * 2 MPs instead of all 8.
    */
   BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(0)), 3);
   PUSH_DATAh(push, screen->tls->size / screen->mp_count);
   PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
   PUSH_DATA (push, 0xff);
   if (obj_class < GV100_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3);
      PUSH_DATAh(push, screen->tls->size / screen->mp_count);
      PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
      PUSH_DATA (push, 0xff);
   }

   /* Unified address space? Who needs that? Certainly not OpenCL.
    *
    * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
    *  accessible. We cannot prevent that at the moment, so expect failure.
    */
   if (obj_class < GV100_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
      PUSH_DATA (push, 0xff << 24);
      BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
      PUSH_DATA (push, 0xfe << 24);

      BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, screen->text->offset);
      PUSH_DATA (push, screen->text->offset);
   } else {
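      /* GV100 presumably moved the shared/local window setup to these
       * unnamed methods, now taking 64-bit addresses: 0x2a0 gets
       * SHARED_BASE's 0xfe << 24 and 0x7b0 gets LOCAL_BASE's 0xff << 24. */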
      BEGIN_NVC0(push, SUBC_CP(0x2a0), 2);
      PUSH_DATAh(push, 0xfeULL << 24);
      PUSH_DATA (push, 0xfeULL << 24);
      BEGIN_NVC0(push, SUBC_CP(0x7b0), 2);
      PUSH_DATAh(push, 0xffULL << 24);
      PUSH_DATA (push, 0xffULL << 24);
   }

   BEGIN_NVC0(push, SUBC_CP(0x0310), 1);
   PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);

   /* NOTE: these do not affect the state used by the 3D object */
   BEGIN_NVC0(push, NVE4_CP(TIC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset);
   PUSH_DATA (push, screen->txc->offset);
   PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
   BEGIN_NVC0(push, NVE4_CP(TSC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset + 65536);
   PUSH_DATA (push, screen->txc->offset + 65536);
   PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);

   if (obj_class >= NVF0_COMPUTE_CLASS) {
      /* The blob calls GK110_COMPUTE.FIRMWARE[0x6], along with the args (0x1)
       * passed with GK110_COMPUTE.GRAPH.SCRATCH[0x2]. This is currently
       * disabled because our firmware doesn't support these commands and the
       * GPU hangs if they are used. */
      BEGIN_NIC0(push, SUBC_CP(0x0248), 64);
      for (i = 63; i >= 0; i--)
         PUSH_DATA(push, 0x38000 | i);
      IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
   }

   BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
   PUSH_DATA (push, 7); /* does not interfere with 3D */

   /* For unknown reasons, disabling this UNK command avoids a read fault
    * when using texelFetch() from a compute shader.
   if (obj_class == NVF0_COMPUTE_CLASS)
      IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
    */

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);

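   /* The UPLOAD_* methods perform an inline copy: set the destination
    * address and extent, then stream the payload through UPLOAD_EXEC. */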
   /* MS sample coordinate offsets: these do not work with _ALT modes! */
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
   PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 64);
   PUSH_DATA (push, 1);
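   /* Eight (x, y) sample offset pairs follow, numbered 0-7 below. */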
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATA (push, 0); /* 0 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 1); /* 1 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0); /* 2 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 1); /* 3 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 2); /* 4 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 3); /* 5 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 2); /* 6 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 3); /* 7 */
   PUSH_DATA (push, 1);

#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 28);
   PUSH_DATA (push, 1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 8);
   PUSH_DATA (push, 1);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATA (push, screen->tls->offset);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->size / 2); /* MP TEMP block size */
   PUSH_DATA (push, screen->tls->size / 2 / 64); /* warp TEMP block size */
   PUSH_DATA (push, 0); /* warp cfstack size */
#endif

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   return 0;
}

static void
gm107_compute_validate_surfaces(struct nvc0_context *nvc0,
                                struct pipe_image_view *view, int slot)
{
   struct nv04_resource *res = nv04_resource(view->resource);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_bo *txc = nvc0->screen->txc;
   struct nv50_tic_entry *tic;
   uint64_t address;
   const int s = 5;

   tic = nv50_tic_entry(nvc0->images_tic[s][slot]);

   res = nv04_resource(tic->pipe.texture);
   nvc0_update_tic(nvc0, tic, res);

   if (tic->id < 0) {
      tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);

      /* upload the texture view */
      PUSH_SPACE(push, 16);
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, txc->offset + (tic->id * 32));
      PUSH_DATA (push, txc->offset + (tic->id * 32));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 32);
      PUSH_DATA (push, 1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, &tic->tic[0], 8);

      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), 1);
      PUSH_DATA (push, (tic->id << 4) | 1);
   } else
   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), 1);
      PUSH_DATA (push, (tic->id << 4) | 1);
   }
   nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);

   res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
   res->status |=  NOUVEAU_BUFFER_STATUS_GPU_READING;

   BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   /* upload the texture handle */
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 2);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATA (push, tic->id);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static void
nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint64_t address;
   const int s = 5;
   int i, j;

   if (!nvc0->images_dirty[s])
      return;

   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   for (i = 0; i < NVC0_MAX_IMAGES; ++i) {
      struct pipe_image_view *view = &nvc0->images[s][i];

      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, address + NVC0_CB_AUX_SU_INFO(i));
      PUSH_DATA (push, address + NVC0_CB_AUX_SU_INFO(i));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 16 * 4);
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 16);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

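      /* Each image takes a 16-word info record; unbound slots are
       * zero-filled. */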
      if (view->resource) {
         struct nv04_resource *res = nv04_resource(view->resource);

         if (res->base.target == PIPE_BUFFER) {
            if (view->access & PIPE_IMAGE_ACCESS_WRITE)
               nvc0_mark_image_range_valid(view);
         }

         nve4_set_surface_info(push, view, nvc0);
         BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);

         if (nvc0->screen->base.class_3d >= GM107_3D_CLASS)
            gm107_compute_validate_surfaces(nvc0, view, i);
      } else {
         for (j = 0; j < 16; j++)
            PUSH_DATA(push, 0);
      }
   }
}

/* Thankfully, textures with samplers follow the normal rules. */
static void
nve4_compute_validate_samplers(struct nvc0_context *nvc0)
{
   bool need_flush = nve4_validate_tsc(nvc0, 5);
   if (need_flush) {
      BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1);
      PUSH_DATA (nvc0->base.pushbuf, 0);
   }

   /* Invalidate all 3D samplers because they are aliased. */
   for (int s = 0; s < 5; s++)
      nvc0->samplers_dirty[s] = ~0;
   nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS;
}

/* (Code duplicated at bottom for various non-convincing reasons.
 *  E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
 *  entries to avoid a subchannel switch.
 *  Same for texture cache flushes.
 *  Also, the bufctx differs, and more IFs in the 3D version look ugly.)
 */
static void nve4_compute_validate_textures(struct nvc0_context *);

static void
nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   uint64_t address;
   const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
   unsigned i, n;
   uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];

   if (!dirty)
      return;
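   /* Upload one contiguous range covering all dirty slots: i is the first
    * dirty slot, n the number of slots up to and including the last dirty
    * one. */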
   i = ffs(dirty) - 1;
   n = util_logbase2(dirty) + 1 - i;
   assert(n);

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, n * 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + n);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATAp(push, &nvc0->tex_handles[s][i], n);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   nvc0->textures_dirty[s] = 0;
   nvc0->samplers_dirty[s] = 0;
}

static void
nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const int s = 5;

   while (nvc0->constbuf_dirty[s]) {
      int i = ffs(nvc0->constbuf_dirty[s]) - 1;
      nvc0->constbuf_dirty[s] &= ~(1 << i);

      if (nvc0->constbuf[s][i].user) {
         struct nouveau_bo *bo = nvc0->screen->uniform_bo;
         const unsigned base = NVC0_CB_USR_INFO(s);
         const unsigned size = nvc0->constbuf[s][0].size;
         assert(i == 0); /* we really only want OpenGL uniforms here */
         assert(nvc0->constbuf[s][0].u.data);

         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
         PUSH_DATAh(push, bo->offset + base);
         PUSH_DATA (push, bo->offset + base);
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
         PUSH_DATA (push, size);
         PUSH_DATA (push, 0x1);
         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
         PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
      }
      else {
         struct nv04_resource *res =
            nv04_resource(nvc0->constbuf[s][i].u.buf);
         if (res) {
            uint64_t address
               = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

            /* constbufs above 0 are fetched via ubo info in the shader */
            if (i > 0) {
               BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
               PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
               PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
               BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
               PUSH_DATA (push, 4 * 4);
               PUSH_DATA (push, 0x1);
               BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
               PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

               PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
               PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
               PUSH_DATA (push, nvc0->constbuf[s][i].size);
               PUSH_DATA (push, 0);
            }

            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
            res->cb_bindings[s] |= 1 << i;
         }
      }
   }

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static void
nve4_compute_validate_buffers(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint64_t address;
   const int s = 5;
   int i;

   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
   PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

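   /* Each buffer slot is a 4-word record: address low/high, size and a zero
    * pad word; unbound slots are zero-filled. */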
   for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
      if (nvc0->buffers[s][i].buffer) {
         struct nv04_resource *res =
            nv04_resource(nvc0->buffers[s][i].buffer);
         PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
         PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
         PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
         PUSH_DATA (push, 0);
         BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
         util_range_add(&res->base, &res->valid_buffer_range,
                        nvc0->buffers[s][i].buffer_offset,
                        nvc0->buffers[s][i].buffer_offset +
                        nvc0->buffers[s][i].buffer_size);
      } else {
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
      }
   }
}

static struct nvc0_state_validate
validate_list_cp[] = {
   { nvc0_compprog_validate,              NVC0_NEW_CP_PROGRAM     },
   { nve4_compute_validate_textures,      NVC0_NEW_CP_TEXTURES    },
   { nve4_compute_validate_samplers,      NVC0_NEW_CP_SAMPLERS    },
   { nve4_compute_set_tex_handles,        NVC0_NEW_CP_TEXTURES |
                                          NVC0_NEW_CP_SAMPLERS    },
   { nve4_compute_validate_surfaces,      NVC0_NEW_CP_SURFACES    },
   { nvc0_compute_validate_globals,       NVC0_NEW_CP_GLOBALS     },
   { nve4_compute_validate_buffers,       NVC0_NEW_CP_BUFFERS     },
   { nve4_compute_validate_constbufs,     NVC0_NEW_CP_CONSTBUF    },
};

static bool
nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
{
   bool ret;

   ret = nvc0_state_validate(nvc0, mask, validate_list_cp,
                             ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp,
                             nvc0->bufctx_cp);

   if (unlikely(nvc0->state.flushed))
      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
   return ret;
}

static void
nve4_compute_upload_input(struct nvc0_context *nvc0,
                          const struct pipe_grid_info *info)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_program *cp = nvc0->compprog;
   uint64_t address;

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);

   if (cp->parm_size) {
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, cp->parm_size);
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + DIV_ROUND_UP(cp->parm_size, 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAb(push, info->input, cp->parm_size);
   }
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO(0));
   PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO(0));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 8 * 4);
   PUSH_DATA (push, 0x1);

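   /* The grid info block is eight words: block[0..2], grid[0..2], one zero
    * pad word and work_dim; for indirect launches the grid comes straight
    * from the indirect buffer. */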
   if (unlikely(info->indirect)) {
      struct nv04_resource *res = nv04_resource(info->indirect);
      uint32_t offset = res->offset + info->indirect_offset;

      PUSH_SPACE_EX(push, 32, 0, 1);
      PUSH_REF1(push, res->bo, NOUVEAU_BO_RD | res->domain);

      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, info->block, 3);
      nouveau_pushbuf_data(push, res->bo, offset,
                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
   } else {
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, info->block, 3);
      PUSH_DATAp(push, info->grid, 3);
   }
   PUSH_DATA (push, 0);
   PUSH_DATA (push, info->work_dim);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static inline void
gp100_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                            struct nouveau_bo *bo, uint32_t base, uint32_t size)
{
   uint64_t address = bo->offset + base;

   assert(index < 8);
   assert(!(base & 0xff));

   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
                                 DIV_ROUND_UP(size, 16));
   NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static inline void
nve4_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, struct nouveau_bo *bo,
                           uint32_t base, uint32_t size)
{
   uint64_t address = bo->offset + base;

   assert(index < 8);
   assert(!(base & 0xff));

   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_SIZE, index, size);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static void
nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
{
   // only user constant buffers 0-6 can be put in the descriptor; the rest
   // are loaded through global memory
   for (int i = 0; i <= 6; i++) {
      if (nvc0->constbuf[5][i].user || !nvc0->constbuf[5][i].u.buf)
         continue;

      struct nv04_resource *res =
         nv04_resource(nvc0->constbuf[5][i].u.buf);

      uint32_t base = res->offset + nvc0->constbuf[5][i].offset;
      uint32_t size = nvc0->constbuf[5][i].size;
      if (gp100)
         gp100_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
      else
         nve4_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
   }

   // there is no need to do FLUSH(NVE4_COMPUTE_FLUSH_CB) because
   // nve4_compute_upload_input() does it later
}

static void
nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
                               const struct pipe_grid_info *info)
{
   const struct nvc0_screen *screen = nvc0->screen;
   const struct nvc0_program *cp = nvc0->compprog;
   uint32_t shared_size = cp->cp.smem_size + info->variable_shared_mem;

   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_DATA_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_DATA_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
   NVA0C0_QMDV00_06_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SASS_VERSION, 0x30);

   NVA0C0_QMDV00_06_VAL_SET(qmd, PROGRAM_OFFSET, cp->code_base);

   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);

   NVA0C0_QMDV00_06_VAL_SET(qmd, SHARED_MEMORY_SIZE, align(shared_size, 0x100));
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, cp->hdr[1] & 0xfffff0);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);

   if (shared_size > (32 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                                    DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
   else
   if (shared_size > (16 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                                    DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB);
   else
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                                    DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB);

   NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
   NVA0C0_QMDV00_06_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor; UBOs are attached to the driver cb instead, to
   // avoid the limit of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      nve4_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                 NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   nve4_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                              NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, false, qmd);
}

static void
gp100_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
                                const struct pipe_grid_info *info)
{
   const struct nvc0_screen *screen = nvc0->screen;
   const struct nvc0_program *cp = nvc0->compprog;
   uint32_t shared_size = cp->cp.smem_size + info->variable_shared_mem;

   NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC0C0_QMDV02_01_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
   NVC0C0_QMDV02_01_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
   NVC0C0_QMDV02_01_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);

   NVC0C0_QMDV02_01_VAL_SET(qmd, PROGRAM_OFFSET, cp->code_base);

   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);

   NVC0C0_QMDV02_01_VAL_SET(qmd, SHARED_MEMORY_SIZE, align(shared_size, 0x100));
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, cp->hdr[1] & 0xfffff0);
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);

   NVC0C0_QMDV02_01_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
   NVC0C0_QMDV02_01_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor; UBOs are attached to the driver cb instead, to
   // avoid the limit of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                  NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, true, qmd);
}

static int
gv100_sm_config_smem_size(u32 size)
{
   if      (size > 64 * 1024) size = 96 * 1024;
   else if (size > 32 * 1024) size = 64 * 1024;
   else if (size > 16 * 1024) size = 32 * 1024;
   else if (size >  8 * 1024) size = 16 * 1024;
   else                       size =  8 * 1024;
   return (size / 4096) + 1;
}
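/* gv100_sm_config_smem_size() appears to encode the configuration in 4 KiB
 * units offset by one: e.g. a 48 KiB request rounds up to 64 KiB and yields
 * 64 * 1024 / 4096 + 1 = 17. */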

static void
gv100_compute_setup_launch_desc(struct nvc0_context *nvc0, u32 *qmd,
                                const struct pipe_grid_info *info)
{
   struct nvc0_program *cp = nvc0->compprog;
   struct nvc0_screen *screen = nvc0->screen;
   uint64_t entry = screen->text->offset + cp->code_base;
   uint32_t shared_size = cp->cp.smem_size + info->variable_shared_mem;

   NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
   NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY);
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE, align(shared_size, 0x100));
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, cp->hdr[1] & 0xfffff0);
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                                  gv100_sm_config_smem_size(8 * 1024));
   NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
                                  gv100_sm_config_smem_size(96 * 1024));
   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2);
   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2);
   NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
                                  gv100_sm_config_smem_size(shared_size));

   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, cp->num_gprs);
   NVC3C0_QMDV02_02_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor; UBOs are attached to the driver cb instead, to
   // avoid the limit of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                  NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, true, qmd);

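   /* Unlike the older QMD versions, which take PROGRAM_OFFSET relative to
    * the code segment, GV100 takes the absolute program address. */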
   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, entry & 0xffffffff);
   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, entry >> 32);
}

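/* Allocate 512 bytes of scratch so the 256-byte descriptor can be aligned
 * up to a 256-byte boundary, as LAUNCH_DESC_ADDRESS requires (it takes the
 * address shifted right by 8). */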
static inline void *
nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
                               struct nouveau_bo **pbo, uint64_t *pgpuaddr)
{
   uint8_t *ptr = nouveau_scratch_get(nv, 512, pgpuaddr, pbo);
   if (!ptr)
      return NULL;
   if (*pgpuaddr & 255) {
      unsigned adj = 256 - (*pgpuaddr & 255);
      ptr += adj;
      *pgpuaddr += adj;
   }
   memset(ptr, 0x00, 256);
   return ptr;
}

static void
nve4_upload_indirect_desc(struct nouveau_pushbuf *push,
                          struct nv04_resource *res, uint64_t gpuaddr,
                          uint32_t length, uint32_t bo_offset)
{
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, gpuaddr);
   PUSH_DATA (push, gpuaddr);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, length);
   PUSH_DATA (push, 1);

   PUSH_SPACE_EX(push, 32, 0, 1);
   PUSH_REF1(push, res->bo, NOUVEAU_BO_RD | res->domain);

   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (length / 4));
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
   nouveau_pushbuf_data(push, res->bo, bo_offset,
                        NVC0_IB_ENTRY_1_NO_PREFETCH | length);
}

void
nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   void *desc;
   uint64_t desc_gpuaddr;
   struct nouveau_bo *desc_bo;
   int ret;

   desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr);
   if (!desc) {
      ret = -1;
      goto out;
   }
   BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
                desc_bo);

   list_for_each_entry(struct nvc0_resident, resident, &nvc0->tex_head, list) {
      nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS, resident->buf,
                        resident->flags);
   }

   list_for_each_entry(struct nvc0_resident, resident, &nvc0->img_head, list) {
      nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS, resident->buf,
                        resident->flags);
   }

   simple_mtx_lock(&screen->state_lock);
   ret = !nve4_state_validate_cp(nvc0, ~0);
   if (ret)
      goto out_unlock;

   if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
      gv100_compute_setup_launch_desc(nvc0, desc, info);
   else
   if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
      gp100_compute_setup_launch_desc(nvc0, desc, info);
   else
      nve4_compute_setup_launch_desc(nvc0, desc, info);

   nve4_compute_upload_input(nvc0, info);

#ifndef NDEBUG
   if (debug_get_num_option("NV50_PROG_DEBUG", 0)) {
      debug_printf("Queue Meta Data:\n");
      if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
         NVC3C0QmdDump_V02_02(desc);
      else
      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
         NVC0C0QmdDump_V02_01(desc);
      else
         NVA0C0QmdDump_V00_06(desc);
   }
#endif

   if (unlikely(info->indirect)) {
      struct nv04_resource *res = nv04_resource(info->indirect);
      uint32_t offset = res->offset + info->indirect_offset;

      /* upload the descriptor */
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, desc_gpuaddr);
      PUSH_DATA (push, desc_gpuaddr);
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 256);
      PUSH_DATA (push, 1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);

      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS) {
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 12, offset);
      } else {
         /* overwrite griddim_x and griddim_y as two 32-bit integers even
          * though griddim_y must be a 16-bit integer */
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 8, offset);

         /* overwrite the 16 high bits of griddim_y with griddim_z because
          * we need (z << 16) | y in that word */
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 54, 4, offset + 8);
      }
   }

   /* upload descriptor and flush */
   PUSH_SPACE_EX(push, 32, 1, 0);
   PUSH_REF1(push, screen->text, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
   PUSH_DATA (push, desc_gpuaddr >> 8);
   if (screen->compute->oclass < GA102_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
      PUSH_DATA (push, 0x3);
   } else {
      BEGIN_NIC0(push, SUBC_CP(0x02c0), 2);
      PUSH_DATA (push, 1);
      PUSH_DATA (push, 2);
   }
   BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
   PUSH_DATA (push, 0);

   nvc0_update_compute_invocations_counter(nvc0, info);

out_unlock:
   PUSH_KICK(push);
   simple_mtx_unlock(&screen->state_lock);

out:
   if (ret)
      NOUVEAU_ERR("Failed to launch grid!\n");
   nouveau_scratch_done(&nvc0->base);
   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_DESC);
   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS);
}


#define NVE4_TIC_ENTRY_INVALID 0x000fffff

static void
nve4_compute_validate_textures(struct nvc0_context *nvc0)
{
   struct nouveau_bo *txc = nvc0->screen->txc;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const unsigned s = 5;
   unsigned i;
   uint32_t commands[2][32];
   unsigned n[2] = { 0, 0 };
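   /* commands[0] collects TIC_FLUSH words for newly uploaded TIC entries,
    * commands[1] TEX_CACHE_CTL words for textures written by the GPU. */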

   for (i = 0; i < nvc0->num_textures[s]; ++i) {
      struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
      struct nv04_resource *res;
      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));

      if (!tic) {
         nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
         continue;
      }
      res = nv04_resource(tic->pipe.texture);
      nvc0_update_tic(nvc0, tic, res);

      if (tic->id < 0) {
         tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);

         PUSH_SPACE(push, 16);
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
         PUSH_DATAh(push, txc->offset + (tic->id * 32));
         PUSH_DATA (push, txc->offset + (tic->id * 32));
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
         PUSH_DATA (push, 32);
         PUSH_DATA (push, 1);
         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
         PUSH_DATAp(push, &tic->tic[0], 8);

         commands[0][n[0]++] = (tic->id << 4) | 1;
      } else
      if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
         commands[1][n[1]++] = (tic->id << 4) | 1;
      }
      nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);

      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
      res->status |=  NOUVEAU_BUFFER_STATUS_GPU_READING;

      nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID;
      nvc0->tex_handles[s][i] |= tic->id;
      if (dirty)
         BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
   }
   for (; i < nvc0->state.num_textures[s]; ++i) {
      nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
      nvc0->textures_dirty[s] |= 1 << i;
   }

   if (n[0]) {
      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), n[0]);
      PUSH_DATAp(push, commands[0], n[0]);
   }
   if (n[1]) {
      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), n[1]);
      PUSH_DATAp(push, commands[1], n[1]);
   }

   nvc0->state.num_textures[s] = nvc0->num_textures[s];

   /* Invalidate all 3D textures because they are aliased. */
   for (int s = 0; s < 5; s++) {
      for (int i = 0; i < nvc0->num_textures[s]; i++)
         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i));
      nvc0->textures_dirty[s] = ~0;
   }
   nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
}

#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
static void
nve4_compute_trap_info(struct nvc0_context *nvc0)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_bo *bo = screen->parm;
   int ret, i;
   volatile struct nve4_mp_trap_info *info;
   uint8_t *map;

   ret = BO_MAP(&screen->base, bo, NOUVEAU_BO_RDWR, nvc0->base.client);
   if (ret)
      return;
   map = (uint8_t *)bo->map;
   info = (volatile struct nve4_mp_trap_info *)(map + NVE4_CP_PARAM_TRAP_INFO);

   if (info->lock) {
      debug_printf("trapstat = %08x\n", info->trapstat);
      debug_printf("warperr = %08x\n", info->warperr);
      debug_printf("PC = %x\n", info->pc);
      debug_printf("tid = %u %u %u\n",
                   info->tid[0], info->tid[1], info->tid[2]);
      debug_printf("ctaid = %u %u %u\n",
                   info->ctaid[0], info->ctaid[1], info->ctaid[2]);
      for (i = 0; i <= 63; ++i)
         debug_printf("$r%i = %08x\n", i, info->r[i]);
      for (i = 0; i <= 6; ++i)
         debug_printf("$p%i = %i\n", i, (info->flags >> i) & 1);
      debug_printf("$c = %x\n", info->flags >> 12);
   }
   info->lock = 0;
}
#endif