1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * @file
26 *
27 * Validates the QPU instruction sequence after register allocation and
28 * scheduling.
29 */
30
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include "v3d_compiler.h"
#include "qpu/qpu_disasm.h"
36
/* Tracking state carried across instructions while validating a program.
 * One instance is built per qpu_validate() call and threaded through every
 * instruction in program order.
 */
struct v3d_qpu_validate_state {
        struct v3d_compile *c;
        /* QPU encoding of the previously validated instruction, or NULL
         * before the first instruction has been processed.
         */
        const struct v3d_qpu_instr *last;
        /* 0-based index (in program order) of the instruction currently
         * being validated.
         */
        int ip;
        /* ip of the most recent instruction that started an SFU write.
         * Initialized to a negative sentinel so the latency checks can't
         * trigger at the start of the program.
         */
        int last_sfu_write;
        /* ip of the most recent branch instruction (negative sentinel when
         * none seen yet); used for branch delay-slot checks.
         */
        int last_branch_ip;
        /* ip of the most recent (non-signal) THRSW (negative sentinel when
         * none seen yet); used for THRSW delay-slot checks.
         */
        int last_thrsw_ip;
        /* ip of the first TLB Z write seen, or INT_MAX if none yet.  MSF
         * reads/writes and implicit-MSF branches are invalid after it.
         */
        int first_tlb_z_write;

        /* Set when we've found the last-THRSW signal, or if we were started
         * in single-segment mode.
         */
        bool last_thrsw_found;

        /* Set when we've found the THRSW after the last THRSW */
        bool thrend_found;

        int thrsw_count;

        /* MULTOP/UMUL24 rtop accumulator tracking: rtop_valid means a
         * MULTOP result is in flight; rtop_hazard means a thread switch
         * completed while it was in flight (clearing rtop), so a later
         * UMUL24 consuming it is an error.
         */
        bool rtop_hazard;
        bool rtop_valid;
};
59
60 static void
fail_instr(struct v3d_qpu_validate_state * state,const char * msg)61 fail_instr(struct v3d_qpu_validate_state *state, const char *msg)
62 {
63 struct v3d_compile *c = state->c;
64
65 fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg);
66
67 int dump_ip = 0;
68 vir_for_each_inst_inorder(inst, c) {
69 v3d_qpu_dump(c->devinfo, &inst->qpu);
70
71 if (dump_ip++ == state->ip)
72 fprintf(stderr, " *** ERROR ***");
73
74 fprintf(stderr, "\n");
75 }
76
77 fprintf(stderr, "\n");
78 abort();
79 }
80
81 static bool
in_branch_delay_slots(struct v3d_qpu_validate_state * state)82 in_branch_delay_slots(struct v3d_qpu_validate_state *state)
83 {
84 return (state->ip - state->last_branch_ip) < 3;
85 }
86
87 static bool
in_thrsw_delay_slots(struct v3d_qpu_validate_state * state)88 in_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
89 {
90 return (state->ip - state->last_thrsw_ip) < 3;
91 }
92
93 static bool
qpu_magic_waddr_matches(const struct v3d_qpu_instr * inst,bool (* predicate)(enum v3d_qpu_waddr waddr))94 qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
95 bool (*predicate)(enum v3d_qpu_waddr waddr))
96 {
97 if (inst->type == V3D_QPU_INSTR_TYPE_ALU)
98 return false;
99
100 if (inst->alu.add.op != V3D_QPU_A_NOP &&
101 inst->alu.add.magic_write &&
102 predicate(inst->alu.add.waddr))
103 return true;
104
105 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
106 inst->alu.mul.magic_write &&
107 predicate(inst->alu.mul.waddr))
108 return true;
109
110 return false;
111 }
112
/* Validates a single instruction against the running state, calling
 * fail_instr() (which aborts) on any violation, and updates the tracking
 * state for subsequent instructions.
 */
static void
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
{
        const struct v3d_device_info *devinfo = state->c->devinfo;

        /* Record the position of the first TLB Z write; MSF reads, SETMSF,
         * and implicit-MSF branches are invalid after it.
         */
        if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
                state->first_tlb_z_write = state->ip;

        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Conditional branches that don't ignore MSF perform an implicit
         * MSF read; A0/NA0 conditions are exempt.
         */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
            state->first_tlb_z_write >= 0 &&
            state->ip > state->first_tlb_z_write &&
            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
                fail_instr(state, "Implicit branch MSF read after TLB Z write");
        }

        /* All remaining checks only apply to ALU instructions. */
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return;

        /* MULTOP leaves its result in the rtop accumulator for a later
         * UMUL24 to consume.
         */
        if (inst->alu.mul.op == V3D_QPU_M_MULTOP)
                state->rtop_valid = true;

        if (inst->alu.mul.op == V3D_QPU_M_UMUL24) {
                if (state->rtop_hazard)
                        fail_instr(state, "UMUL24 reads rtop from MULTOP but it got cleared by a previous THRSW");
                state->rtop_valid = false;
                state->rtop_hazard = false;
        }

        if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
            state->first_tlb_z_write >= 0 &&
            state->ip > state->first_tlb_z_write) {
                fail_instr(state, "SETMSF after TLB Z write");
        }

        if (state->first_tlb_z_write >= 0 &&
            state->ip > state->first_tlb_z_write &&
            inst->alu.add.op == V3D_QPU_A_MSF) {
                fail_instr(state, "MSF read after TLB Z write");
        }

        /* small_imm_a/c/d only exist in the V3D 7.1+ encoding; on 7.1+ a
         * small immediate must pair with the matching ALU op and at most
         * one may be enabled.
         */
        if (devinfo->ver < 71) {
                if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
                    inst->sig.small_imm_d) {
                        fail_instr(state, "small imm a/c/d added after V3D 7.1");
                }
        } else {
                if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
                    !vir_is_add(qinst)) {
                        fail_instr(state, "small imm a/b used but no ADD inst");
                }
                if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
                    !vir_is_mul(qinst)) {
                        fail_instr(state, "small imm c/d used but no MUL inst");
                }
                if (inst->sig.small_imm_a + inst->sig.small_imm_b +
                    inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
                        fail_instr(state, "only one small immediate can be "
                                   "enabled per instruction");
                }
        }

        /* LDVARY writes r5 two instructions later and LDUNIF writes
         * r5 one instruction later, which is illegal to have
         * together.
         */
        if (state->last && state->last->sig.ldvary &&
            (inst->sig.ldunif || inst->sig.ldunifa)) {
                fail_instr(state, "LDUNIF after a LDVARY");
        }

        /* GFXH-1633 (fixed since V3D 4.2.14, which is Rpi4)
         *
         * FIXME: This would not check correctly for V3D 4.2 versions lower
         * than V3D 4.2.14, but that is not a real issue because the simulator
         * will still catch this, and we are not really targeting any such
         * versions anyway.
         */
        if (state->c->devinfo->ver < 42) {
                bool last_reads_ldunif = (state->last && (state->last->sig.ldunif ||
                                                          state->last->sig.ldunifrf));
                bool last_reads_ldunifa = (state->last && (state->last->sig.ldunifa ||
                                                           state->last->sig.ldunifarf));
                bool reads_ldunif = inst->sig.ldunif || inst->sig.ldunifrf;
                bool reads_ldunifa = inst->sig.ldunifa || inst->sig.ldunifarf;
                if ((last_reads_ldunif && reads_ldunifa) ||
                    (last_reads_ldunifa && reads_ldunif)) {
                        fail_instr(state,
                                   "LDUNIF and LDUNIFA can't be next to each other");
                }
        }

        /* Count the peripheral (magic-register) writes from each ALU op so
         * we can enforce the one-peripheral-access-per-instruction rule.
         */
        int tmu_writes = 0;
        int sfu_writes = 0;
        int vpm_writes = 0;
        int tlb_writes = 0;
        int tsy_writes = 0;

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo,
                                                       inst->alu.add.waddr)) {
                                tmu_writes++;
                        }
                        if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))
                                sfu_writes++;
                        if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr))
                                vpm_writes++;
                        if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr))
                                tlb_writes++;
                        if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr))
                                tsy_writes++;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo,
                                                       inst->alu.mul.waddr)) {
                                tmu_writes++;
                        }
                        if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))
                                sfu_writes++;
                        if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr))
                                vpm_writes++;
                        if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr))
                                tlb_writes++;
                        if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr))
                                tsy_writes++;
                }
        }

        if (in_thrsw_delay_slots(state)) {
                /* There's no way you want to start SFU during the THRSW delay
                 * slots, since the result would land in the other thread.
                 */
                if (sfu_writes) {
                        fail_instr(state,
                                   "SFU write started during THRSW delay slots ");
                }

                if (inst->sig.ldvary) {
                        if (devinfo->ver == 42)
                                fail_instr(state, "LDVARY during THRSW delay slots");
                        if (devinfo->ver >= 71 &&
                            state->ip - state->last_thrsw_ip == 2) {
                                fail_instr(state, "LDVARY in 2nd THRSW delay slot");
                        }
                }
        }

        (void)qpu_magic_waddr_matches; /* XXX */

        /* SFU r4 results come back two instructions later. No doing
         * r4 read/writes or other SFU lookups until it's done.
         */
        if (state->ip - state->last_sfu_write < 2) {
                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4))
                        fail_instr(state, "R4 read too soon after SFU");

                if (v3d_qpu_writes_r4(devinfo, inst))
                        fail_instr(state, "R4 write too soon after SFU");

                if (sfu_writes)
                        fail_instr(state, "SFU write too soon after SFU");
        }

        /* XXX: The docs say VPM can happen with the others, but the simulator
         * disagrees.
         */
        if (tmu_writes +
            sfu_writes +
            vpm_writes +
            tlb_writes +
            tsy_writes +
            (devinfo->ver == 42 ? inst->sig.ldtmu : 0) +
            inst->sig.ldtlb +
            inst->sig.ldvpm +
            inst->sig.ldtlbu > 1) {
                fail_instr(state,
                           "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed");
        }

        if (sfu_writes)
                state->last_sfu_write = state->ip;

        if (inst->sig.thrsw) {
                if (in_branch_delay_slots(state))
                        fail_instr(state, "THRSW in a branch delay slot.");

                /* A THRSW seen after the last-THRSW signal is the
                 * program-end THRSW (THREND).
                 */
                if (state->last_thrsw_found)
                        state->thrend_found = true;

                if (state->last_thrsw_ip == state->ip - 1) {
                        /* If it's the second THRSW in a row, then it's just a
                         * last-thrsw signal.
                         */
                        if (state->last_thrsw_found)
                                fail_instr(state, "Two last-THRSW signals");
                        state->last_thrsw_found = true;
                } else {
                        if (in_thrsw_delay_slots(state)) {
                                fail_instr(state,
                                           "THRSW too close to another THRSW.");
                        }
                        state->thrsw_count++;
                        state->last_thrsw_ip = state->ip;
                }
        }

        /* Register-file write restrictions for the THREND instruction and
         * its delay slots.
         *
         * NOTE(review): by this point last_thrsw_ip <= ip always holds, so
         * `last_thrsw_ip - state->ip <= 2` is effectively "THREND and every
         * instruction after it" — likely intended, but confirm against the
         * original restriction list.
         */
        if (state->thrend_found &&
            state->last_thrsw_ip - state->ip <= 2 &&
            inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                if ((inst->alu.add.op != V3D_QPU_A_NOP &&
                     !inst->alu.add.magic_write)) {
                        if (devinfo->ver == 42) {
                                fail_instr(state, "RF write after THREND");
                        } else if (devinfo->ver >= 71) {
                                if (state->last_thrsw_ip - state->ip == 0) {
                                        fail_instr(state,
                                                   "ADD RF write at THREND");
                                }
                                if (inst->alu.add.waddr == 2 ||
                                    inst->alu.add.waddr == 3) {
                                        fail_instr(state,
                                                   "RF2-3 write after THREND");
                                }
                        }
                }

                if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
                     !inst->alu.mul.magic_write)) {
                        if (devinfo->ver == 42) {
                                fail_instr(state, "RF write after THREND");
                        } else if (devinfo->ver >= 71) {
                                if (state->last_thrsw_ip - state->ip == 0) {
                                        fail_instr(state,
                                                   "MUL RF write at THREND");
                                }

                                if (inst->alu.mul.waddr == 2 ||
                                    inst->alu.mul.waddr == 3) {
                                        fail_instr(state,
                                                   "RF2-3 write after THREND");
                                }
                        }
                }

                if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
                    !inst->sig_magic) {
                        if (devinfo->ver == 42) {
                                fail_instr(state, "RF write after THREND");
                        } else if (devinfo->ver >= 71 &&
                                   (inst->sig_addr == 2 ||
                                    inst->sig_addr == 3)) {
                                fail_instr(state, "RF2-3 write after THREND");
                        }
                }

                /* GFXH-1625: No TMUWT in the last instruction
                 *
                 * NOTE(review): since last_thrsw_ip <= ip here, this
                 * difference can never equal 2, so this check looks
                 * unreachable — should it be
                 * `state->ip - state->last_thrsw_ip == 2`? Verify against
                 * GFXH-1625 before changing.
                 */
                if (state->last_thrsw_ip - state->ip == 2 &&
                    inst->alu.add.op == V3D_QPU_A_TMUWT)
                        fail_instr(state, "TMUWT in last instruction");
        }

        /* The thread switch takes effect two instructions after the THRSW
         * and clears rtop, so a MULTOP result still in flight at that point
         * becomes a hazard for any later UMUL24.
         */
        if (state->rtop_valid && state->ip == state->last_thrsw_ip + 2) {
                state->rtop_hazard = true;
                state->rtop_valid = false;
        }

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (in_branch_delay_slots(state))
                        fail_instr(state, "branch in a branch delay slot.");
                if (in_thrsw_delay_slots(state))
                        fail_instr(state, "branch in a THRSW delay slot.");
                state->last_branch_ip = state->ip;
        }
}
395
396 static void
qpu_validate_block(struct v3d_qpu_validate_state * state,struct qblock * block)397 qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block)
398 {
399 vir_for_each_inst(qinst, block) {
400 qpu_validate_inst(state, qinst);
401
402 state->last = &qinst->qpu;
403 state->ip++;
404 }
405 }
406
407 /**
408 * Checks for the instruction restrictions from page 37 ("Summary of
409 * Instruction Restrictions").
410 */
411 void
qpu_validate(struct v3d_compile * c)412 qpu_validate(struct v3d_compile *c)
413 {
414 /* We don't want to do validation in release builds, but we want to
415 * keep compiling the validation code to make sure it doesn't get
416 * broken.
417 */
418 #if !MESA_DEBUG
419 return;
420 #endif
421
422 struct v3d_qpu_validate_state state = {
423 .c = c,
424 .last_sfu_write = -10,
425 .last_thrsw_ip = -10,
426 .last_branch_ip = -10,
427 .first_tlb_z_write = INT_MAX,
428 .ip = 0,
429
430 .last_thrsw_found = !c->last_thrsw,
431 .rtop_hazard = false,
432 .rtop_valid = false,
433 };
434
435 vir_for_each_block(block, c) {
436 qpu_validate_block(&state, block);
437 }
438
439 if (state.thrsw_count > 1 && !state.last_thrsw_found) {
440 fail_instr(&state,
441 "thread switch found without last-THRSW in program");
442 }
443
444 if (!state.thrend_found)
445 fail_instr(&state, "No program-end THRSW found");
446 }
447