1 /* -*- mesa-c++ -*-
2 * Copyright 2020 Collabora LTD
3 * Author: Gert Wollny <[email protected]>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "nir.h"
8 #include "nir_builder.h"
9 #include "sfn_nir.h"
10
11 #include <iostream>
12 #include <map>
13 #include <vector>
14
15 namespace r600 {
16
17 using std::make_pair;
18 using std::map;
19 using std::pair;
20 using std::vector;
21
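/* LowerSplit64BitVar splits 64-bit operations that use three or four
 * components into pieces that use at most two 64-bit components each:
 * vec3/vec4 double loads and stores (deref, uniform, UBO, SSBO, input,
 * output) are split into a vec2 part and a remainder, wide reductions
 * (fdot3/4, ball/bany compares) are evaluated per half, and bcsel and
 * load_const are scalarized and re-assembled with nir_vec(). Variables
 * and stores that were replaced are removed in the destructor. */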
22 class LowerSplit64BitVar : public NirLowerInstruction {
23 public:
24 ~LowerSplit64BitVar();
25 using VarSplit = pair<nir_variable *, nir_variable *>;
26 using VarMap = map<unsigned, VarSplit>;
27
28 nir_def *split_double_load_deref(nir_intrinsic_instr *intr);
29
30 nir_def *split_double_store_deref(nir_intrinsic_instr *intr);
31
32 private:
33 nir_def *split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);
34
35 nir_def *split_load_deref_var(nir_intrinsic_instr *intr);
36
37 nir_def *split_store_deref_array(nir_intrinsic_instr *intr,
38 nir_deref_instr *deref);
39
40 nir_def *split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);
41
42 VarSplit get_var_pair(nir_variable *old_var);
43
44 nir_def *
45 merge_64bit_loads(nir_def *load1, nir_def *load2, bool out_is_vec3);
46
47 nir_def *split_double_load(nir_intrinsic_instr *load1);
48
49 nir_def *split_store_output(nir_intrinsic_instr *store1);
50
51 nir_def *split_double_load_uniform(nir_intrinsic_instr *intr);
52
53 nir_def *split_double_load_ssbo(nir_intrinsic_instr *intr);
54
55 nir_def *split_double_load_ubo(nir_intrinsic_instr *intr);
56
57 nir_def *
58 split_reduction(nir_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);
59
60 nir_def *
61 split_reduction3(nir_alu_instr *alu, nir_op op1, nir_op op2, nir_op reduction);
62
63 nir_def *
64 split_reduction4(nir_alu_instr *alu, nir_op op1, nir_op op2, nir_op reduction);
65
66 nir_def *split_bcsel(nir_alu_instr *alu);
67
68 nir_def *split_load_const(nir_load_const_instr *lc);
69
70 bool filter(const nir_instr *instr) const override;
71 nir_def *lower(nir_instr *instr) override;
72
73 VarMap m_varmap;
74 vector<nir_variable *> m_old_vars;
75 vector<nir_instr *> m_old_stores;
76 };
77
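/* LowerLoad64Uniform rewrites 64-bit uniform/UBO loads (with at most two
 * 64-bit components) as 32-bit loads with twice the component count and
 * re-packs each pair of 32-bit channels with nir_pack_64_2x32_split. */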
78 class LowerLoad64Uniform : public NirLowerInstruction {
79 bool filter(const nir_instr *instr) const override;
80 nir_def *lower(nir_instr *instr) override;
81 };
82
83 bool
84 LowerLoad64Uniform::filter(const nir_instr *instr) const
85 {
86 if (instr->type != nir_instr_type_intrinsic)
87 return false;
88
89 auto intr = nir_instr_as_intrinsic(instr);
90 if (intr->intrinsic != nir_intrinsic_load_uniform &&
91 intr->intrinsic != nir_intrinsic_load_ubo &&
92 intr->intrinsic != nir_intrinsic_load_ubo_vec4)
93 return false;
94
95 return intr->def.bit_size == 64;
96 }
97
98 nir_def *
99 LowerLoad64Uniform::lower(nir_instr *instr)
100 {
101 auto intr = nir_instr_as_intrinsic(instr);
102 int old_components = intr->def.num_components;
103 assert(old_components <= 2);
104 intr->def.num_components *= 2;
105 intr->def.bit_size = 32;
106 intr->num_components *= 2;
107
108 if (intr->intrinsic == nir_intrinsic_load_ubo ||
109 intr->intrinsic == nir_intrinsic_load_ubo_vec4)
110 nir_intrinsic_set_component(intr, 2 * nir_intrinsic_component(intr));
111
112 nir_def *result_vec[2] = {nullptr, nullptr};
113
114 for (int i = 0; i < old_components; ++i) {
115 result_vec[i] = nir_pack_64_2x32_split(b,
116 nir_channel(b, &intr->def, 2 * i),
117 nir_channel(b, &intr->def, 2 * i + 1));
118 }
119 if (old_components == 1)
120 return result_vec[0];
121
122 return nir_vec2(b, result_vec[0], result_vec[1]);
123 }
124
125 bool
126 r600_split_64bit_uniforms_and_ubo(nir_shader *sh)
127 {
128 return LowerLoad64Uniform().run(sh);
129 }
130
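/* LowerSplit64op lowers a few 64-bit operations that have no native
 * support: bcsel on 64-bit values is done per 32-bit half, conversions
 * from double to 32-bit integers split the value into a high and a low
 * part, u2f64/i2f64 are assembled from the two 32-bit halves of the
 * source, and 64-bit phis are replaced by a pair of 32-bit phis whose
 * results are packed again. */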
131 class LowerSplit64op : public NirLowerInstruction {
132 bool filter(const nir_instr *instr) const override
133 {
134 switch (instr->type) {
135 case nir_instr_type_alu: {
136 auto alu = nir_instr_as_alu(instr);
137 switch (alu->op) {
138 case nir_op_bcsel:
139 return alu->def.bit_size == 64;
140 case nir_op_f2i32:
141 case nir_op_f2u32:
142 case nir_op_f2i64:
143 case nir_op_f2u64:
144 case nir_op_u2f64:
145 case nir_op_i2f64:
146 return nir_src_bit_size(alu->src[0].src) == 64;
147 default:
148 return false;
149 }
150 }
151 case nir_instr_type_phi: {
152 auto phi = nir_instr_as_phi(instr);
153 return phi->def.bit_size == 64;
154 }
155 default:
156 return false;
157 }
158 }
159
160 nir_def *lower(nir_instr *instr) override
161 {
162
163 switch (instr->type) {
164 case nir_instr_type_alu: {
165 auto alu = nir_instr_as_alu(instr);
166 switch (alu->op) {
167
168 case nir_op_bcsel: {
169 auto lo =
170 nir_bcsel(b,
171 alu->src[0].src.ssa,
172 nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 1)),
173 nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 2)));
174 auto hi =
175 nir_bcsel(b,
176 alu->src[0].src.ssa,
177 nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 1)),
178 nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 2)));
179 return nir_pack_64_2x32_split(b, lo, hi);
180 }
181 case nir_op_f2i32: {
182 auto src = nir_ssa_for_alu_src(b, alu, 0);
183 auto gt0 = nir_fgt_imm(b, src, 0.0);
184 auto abs_src = nir_fabs(b, src);
185 auto value = nir_f2u32(b, abs_src);
186 return nir_bcsel(b, gt0, value, nir_ineg(b, value));
187 }
188 case nir_op_f2u32: {
189 /* fp32 doesn't hold sufficient bits to represent the full range of
190 * u32, therefore we have to split the value, and because f2f32
191 * rounds, we have to remove the fractional part in the hi bits.
192 * For values > UINT_MAX the result is undefined. */
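/* Illustrative example: for src = 305419896.0 (0x12345678),
 * highval = src / 65536 = 4660.3377..., high = 4660 = 0x1234,
 * fract = 0.3377..., low = u32(fract * 65536) = 22136 = 0x5678,
 * so the result is (0x1234 << 16) | 0x5678 = 0x12345678. */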
193 auto src = nir_ssa_for_alu_src(b, alu, 0);
194 src = nir_fadd(b, src, nir_fneg(b, nir_ffract(b, src)));
195 auto gt0 = nir_fgt_imm(b, src, 0.0);
196 auto highval = nir_fmul_imm(b, src, 1.0 / 65536.0);
197 auto fract = nir_ffract(b, highval);
198 auto high = nir_f2u32(b, nir_f2f32(b, nir_fadd(b, highval, nir_fneg(b, fract))));
199 auto lowval = nir_fmul_imm(b, fract, 65536.0);
200 auto low = nir_f2u32(b, nir_f2f32(b, lowval));
201 return nir_bcsel(b,
202 gt0,
203 nir_ior(b, nir_ishl_imm(b, high, 16), low),
204 nir_imm_int(b, 0));
205 }
206 case nir_op_u2f64: {
207 auto src = nir_ssa_for_alu_src(b, alu, 0);
208 auto low = nir_unpack_64_2x32_split_x(b, src);
209 auto high = nir_unpack_64_2x32_split_y(b, src);
210 auto flow = nir_u2f64(b, low);
211 auto fhigh = nir_u2f64(b, high);
212 return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
213 }
214 case nir_op_i2f64: {
215 auto src = nir_ssa_for_alu_src(b, alu, 0);
216 auto low = nir_unpack_64_2x32_split_x(b, src);
217 auto high = nir_unpack_64_2x32_split_y(b, src);
218 auto flow = nir_u2f64(b, low);
219 auto fhigh = nir_i2f64(b, high);
220 return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
221 }
222 default:
223 unreachable("trying to lower instruction that was not in filter");
224 }
225 }
226 case nir_instr_type_phi: {
227 auto phi = nir_instr_as_phi(instr);
228 auto phi_lo = nir_phi_instr_create(b->shader);
229 auto phi_hi = nir_phi_instr_create(b->shader);
230 nir_def_init(
231 &phi_lo->instr, &phi_lo->def, phi->def.num_components * 2, 32);
232 nir_def_init(
233 &phi_hi->instr, &phi_hi->def, phi->def.num_components * 2, 32);
234 nir_foreach_phi_src(s, phi)
235 {
236 auto lo = nir_unpack_64_2x32_split_x(b, s->src.ssa);
237 auto hi = nir_unpack_64_2x32_split_y(b, s->src.ssa);
238 nir_phi_instr_add_src(phi_lo, s->pred, lo);
239 nir_phi_instr_add_src(phi_hi, s->pred, hi);
240 }
241 return nir_pack_64_2x32_split(b, &phi_lo->def, &phi_hi->def);
242 }
243 default:
244 unreachable("Trying to lower instruction that was not in filter");
245 }
246 }
247 };
248
249 bool
250 r600_split_64bit_alu_and_phi(nir_shader *sh)
251 {
252 return LowerSplit64op().run(sh);
253 }
254
255 bool
256 LowerSplit64BitVar::filter(const nir_instr *instr) const
257 {
258 switch (instr->type) {
259 case nir_instr_type_intrinsic: {
260 auto intr = nir_instr_as_intrinsic(instr);
261
262 switch (intr->intrinsic) {
263 case nir_intrinsic_load_deref:
264 case nir_intrinsic_load_uniform:
265 case nir_intrinsic_load_input:
266 case nir_intrinsic_load_ubo:
267 case nir_intrinsic_load_ssbo:
268 if (intr->def.bit_size != 64)
269 return false;
270 return intr->def.num_components >= 3;
271 case nir_intrinsic_store_output:
272 if (nir_src_bit_size(intr->src[0]) != 64)
273 return false;
274 return nir_src_num_components(intr->src[0]) >= 3;
275 case nir_intrinsic_store_deref:
276 if (nir_src_bit_size(intr->src[1]) != 64)
277 return false;
278 return nir_src_num_components(intr->src[1]) >= 3;
279 default:
280 return false;
281 }
282 }
283 case nir_instr_type_alu: {
284 auto alu = nir_instr_as_alu(instr);
285 switch (alu->op) {
286 case nir_op_bcsel:
287 if (alu->def.num_components < 3)
288 return false;
289 return alu->def.bit_size == 64;
290 case nir_op_bany_fnequal3:
291 case nir_op_bany_fnequal4:
292 case nir_op_ball_fequal3:
293 case nir_op_ball_fequal4:
294 case nir_op_bany_inequal3:
295 case nir_op_bany_inequal4:
296 case nir_op_ball_iequal3:
297 case nir_op_ball_iequal4:
298 case nir_op_fdot3:
299 case nir_op_fdot4:
300 return nir_src_bit_size(alu->src[1].src) == 64;
301 default:
302 return false;
303 }
304 }
305 case nir_instr_type_load_const: {
306 auto lc = nir_instr_as_load_const(instr);
307 if (lc->def.bit_size != 64)
308 return false;
309 return lc->def.num_components >= 3;
310 }
311 default:
312 return false;
313 }
314 }
315
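/* Combine the two (x, y) channels of the first load with the first one
 * or two channels of the second load into a single vec3 or vec4. */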
316 nir_def *
317 LowerSplit64BitVar::merge_64bit_loads(nir_def *load1,
318 nir_def *load2,
319 bool out_is_vec3)
320 {
321 if (out_is_vec3)
322 return nir_vec3(b,
323 nir_channel(b, load1, 0),
324 nir_channel(b, load1, 1),
325 nir_channel(b, load2, 0));
326 else
327 return nir_vec4(b,
328 nir_channel(b, load1, 0),
329 nir_channel(b, load1, 1),
330 nir_channel(b, load2, 0),
331 nir_channel(b, load2, 1));
332 }
333
334 LowerSplit64BitVar::~LowerSplit64BitVar()
335 {
336 for (auto&& v : m_old_vars)
337 exec_node_remove(&v->node);
338
339 for (auto&& v : m_old_stores)
340 nir_instr_remove(v);
341 }
342
343 nir_def *
344 LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
345 {
346 auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
347 if (deref->deref_type == nir_deref_type_var)
348 return split_store_deref_var(intr, deref);
349 else if (deref->deref_type == nir_deref_type_array)
350 return split_store_deref_array(intr, deref);
351 else {
352 unreachable("only splitting of stores to vars and arrays is supported");
353 }
354 }
355
356 nir_def *
357 LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
358 {
359 auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
360 if (deref->deref_type == nir_deref_type_var)
361 return split_load_deref_var(intr);
362 else if (deref->deref_type == nir_deref_type_array)
363 return split_load_deref_array(intr, deref->arr.index);
364 else {
365 unreachable("only splitting of loads from vars and arrays is supported");
366 }
367 m_old_stores.push_back(&intr->instr);
368 }
369
370 nir_def *
371 LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
372 {
373 auto old_var = nir_intrinsic_get_var(intr, 0);
374 unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));
375
376 assert(old_components > 2 && old_components <= 4);
377
378 auto vars = get_var_pair(old_var);
379
380 auto deref1 = nir_build_deref_var(b, vars.first);
381 auto deref_array1 = nir_build_deref_array(b, deref1, index.ssa);
382 auto load1 =
383 nir_build_load_deref(b, 2, 64, &deref_array1->def, (enum gl_access_qualifier)0);
384
385 auto deref2 = nir_build_deref_var(b, vars.second);
386 auto deref_array2 = nir_build_deref_array(b, deref2, index.ssa);
387
388 auto load2 = nir_build_load_deref(
389 b, old_components - 2, 64, &deref_array2->def, (enum gl_access_qualifier)0);
390
391 return merge_64bit_loads(load1, load2, old_components == 3);
392 }
393
394 nir_def *
395 LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr,
396 nir_deref_instr *deref)
397 {
398 auto old_var = nir_intrinsic_get_var(intr, 0);
399 unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));
400
401 assert(old_components > 2 && old_components <= 4);
402
403 auto src_xy = nir_trim_vector(b, intr->src[1].ssa, 2);
404
405 auto vars = get_var_pair(old_var);
406
407 auto deref1 = nir_build_deref_var(b, vars.first);
408 auto deref_array1 =
409 nir_build_deref_array(b, deref1, deref->arr.index.ssa);
410
411 nir_build_store_deref(b, &deref_array1->def, src_xy, 3);
412
413 auto deref2 = nir_build_deref_var(b, vars.second);
414 auto deref_array2 =
415 nir_build_deref_array(b, deref2, deref->arr.index.ssa);
416
417 if (old_components == 3)
418 nir_build_store_deref(b,
419 &deref_array2->def,
420 nir_channel(b, intr->src[1].ssa, 2),
421 1);
422 else
423 nir_build_store_deref(b,
424 &deref_array2->def,
425 nir_channels(b, intr->src[1].ssa, 0xc),
426 3);
427
428 return NIR_LOWER_INSTR_PROGRESS_REPLACE;
429 }
430
431 nir_def *
432 LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr,
433 UNUSED nir_deref_instr *deref)
434 {
435 auto old_var = nir_intrinsic_get_var(intr, 0);
436 unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));
437
438 assert(old_components > 2 && old_components <= 4);
439
440 auto src_xy = nir_trim_vector(b, intr->src[1].ssa, 2);
441
442 auto vars = get_var_pair(old_var);
443
444 auto deref1 = nir_build_deref_var(b, vars.first);
445 nir_build_store_deref(b, &deref1->def, src_xy, 3);
446
447 auto deref2 = nir_build_deref_var(b, vars.second);
448 if (old_components == 3)
449 nir_build_store_deref(b, &deref2->def, nir_channel(b, intr->src[1].ssa, 2), 1);
450 else
451 nir_build_store_deref(b,
452 &deref2->def,
453 nir_channels(b, intr->src[1].ssa, 0xc),
454 3);
455
456 return NIR_LOWER_INSTR_PROGRESS_REPLACE;
457 }
458
459 nir_def *
460 LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
461 {
462 auto old_var = nir_intrinsic_get_var(intr, 0);
463 auto vars = get_var_pair(old_var);
464 unsigned old_components = glsl_get_components(old_var->type);
465
466 nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
467 auto *load1 = nir_load_deref(b, deref1);
468
469 nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
470 deref2->type = vars.second->type;
471
472 auto *load2 = nir_load_deref(b, deref2);
473
474 return merge_64bit_loads(load1, load2, old_components == 3);
475 }
476
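/* Look up (or create and cache) the pair of variables that replaces a
 * three- or four-component 64-bit variable: a dvec2 for the xy channels
 * and a one- or two-component double vector for the rest. For arrays the
 * element type is split, and for shader inputs/outputs the second
 * variable is placed in the following location/driver_location slot. */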
477 LowerSplit64BitVar::VarSplit
478 LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
479 {
480 auto split_vars = m_varmap.find(old_var->data.driver_location);
481
482 assert(glsl_get_components(glsl_without_array(old_var->type)) > 2);
483
484 if (split_vars == m_varmap.end()) {
485 auto var1 = nir_variable_clone(old_var, b->shader);
486 auto var2 = nir_variable_clone(old_var, b->shader);
487
488 var1->type = glsl_dvec_type(2);
489 var2->type = glsl_dvec_type(glsl_get_components(glsl_without_array(old_var->type)) - 2);
490
491 if (glsl_type_is_array(old_var->type)) {
492 var1->type = glsl_array_type(var1->type, glsl_array_size(old_var->type), 0);
493 var2->type = glsl_array_type(var2->type, glsl_array_size(old_var->type), 0);
494 }
495
496 if (old_var->data.mode == nir_var_shader_in ||
497 old_var->data.mode == nir_var_shader_out) {
498 ++var2->data.driver_location;
499 ++var2->data.location;
500 nir_shader_add_variable(b->shader, var1);
501 nir_shader_add_variable(b->shader, var2);
502 } else if (old_var->data.mode == nir_var_function_temp) {
503 exec_list_push_tail(&b->impl->locals, &var1->node);
504 exec_list_push_tail(&b->impl->locals, &var2->node);
505 }
506
507 m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
508 }
509 return m_varmap[old_var->data.driver_location];
510 }
511
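/* Split a wide 64-bit load_input: the original load is narrowed to the
 * first two components of its slot and a clone loads the remaining
 * components from the next slot (base + 1, location + 1); the results
 * are merged back into one vector. */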
512 nir_def *
513 LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
514 {
515 unsigned old_components = load1->def.num_components;
516 auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
517 nir_io_semantics sem = nir_intrinsic_io_semantics(load1);
518
519 load1->def.num_components = 2;
520 sem.num_slots = 1;
521 nir_intrinsic_set_io_semantics(load1, sem);
522
523 load2->def.num_components = old_components - 2;
524 sem.location += 1;
525 nir_intrinsic_set_io_semantics(load2, sem);
526 nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
527 nir_builder_instr_insert(b, &load2->instr);
528
529 return merge_64bit_loads(&load1->def, &load2->def, old_components == 3);
530 }
531
532 nir_def *
533 LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
534 {
535 auto src = store1->src[0];
536 unsigned old_components = nir_src_num_components(src);
537 nir_io_semantics sem = nir_intrinsic_io_semantics(store1);
538
539 auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
540 auto src1 = nir_trim_vector(b, src.ssa, 2);
541 auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);
542
543 nir_src_rewrite(&store1->src[0], src1);
544 nir_intrinsic_set_write_mask(store1, 3);
545
546 nir_src_rewrite(&store2->src[0], src2);
547 nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);
548
549 sem.num_slots = 1;
550 nir_intrinsic_set_io_semantics(store1, sem);
551
552 sem.location += 1;
553 nir_intrinsic_set_io_semantics(store2, sem);
554 nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));
555
556 nir_builder_instr_insert(b, &store2->instr);
557 return NIR_LOWER_INSTR_PROGRESS;
558 }
559
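/* Split a vec3/vec4 64-bit load_uniform: the original load keeps the
 * first two components and a second load_uniform, with its offset source
 * incremented by one, fetches the remaining one or two components. */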
560 nir_def *
561 LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
562 {
563 unsigned second_components = intr->def.num_components - 2;
564 nir_intrinsic_instr *load2 =
565 nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
566 load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
567 nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
568 nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
569 nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
570 load2->num_components = second_components;
571
572 nir_def_init(&load2->instr, &load2->def, second_components, 64);
573 nir_builder_instr_insert(b, &load2->instr);
574
575 intr->def.num_components = intr->num_components = 2;
576
577 if (second_components == 1)
578 return nir_vec3(b,
579 nir_channel(b, &intr->def, 0),
580 nir_channel(b, &intr->def, 1),
581 nir_channel(b, &load2->def, 0));
582 else
583 return nir_vec4(b,
584 nir_channel(b, &intr->def, 0),
585 nir_channel(b, &intr->def, 1),
586 nir_channel(b, &load2->def, 0),
587 nir_channel(b, &load2->def, 1));
588 }
589
590 nir_def *
591 LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
592 {
593 unsigned second_components = intr->def.num_components - 2;
594 nir_intrinsic_instr *load2 =
595 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
596
597 nir_src_rewrite(&load2->src[0], nir_iadd_imm(b, intr->src[0].ssa, 1));
598 load2->num_components = second_components;
599 nir_def_init(&load2->instr, &load2->def, second_components, 64);
600
601 nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
602 nir_builder_instr_insert(b, &load2->instr);
603
604 intr->def.num_components = intr->num_components = 2;
605
606 return merge_64bit_loads(&intr->def, &load2->def, second_components == 1);
607 }
608
609 nir_def *
610 LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
611 {
612 unsigned second_components = intr->def.num_components - 2;
613 nir_intrinsic_instr *load2 =
614 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
615 load2->src[0] = intr->src[0];
616 load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
617 nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
618 nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
619 nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
620 nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
621 nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr));
622
623 load2->num_components = second_components;
624
625 nir_def_init(&load2->instr, &load2->def, second_components, 64);
626 nir_builder_instr_insert(b, &load2->instr);
627
628 intr->def.num_components = intr->num_components = 2;
629
630 return merge_64bit_loads(&intr->def, &load2->def, second_components == 1);
631 }
632
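/* Evaluate a reduction (dot product, ball/bany compare) on the first and
 * second halves of the sources separately and combine the two partial
 * results with the given reduction operator. */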
633 nir_def *
634 LowerSplit64BitVar::split_reduction(nir_def *src[2][2],
635 nir_op op1,
636 nir_op op2,
637 nir_op reduction)
638 {
639 auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
640 auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
641 return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
642 }
643
644 nir_def *
645 LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
646 nir_op op1,
647 nir_op op2,
648 nir_op reduction)
649 {
650 nir_def *src[2][2];
651
652 src[0][0] = nir_trim_vector(b, alu->src[0].src.ssa, 2);
653 src[0][1] = nir_trim_vector(b, alu->src[1].src.ssa, 2);
654
655 src[1][0] = nir_channel(b, alu->src[0].src.ssa, 2);
656 src[1][1] = nir_channel(b, alu->src[1].src.ssa, 2);
657
658 return split_reduction(src, op1, op2, reduction);
659 }
660
661 nir_def *
662 LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
663 nir_op op1,
664 nir_op op2,
665 nir_op reduction)
666 {
667 nir_def *src[2][2];
668
669 src[0][0] = nir_trim_vector(b, alu->src[0].src.ssa, 2);
670 src[0][1] = nir_trim_vector(b, alu->src[1].src.ssa, 2);
671
672 src[1][0] = nir_channels(b, alu->src[0].src.ssa, 0xc);
673 src[1][1] = nir_channels(b, alu->src[1].src.ssa, 0xc);
674
675 return split_reduction(src, op1, op2, reduction);
676 }
677
678 nir_def *
679 LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
680 {
681 static nir_def *dest[4];
682 for (unsigned i = 0; i < alu->def.num_components; ++i) {
683 dest[i] = nir_bcsel(b,
684 nir_channel(b, alu->src[0].src.ssa, i),
685 nir_channel(b, alu->src[1].src.ssa, i),
686 nir_channel(b, alu->src[2].src.ssa, i));
687 }
688 return nir_vec(b, dest, alu->def.num_components);
689 }
690
691 nir_def *
692 LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
693 {
694 nir_def *ir[4];
695 for (unsigned i = 0; i < lc->def.num_components; ++i)
696 ir[i] = nir_imm_double(b, lc->value[i].f64);
697
698 return nir_vec(b, ir, lc->def.num_components);
699 }
700
701 nir_def *
702 LowerSplit64BitVar::lower(nir_instr *instr)
703 {
704 switch (instr->type) {
705 case nir_instr_type_intrinsic: {
706 auto intr = nir_instr_as_intrinsic(instr);
707 switch (intr->intrinsic) {
708 case nir_intrinsic_load_deref:
709 return this->split_double_load_deref(intr);
710 case nir_intrinsic_load_uniform:
711 return split_double_load_uniform(intr);
712 case nir_intrinsic_load_ubo:
713 return split_double_load_ubo(intr);
714 case nir_intrinsic_load_ssbo:
715 return split_double_load_ssbo(intr);
716 case nir_intrinsic_load_input:
717 return split_double_load(intr);
718 case nir_intrinsic_store_output:
719 return split_store_output(intr);
720 case nir_intrinsic_store_deref:
721 return split_double_store_deref(intr);
722 default:
723 assert(0);
724 }
725 }
726 case nir_instr_type_alu: {
727 auto alu = nir_instr_as_alu(instr);
728 switch (alu->op) {
729 case nir_op_bany_fnequal3:
730 return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
731 case nir_op_ball_fequal3:
732 return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
733 case nir_op_bany_inequal3:
734 return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
735 case nir_op_ball_iequal3:
736 return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
737 case nir_op_fdot3:
738 return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
739 case nir_op_bany_fnequal4:
740 return split_reduction4(alu,
741 nir_op_bany_fnequal2,
742 nir_op_bany_fnequal2,
743 nir_op_ior);
744 case nir_op_ball_fequal4:
745 return split_reduction4(alu,
746 nir_op_ball_fequal2,
747 nir_op_ball_fequal2,
748 nir_op_iand);
749 case nir_op_bany_inequal4:
750 return split_reduction4(alu,
751 nir_op_bany_inequal2,
752 nir_op_bany_inequal2,
753 nir_op_ior);
754 case nir_op_ball_iequal4:
755 return split_reduction4(alu,
756 nir_op_ball_iequal2,
757 nir_op_ball_iequal2,
758 nir_op_iand);
759 case nir_op_fdot4:
760 return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
761 case nir_op_bcsel:
762 return split_bcsel(alu);
763 default:
764 assert(0);
765 }
766 }
767 case nir_instr_type_load_const: {
768 auto lc = nir_instr_as_load_const(instr);
769 return split_load_const(lc);
770 }
771 default:
772 assert(0);
773 }
774 return nullptr;
775 }
776
777 /* Split 64-bit instructions so that at most two 64-bit components are
778 * used in one instruction. */
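/* For example, a 64-bit vec4 uniform load is roughly rewritten as
 *
 *    vec2 64 ssa_a = load_uniform (offset)
 *    vec2 64 ssa_b = load_uniform (offset + 1)
 *    vec4 64 ssa_c = vec4 ssa_a.x, ssa_a.y, ssa_b.x, ssa_b.y
 *
 * (illustrative sketch, not the exact NIR print format). */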
779
780 bool
781 r600_nir_split_64bit_io(nir_shader *sh)
782 {
783 return LowerSplit64BitVar().run(sh);
784 }
785
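/* Lower64BitToVec2 reinterprets the remaining 64-bit values as vectors
 * of 32-bit values with twice the component count: loads and stores get
 * their component counts and write masks doubled, 64-bit constants and
 * undefs are rewritten as pairs of 32-bit words, and the 64-bit pack ops
 * become plain moves or vec2. */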
787 class Lower64BitToVec2 : public NirLowerInstruction {
788
789 private:
790 bool filter(const nir_instr *instr) const override;
791 nir_def *lower(nir_instr *instr) override;
792
793 nir_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
794 nir_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
795 nir_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
796 nir_def *load_64_to_vec2(nir_intrinsic_instr *intr);
797 nir_def *store_64_to_vec2(nir_intrinsic_instr *intr);
798 };
799
800 bool
801 Lower64BitToVec2::filter(const nir_instr *instr) const
802 {
803 switch (instr->type) {
804 case nir_instr_type_intrinsic: {
805 auto intr = nir_instr_as_intrinsic(instr);
806
807 switch (intr->intrinsic) {
808 case nir_intrinsic_load_deref:
809 case nir_intrinsic_load_input:
810 case nir_intrinsic_load_uniform:
811 case nir_intrinsic_load_ubo:
812 case nir_intrinsic_load_global:
813 case nir_intrinsic_load_global_constant:
814 case nir_intrinsic_load_ubo_vec4:
815 case nir_intrinsic_load_ssbo:
816 return intr->def.bit_size == 64;
817 case nir_intrinsic_store_deref: {
818 if (nir_src_bit_size(intr->src[1]) == 64)
819 return true;
820 auto var = nir_intrinsic_get_var(intr, 0);
821 if (glsl_get_bit_size(glsl_without_array(var->type)) == 64)
822 return true;
823 return (glsl_get_components(glsl_without_array(var->type)) != intr->num_components);
824 }
825 case nir_intrinsic_store_global:
826 return nir_src_bit_size(intr->src[0]) == 64;
827 default:
828 return false;
829 }
830 }
831 case nir_instr_type_alu: {
832 auto alu = nir_instr_as_alu(instr);
833 return alu->def.bit_size == 64;
834 }
835 case nir_instr_type_phi: {
836 auto phi = nir_instr_as_phi(instr);
837 return phi->def.bit_size == 64;
838 }
839 case nir_instr_type_load_const: {
840 auto lc = nir_instr_as_load_const(instr);
841 return lc->def.bit_size == 64;
842 }
843 case nir_instr_type_undef: {
844 auto undef = nir_instr_as_undef(instr);
845 return undef->def.bit_size == 64;
846 }
847 default:
848 return false;
849 }
850 }
851
852 nir_def *
853 Lower64BitToVec2::lower(nir_instr *instr)
854 {
855 switch (instr->type) {
856 case nir_instr_type_intrinsic: {
857 auto intr = nir_instr_as_intrinsic(instr);
858 switch (intr->intrinsic) {
859 case nir_intrinsic_load_deref:
860 return load_deref_64_to_vec2(intr);
861 case nir_intrinsic_load_uniform:
862 return load_uniform_64_to_vec2(intr);
863 case nir_intrinsic_load_ssbo:
864 return load_ssbo_64_to_vec2(intr);
865 case nir_intrinsic_load_input:
866 case nir_intrinsic_load_global:
867 case nir_intrinsic_load_global_constant:
868 case nir_intrinsic_load_ubo:
869 case nir_intrinsic_load_ubo_vec4:
870 return load_64_to_vec2(intr);
871 case nir_intrinsic_store_deref:
872 return store_64_to_vec2(intr);
873 default:
874
875 return nullptr;
876 }
877 }
878 case nir_instr_type_alu: {
879 auto alu = nir_instr_as_alu(instr);
880 alu->def.bit_size = 32;
881 alu->def.num_components *= 2;
882 switch (alu->op) {
883 case nir_op_pack_64_2x32_split:
884 alu->op = nir_op_vec2;
885 break;
886 case nir_op_pack_64_2x32:
887 alu->op = nir_op_mov;
888 break;
889 case nir_op_vec2:
890 return nir_vec4(b,
891 nir_channel(b, alu->src[0].src.ssa, 0),
892 nir_channel(b, alu->src[0].src.ssa, 1),
893 nir_channel(b, alu->src[1].src.ssa, 0),
894 nir_channel(b, alu->src[1].src.ssa, 1));
895 default:
896 return NULL;
897 }
898 return NIR_LOWER_INSTR_PROGRESS;
899 }
900 case nir_instr_type_phi: {
901 auto phi = nir_instr_as_phi(instr);
902 phi->def.bit_size = 32;
903 phi->def.num_components = 2;
904 return NIR_LOWER_INSTR_PROGRESS;
905 }
906 case nir_instr_type_load_const: {
907 auto lc = nir_instr_as_load_const(instr);
908 assert(lc->def.num_components <= 2);
909 nir_const_value val[4];
910 for (uint i = 0; i < lc->def.num_components; ++i) {
911 uint64_t v = lc->value[i].u64;
912 val[i * 2 + 0] = nir_const_value_for_uint(v & 0xffffffff, 32);
913 val[i * 2 + 1] = nir_const_value_for_uint(v >> 32, 32);
914 }
915
916 return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
917 }
918 case nir_instr_type_undef: {
919 auto undef = nir_instr_as_undef(instr);
920 undef->def.num_components *= 2;
921 undef->def.bit_size = 32;
922 return NIR_LOWER_INSTR_PROGRESS;
923 }
924 default:
925 return nullptr;
926 }
927 }
928
929 nir_def *
930 Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
931 {
932 auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
933 auto var = nir_intrinsic_get_var(intr, 0);
934 unsigned components = glsl_get_components(glsl_without_array(var->type));
935 if (glsl_get_bit_size(glsl_without_array(var->type)) == 64) {
936 components *= 2;
937 if (deref->deref_type == nir_deref_type_var) {
938 var->type = glsl_vec_type(components);
939 } else if (deref->deref_type == nir_deref_type_array) {
940
941 var->type =
942 glsl_array_type(glsl_vec_type(components), glsl_array_size(var->type), 0);
943
944 } else {
945 nir_print_shader(b->shader, stderr);
946 assert(0 && "Only lowring of var and array derefs supported\n");
947 }
948 }
949 deref->type = var->type;
950 if (deref->deref_type == nir_deref_type_array) {
951 auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
952 deref_array->type = var->type;
953 deref->type = glsl_without_array(deref_array->type);
954 }
955
956 intr->num_components = components;
957 intr->def.bit_size = 32;
958 intr->def.num_components = components;
959 return NIR_LOWER_INSTR_PROGRESS;
960 }
961
962 nir_def *
963 Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
964 {
965 auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
966 auto var = nir_intrinsic_get_var(intr, 0);
967
968 unsigned components = glsl_get_components(glsl_without_array(var->type));
969 unsigned wrmask = nir_intrinsic_write_mask(intr);
970 if (glsl_get_bit_size(glsl_without_array(var->type)) == 64) {
971 components *= 2;
972 if (deref->deref_type == nir_deref_type_var) {
973 var->type = glsl_vec_type(components);
974 } else if (deref->deref_type == nir_deref_type_array) {
975 var->type =
976 glsl_array_type(glsl_vec_type(components), glsl_array_size(var->type), 0);
977 } else {
978 nir_print_shader(b->shader, stderr);
979 assert(0 && "Only lowring of var and array derefs supported\n");
980 }
981 }
982 deref->type = var->type;
983 if (deref->deref_type == nir_deref_type_array) {
984 auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
985 deref_array->type = var->type;
986 deref->type = glsl_without_array(deref_array->type);
987 }
988 intr->num_components = components;
989 nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
990 return NIR_LOWER_INSTR_PROGRESS;
991 }
992
993 nir_def *
994 Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
995 {
996 intr->num_components *= 2;
997 intr->def.bit_size = 32;
998 intr->def.num_components *= 2;
999 nir_intrinsic_set_dest_type(intr, nir_type_float32);
1000 return NIR_LOWER_INSTR_PROGRESS;
1001 }
1002
1003 nir_def *
1004 Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
1005 {
1006 intr->num_components *= 2;
1007 intr->def.bit_size = 32;
1008 intr->def.num_components *= 2;
1009 if (nir_intrinsic_has_component(intr))
1010 nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
1011 return NIR_LOWER_INSTR_PROGRESS;
1012 }
1013
1014 nir_def *
1015 Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
1016 {
1017 intr->num_components *= 2;
1018 intr->def.bit_size = 32;
1019 intr->def.num_components *= 2;
1020 return NIR_LOWER_INSTR_PROGRESS;
1021 }
1022
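/* Source-iteration callback: record in *state whether a 64-bit source
 * was found and stop iterating as soon as one is seen. */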
1023 static bool
1024 store_64bit_intr(nir_src *src, void *state)
1025 {
1026 bool *s = (bool *)state;
1027 *s = nir_src_bit_size(*src) == 64;
1028 return !*s;
1029 }
1030
1031 static bool
1032 double2vec2(nir_src *src, UNUSED void *state)
1033 {
1034 if (nir_src_bit_size(*src) != 64)
1035 return true;
1036
1037 src->ssa->bit_size = 32;
1038 src->ssa->num_components *= 2;
1039 return true;
1040 }
1041
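/* Full 64-bit to vec2 lowering: collect the ALU instructions that
 * consume 64-bit sources and widen the write masks of 64-bit
 * output/global/SSBO stores up front, run Lower64BitToVec2, then rewrite
 * the collected ALU swizzles so that 64-bit source channel k is
 * addressed as the 32-bit channel pair (2*k, 2*k + 1). */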
1042 bool
1043 r600_nir_64_to_vec2(nir_shader *sh)
1044 {
1045 vector<nir_instr *> intr64bit;
1046 nir_foreach_function_impl(impl, sh)
1047 {
1048 nir_foreach_block(block, impl)
1049 {
1050 nir_foreach_instr_safe(instr, block)
1051 {
1052 switch (instr->type) {
1053 case nir_instr_type_alu: {
1054 bool success = false;
1055 nir_foreach_src(instr, store_64bit_intr, &success);
1056 if (success)
1057 intr64bit.push_back(instr);
1058 break;
1059 }
1060 case nir_instr_type_intrinsic: {
1061 auto ir = nir_instr_as_intrinsic(instr);
1062 switch (ir->intrinsic) {
1063 case nir_intrinsic_store_output:
1064 case nir_intrinsic_store_global:
1065 case nir_intrinsic_store_ssbo: {
1066 bool success = false;
1067 nir_foreach_src(instr, store_64bit_intr, &success);
1068 if (success) {
1069 auto wm = nir_intrinsic_write_mask(ir);
1070 nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
1071 ir->num_components *= 2;
1072 }
1073 break;
1074 }
1075 default:;
1076 }
1077 }
1078 default:;
1079 }
1080 }
1081 }
1082 }
1083
1084 bool result = Lower64BitToVec2().run(sh);
1085
1086 if (result || !intr64bit.empty()) {
1087
1088 for (auto&& instr : intr64bit) {
1089 if (instr->type == nir_instr_type_alu) {
1090 auto alu = nir_instr_as_alu(instr);
1091 auto alu_info = nir_op_infos[alu->op];
1092 for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
1093 int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
1094 for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
1095 if (!nir_alu_instr_channel_used(alu, i, k)) {
1096 continue;
1097 }
1098
1099 switch (alu->op) {
1100 case nir_op_unpack_64_2x32_split_x:
1101 swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
1102 alu->op = nir_op_mov;
1103 break;
1104 case nir_op_unpack_64_2x32_split_y:
1105 swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
1106 alu->op = nir_op_mov;
1107 break;
1108 case nir_op_unpack_64_2x32:
1109 alu->op = nir_op_mov;
1110 break;
1111 case nir_op_bcsel:
1112 if (i == 0) {
1113 swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
1114 break;
1115 }
1116 FALLTHROUGH;
1117 default:
1118 swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
1119 swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
1120 }
1121 }
1122 for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
1123 alu->src[i].swizzle[k] = swizzle[k];
1124 }
1125 }
1126 } else
1127 nir_foreach_src(instr, double2vec2, nullptr);
1128 }
1129 result = true;
1130 }
1131
1132 return result;
1133 }
1134
1135 using std::map;
1136 using std::pair;
1137 using std::vector;
1138
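/* StoreMerger collects store_output intrinsics per output slot (keyed by
 * base, emitted vertex and GS stream) and merges multiple narrow stores
 * to the same slot into a single store with a combined write mask. */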
1139 class StoreMerger {
1140 public:
1141 StoreMerger(nir_shader *shader);
1142 void collect_stores();
1143 bool combine();
1144 void combine_one_slot(vector<nir_intrinsic_instr *>& stores);
1145
1146 using StoreCombos = map<unsigned, vector<nir_intrinsic_instr *>>;
1147
1148 StoreCombos m_stores;
1149 nir_shader *sh;
1150 };
1151
1152 StoreMerger::StoreMerger(nir_shader *shader):
1153 sh(shader)
1154 {
1155 }
1156
1157 void
1158 StoreMerger::collect_stores()
1159 {
1160 unsigned vertex = 0;
1161 nir_foreach_function_impl(impl, sh)
1162 {
1163 nir_foreach_block(block, impl)
1164 {
1165 nir_foreach_instr_safe(instr, block)
1166 {
1167 if (instr->type != nir_instr_type_intrinsic)
1168 continue;
1169
1170 auto ir = nir_instr_as_intrinsic(instr);
1171 if (ir->intrinsic == nir_intrinsic_emit_vertex ||
1172 ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
1173 ++vertex;
1174 continue;
1175 }
1176 if (ir->intrinsic != nir_intrinsic_store_output)
1177 continue;
1178
1179 unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
1180 8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
1181 m_stores[index].push_back(ir);
1182 }
1183 }
1184 }
1185 }
1186
1187 bool
1188 StoreMerger::combine()
1189 {
1190 bool progress = false;
1191 for (auto&& i : m_stores) {
1192 if (i.second.size() < 2)
1193 continue;
1194
1195 combine_one_slot(i.second);
1196 progress = true;
1197 }
1198 return progress;
1199 }
1200
1201 void
1202 StoreMerger::combine_one_slot(vector<nir_intrinsic_instr *>& stores)
1203 {
1204 nir_def *srcs[4] = {nullptr};
1205
1206 auto last_store = *stores.rbegin();
1207
1208 nir_builder b = nir_builder_at(nir_before_instr(&last_store->instr));
1209
1210 unsigned comps = 0;
1211 unsigned writemask = 0;
1212 unsigned first_comp = 4;
1213 for (auto&& store : stores) {
1214 int cmp = nir_intrinsic_component(store);
1215 for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
1216 unsigned out_comp = i + cmp;
1217 srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
1218 writemask |= 1 << out_comp;
1219 if (first_comp > out_comp)
1220 first_comp = out_comp;
1221 }
1222 }
1223
1224 auto new_src = nir_vec(&b, srcs, comps);
1225
1226 nir_src_rewrite(&last_store->src[0], new_src);
1227 last_store->num_components = comps;
1228 nir_intrinsic_set_component(last_store, first_comp);
1229 nir_intrinsic_set_write_mask(last_store, writemask);
1230
1231 for (auto i = stores.begin(); i != stores.end() - 1; ++i)
1232 nir_instr_remove(&(*i)->instr);
1233 }
1234
1235 bool
1236 r600_merge_vec2_stores(nir_shader *shader)
1237 {
1238 r600::StoreMerger merger(shader);
1239 merger.collect_stores();
1240 return merger.combine();
1241 }
1242
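/* Split a 64-bit I/O or buffer intrinsic with more than two components
 * into two clones: the first covers the xy components, the second the
 * remaining ones. For shader I/O the second clone gets the next
 * location/base; for UBO/SSBO/uniform accesses its offset source is
 * bumped. Loads are merged back into one vector and store data is
 * distributed across the two halves. */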
1243 static bool
1244 r600_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
1245 {
1246 b->cursor = nir_after_instr(&instr->instr);
1247
1248 switch (instr->intrinsic) {
1249 case nir_intrinsic_load_ubo:
1250 case nir_intrinsic_load_ubo_vec4:
1251 case nir_intrinsic_load_uniform:
1252 case nir_intrinsic_load_ssbo:
1253 case nir_intrinsic_load_input:
1254 case nir_intrinsic_load_interpolated_input:
1255 case nir_intrinsic_load_per_vertex_input:
1256 case nir_intrinsic_store_output:
1257 case nir_intrinsic_store_per_vertex_output:
1258 case nir_intrinsic_store_ssbo:
1259 break;
1260 default:
1261 return false;
1262 }
1263
1264 if (instr->num_components <= 2)
1265 return false;
1266
1267 bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
1268 if (has_dest) {
1269 if (instr->def.bit_size != 64)
1270 return false;
1271 } else {
1272 if (nir_src_bit_size(instr->src[0]) != 64)
1273 return false;
1274 }
1275
1276 nir_intrinsic_instr *first =
1277 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
1278 nir_intrinsic_instr *second =
1279 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
1280
1281 switch (instr->intrinsic) {
1282 case nir_intrinsic_load_ubo:
1283 case nir_intrinsic_load_ubo_vec4:
1284 case nir_intrinsic_load_uniform:
1285 case nir_intrinsic_load_ssbo:
1286 case nir_intrinsic_store_ssbo:
1287 break;
1288
1289 default: {
1290 nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
1291 semantics.location++;
1292 semantics.num_slots--;
1293 nir_intrinsic_set_io_semantics(second, semantics);
1294
1295 nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
1296 break;
1297 }
1298 }
1299
1300 first->num_components = 2;
1301 second->num_components -= 2;
1302 if (has_dest) {
1303 first->def.num_components = 2;
1304 second->def.num_components -= 2;
1305 }
1306
1307 nir_builder_instr_insert(b, &first->instr);
1308 nir_builder_instr_insert(b, &second->instr);
1309
1310 if (has_dest) {
1311 /* Merge the two loads' results back into a vector. */
1312 nir_scalar channels[4] = {
1313 nir_get_scalar(&first->def, 0),
1314 nir_get_scalar(&first->def, 1),
1315 nir_get_scalar(&second->def, 0),
1316 nir_get_scalar(&second->def, second->num_components > 1 ? 1 : 0),
1317 };
1318 nir_def *new_ir = nir_vec_scalars(b, channels, instr->num_components);
1319 nir_def_rewrite_uses(&instr->def, new_ir);
1320 } else {
1321 /* Split the src value across the two stores. */
1322 b->cursor = nir_before_instr(&instr->instr);
1323
1324 nir_def *src0 = instr->src[0].ssa;
1325 nir_scalar channels[4] = {{0}};
1326 for (int i = 0; i < instr->num_components; i++)
1327 channels[i] = nir_get_scalar(src0, i);
1328
1329 nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
1330 nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
1331
1332 nir_src_rewrite(&first->src[0], nir_vec_scalars(b, channels, 2));
1333 nir_src_rewrite(&second->src[0],
1334 nir_vec_scalars(b, &channels[2], second->num_components));
1335 }
1336
1337 int offset_src = -1;
1338 uint32_t offset_amount = 16;
1339
1340 switch (instr->intrinsic) {
1341 case nir_intrinsic_load_ssbo:
1342 case nir_intrinsic_load_ubo:
1343 offset_src = 1;
1344 break;
1345 case nir_intrinsic_load_ubo_vec4:
1346 case nir_intrinsic_load_uniform:
1347 offset_src = 0;
1348 offset_amount = 1;
1349 break;
1350 case nir_intrinsic_store_ssbo:
1351 offset_src = 2;
1352 break;
1353 default:
1354 break;
1355 }
1356 if (offset_src != -1) {
1357 b->cursor = nir_before_instr(&second->instr);
1358 nir_def *second_offset =
1359 nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
1360 nir_src_rewrite(&second->src[offset_src], second_offset);
1361 }
1362
1363 /* DCE stores we generated with no writemask (nothing else does this
1364 * currently).
1365 */
1366 if (!has_dest) {
1367 if (nir_intrinsic_write_mask(first) == 0)
1368 nir_instr_remove(&first->instr);
1369 if (nir_intrinsic_write_mask(second) == 0)
1370 nir_instr_remove(&second->instr);
1371 }
1372
1373 nir_instr_remove(&instr->instr);
1374
1375 return true;
1376 }
1377
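/* Split a vec3/vec4 64-bit load_const into a vec2 constant plus a one-
 * or two-component constant and recombine the channels with nir_vec(). */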
1378 static bool
1379 r600_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
1380 {
1381 int num_components = instr->def.num_components;
1382
1383 if (instr->def.bit_size != 64 || num_components <= 2)
1384 return false;
1385
1386 b->cursor = nir_before_instr(&instr->instr);
1387
1388 nir_load_const_instr *first = nir_load_const_instr_create(b->shader, 2, 64);
1389 nir_load_const_instr *second =
1390 nir_load_const_instr_create(b->shader, num_components - 2, 64);
1391
1392 first->value[0] = instr->value[0];
1393 first->value[1] = instr->value[1];
1394 second->value[0] = instr->value[2];
1395 if (num_components == 4)
1396 second->value[1] = instr->value[3];
1397
1398 nir_builder_instr_insert(b, &first->instr);
1399 nir_builder_instr_insert(b, &second->instr);
1400
1401 nir_def *channels[4] = {
1402 nir_channel(b, &first->def, 0),
1403 nir_channel(b, &first->def, 1),
1404 nir_channel(b, &second->def, 0),
1405 num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
1406 };
1407 nir_def *new_ir = nir_vec(b, channels, num_components);
1408 nir_def_replace(&instr->def, new_ir);
1409
1410 return true;
1411 }
1412
1413 static bool
1414 r600_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr, UNUSED void *data)
1415 {
1416 switch (instr->type) {
1417 case nir_instr_type_load_const:
1418 return r600_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
1419
1420 case nir_instr_type_intrinsic:
1421 return r600_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
1422 default:
1423 return false;
1424 }
1425 }
1426
1427 bool
1428 r600_lower_64bit_to_vec2(nir_shader *s)
1429 {
1430 return nir_shader_instructions_pass(s,
1431 r600_lower_64bit_to_vec2_instr,
1432 nir_metadata_control_flow,
1433 NULL);
1434 }
1435
1436 } // end namespace r600
1437