/* -*- mesa-c++  -*-
 * Copyright 2020 Collabora LTD
 * Author: Gert Wollny <[email protected]>
 * SPDX-License-Identifier: MIT
 */

#include "nir.h"
#include "nir_builder.h"
#include "sfn_nir.h"

#include <iostream>
#include <map>
#include <vector>

namespace r600 {

using std::make_pair;
using std::map;
using std::pair;
using std::vector;

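/* Split loads, stores, and ALU reductions that touch three or four 64-bit
 * components into pieces that use at most two 64-bit components each (see
 * r600_nir_split_64bit_io below). */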
class LowerSplit64BitVar : public NirLowerInstruction {
public:
   ~LowerSplit64BitVar();
   using VarSplit = pair<nir_variable *, nir_variable *>;
   using VarMap = map<unsigned, VarSplit>;

   nir_def *split_double_load_deref(nir_intrinsic_instr *intr);

   nir_def *split_double_store_deref(nir_intrinsic_instr *intr);

private:
   nir_def *split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);

   nir_def *split_load_deref_var(nir_intrinsic_instr *intr);

   nir_def *split_store_deref_array(nir_intrinsic_instr *intr,
                                    nir_deref_instr *deref);

   nir_def *split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);

   VarSplit get_var_pair(nir_variable *old_var);

   nir_def *
   merge_64bit_loads(nir_def *load1, nir_def *load2, bool out_is_vec3);

   nir_def *split_double_load(nir_intrinsic_instr *load1);

   nir_def *split_store_output(nir_intrinsic_instr *store1);

   nir_def *split_double_load_uniform(nir_intrinsic_instr *intr);

   nir_def *split_double_load_ssbo(nir_intrinsic_instr *intr);

   nir_def *split_double_load_ubo(nir_intrinsic_instr *intr);

   nir_def *
   split_reduction(nir_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);

   nir_def *
   split_reduction3(nir_alu_instr *alu, nir_op op1, nir_op op2, nir_op reduction);

   nir_def *
   split_reduction4(nir_alu_instr *alu, nir_op op1, nir_op op2, nir_op reduction);

   nir_def *split_bcsel(nir_alu_instr *alu);

   nir_def *split_load_const(nir_load_const_instr *lc);

   bool filter(const nir_instr *instr) const override;
   nir_def *lower(nir_instr *instr) override;

   VarMap m_varmap;
   vector<nir_variable *> m_old_vars;
   vector<nir_instr *> m_old_stores;
};

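/* Rewrite 64-bit uniform and UBO loads as 32-bit loads with twice the
 * component count and re-pack pairs of 32-bit channels into 64-bit values;
 * roughly:
 *
 *    dvec2 x = load_uniform(...)
 * becomes
 *    vec4 t = load_uniform(...)
 *    dvec2 x = (pack_64_2x32_split(t.x, t.y), pack_64_2x32_split(t.z, t.w))
 */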
class LowerLoad64Uniform : public NirLowerInstruction {
   bool filter(const nir_instr *instr) const override;
   nir_def *lower(nir_instr *instr) override;
};

bool
LowerLoad64Uniform::filter(const nir_instr *instr) const
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   auto intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_uniform &&
       intr->intrinsic != nir_intrinsic_load_ubo &&
       intr->intrinsic != nir_intrinsic_load_ubo_vec4)
      return false;

   return intr->def.bit_size == 64;
}

nir_def *
LowerLoad64Uniform::lower(nir_instr *instr)
{
   auto intr = nir_instr_as_intrinsic(instr);
   int old_components = intr->def.num_components;
   assert(old_components <= 2);
   intr->def.num_components *= 2;
   intr->def.bit_size = 32;
   intr->num_components *= 2;

   if (intr->intrinsic == nir_intrinsic_load_ubo ||
       intr->intrinsic == nir_intrinsic_load_ubo_vec4)
      nir_intrinsic_set_component(intr, 2 * nir_intrinsic_component(intr));

   nir_def *result_vec[2] = {nullptr, nullptr};

   for (int i = 0; i < old_components; ++i) {
      result_vec[i] = nir_pack_64_2x32_split(b,
                                             nir_channel(b, &intr->def, 2 * i),
                                             nir_channel(b, &intr->def, 2 * i + 1));
   }
   if (old_components == 1)
      return result_vec[0];

   return nir_vec2(b, result_vec[0], result_vec[1]);
}

bool
r600_split_64bit_uniforms_and_ubo(nir_shader *sh)
{
   return LowerLoad64Uniform().run(sh);
}

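/* Lower 64-bit ALU operations and phi nodes that are not handled natively:
 * bcsel on 64-bit values, conversions between doubles and 32/64-bit
 * integers, and 64-bit phis, using 32-bit operations plus
 * (un)pack_64_2x32_split. */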
class LowerSplit64op : public NirLowerInstruction {
   bool filter(const nir_instr *instr) const override
   {
      switch (instr->type) {
      case nir_instr_type_alu: {
         auto alu = nir_instr_as_alu(instr);
         switch (alu->op) {
         case nir_op_bcsel:
            return alu->def.bit_size == 64;
         case nir_op_f2i32:
         case nir_op_f2u32:
         case nir_op_f2i64:
         case nir_op_f2u64:
         case nir_op_u2f64:
         case nir_op_i2f64:
            return nir_src_bit_size(alu->src[0].src) == 64;
         default:
            return false;
         }
      }
      case nir_instr_type_phi: {
         auto phi = nir_instr_as_phi(instr);
         return phi->def.bit_size == 64;
      }
      default:
         return false;
      }
   }

   nir_def *lower(nir_instr *instr) override
   {
      switch (instr->type) {
      case nir_instr_type_alu: {
         auto alu = nir_instr_as_alu(instr);
         switch (alu->op) {
         case nir_op_bcsel: {
            auto lo =
               nir_bcsel(b,
                         alu->src[0].src.ssa,
                         nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 1)),
                         nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 2)));
            auto hi =
               nir_bcsel(b,
                         alu->src[0].src.ssa,
                         nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 1)),
                         nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 2)));
            return nir_pack_64_2x32_split(b, lo, hi);
         }
         case nir_op_f2i32: {
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            auto gt0 = nir_fgt_imm(b, src, 0.0);
            auto abs_src = nir_fabs(b, src);
            auto value = nir_f2u32(b, abs_src);
            return nir_bcsel(b, gt0, value, nir_ineg(b, value));
         }
         case nir_op_f2u32: {
            /* fp32 doesn't hold sufficient bits to represent the full range
             * of u32, therefore we have to split the values, and because
             * f2f32 rounds, we have to remove the fractional part in the hi
             * bits. For values > UINT_MAX the result is undefined. */
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            src = nir_fadd(b, src, nir_fneg(b, nir_ffract(b, src)));
            auto gt0 = nir_fgt_imm(b, src, 0.0);
            auto highval = nir_fmul_imm(b, src, 1.0 / 65536.0);
            auto fract = nir_ffract(b, highval);
            auto high = nir_f2u32(b, nir_f2f32(b, nir_fadd(b, highval, nir_fneg(b, fract))));
            auto lowval = nir_fmul_imm(b, fract, 65536.0);
            auto low = nir_f2u32(b, nir_f2f32(b, lowval));
            return nir_bcsel(b,
                             gt0,
                             nir_ior(b, nir_ishl_imm(b, high, 16), low),
                             nir_imm_int(b, 0));
         }
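         /* u2f64/i2f64: convert the two 32-bit halves separately and
          * recombine as high * 2^32 + low (65536.0 * 65536.0 == 2^32);
          * for i2f64 only the high half carries the sign. */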
         case nir_op_u2f64: {
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            auto low = nir_unpack_64_2x32_split_x(b, src);
            auto high = nir_unpack_64_2x32_split_y(b, src);
            auto flow = nir_u2f64(b, low);
            auto fhigh = nir_u2f64(b, high);
            return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
         }
         case nir_op_i2f64: {
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            auto low = nir_unpack_64_2x32_split_x(b, src);
            auto high = nir_unpack_64_2x32_split_y(b, src);
            auto flow = nir_u2f64(b, low);
            auto fhigh = nir_i2f64(b, high);
            return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
         }
         default:
            unreachable("trying to lower instruction that was not in filter");
         }
      }
      case nir_instr_type_phi: {
         auto phi = nir_instr_as_phi(instr);
         auto phi_lo = nir_phi_instr_create(b->shader);
         auto phi_hi = nir_phi_instr_create(b->shader);
         nir_def_init(&phi_lo->instr, &phi_lo->def, phi->def.num_components, 32);
         nir_def_init(&phi_hi->instr, &phi_hi->def, phi->def.num_components, 32);
         nir_foreach_phi_src(s, phi)
         {
            /* Unpack each source in its predecessor block so that the new
             * phi sources dominate their uses. */
            b->cursor = nir_after_block_before_jump(s->pred);
            auto lo = nir_unpack_64_2x32_split_x(b, s->src.ssa);
            auto hi = nir_unpack_64_2x32_split_y(b, s->src.ssa);
            nir_phi_instr_add_src(phi_lo, s->pred, lo);
            nir_phi_instr_add_src(phi_hi, s->pred, hi);
         }
         nir_instr_insert(nir_before_block(instr->block), &phi_lo->instr);
         nir_instr_insert(nir_before_block(instr->block), &phi_hi->instr);
         b->cursor = nir_after_phis(instr->block);
         return nir_pack_64_2x32_split(b, &phi_lo->def, &phi_hi->def);
      }
      default:
         unreachable("Trying to lower instruction that was not in filter");
      }
   }
};

bool
r600_split_64bit_alu_and_phi(nir_shader *sh)
{
   return LowerSplit64op().run(sh);
}

bool
LowerSplit64BitVar::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
         if (intr->def.bit_size != 64)
            return false;
         return intr->def.num_components >= 3;
      case nir_intrinsic_store_output:
         if (nir_src_bit_size(intr->src[0]) != 64)
            return false;
         return nir_src_num_components(intr->src[0]) >= 3;
      case nir_intrinsic_store_deref:
         if (nir_src_bit_size(intr->src[1]) != 64)
            return false;
         return nir_src_num_components(intr->src[1]) >= 3;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_bcsel:
         if (alu->def.num_components < 3)
            return false;
         return alu->def.bit_size == 64;
      case nir_op_bany_fnequal3:
      case nir_op_bany_fnequal4:
      case nir_op_ball_fequal3:
      case nir_op_ball_fequal4:
      case nir_op_bany_inequal3:
      case nir_op_bany_inequal4:
      case nir_op_ball_iequal3:
      case nir_op_ball_iequal4:
      case nir_op_fdot3:
      case nir_op_fdot4:
         return nir_src_bit_size(alu->src[1].src) == 64;
      default:
         return false;
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      if (lc->def.bit_size != 64)
         return false;
      return lc->def.num_components >= 3;
   }
   default:
      return false;
   }
}

nir_def *
LowerSplit64BitVar::merge_64bit_loads(nir_def *load1,
                                      nir_def *load2,
                                      bool out_is_vec3)
{
   if (out_is_vec3)
      return nir_vec3(b,
                      nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0));
   else
      return nir_vec4(b,
                      nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0),
                      nir_channel(b, load2, 1));
}

LowerSplit64BitVar::~LowerSplit64BitVar()
{
   for (auto&& v : m_old_vars)
      exec_node_remove(&v->node);

   for (auto&& v : m_old_stores)
      nir_instr_remove(v);
}

nir_def *
LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_store_deref_var(intr, deref);
   else if (deref->deref_type == nir_deref_type_array)
      return split_store_deref_array(intr, deref);
   else {
      unreachable("only splitting of stores to vars and arrays is supported");
   }
}

nir_def *
LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_load_deref_var(intr);
   else if (deref->deref_type == nir_deref_type_array)
      return split_load_deref_array(intr, deref->arr.index);
   else {
      unreachable("only splitting of loads from vars and arrays is supported");
   }
}

nir_def *
LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));

   assert(old_components > 2 && old_components <= 4);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 = nir_build_deref_array(b, deref1, index.ssa);
   auto load1 =
      nir_build_load_deref(b, 2, 64, &deref_array1->def, (enum gl_access_qualifier)0);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 = nir_build_deref_array(b, deref2, index.ssa);

   auto load2 = nir_build_load_deref(
      b, old_components - 2, 64, &deref_array2->def, (enum gl_access_qualifier)0);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

nir_def *
LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr,
                                            nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_trim_vector(b, intr->src[1].ssa, 2);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 =
      nir_build_deref_array(b, deref1, deref->arr.index.ssa);

   nir_build_store_deref(b, &deref_array1->def, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 =
      nir_build_deref_array(b, deref2, deref->arr.index.ssa);

   if (old_components == 3)
      nir_build_store_deref(b,
                            &deref_array2->def,
                            nir_channel(b, intr->src[1].ssa, 2),
                            1);
   else
      nir_build_store_deref(b,
                            &deref_array2->def,
                            nir_channels(b, intr->src[1].ssa, 0xc),
                            3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_def *
LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr,
                                          UNUSED nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_trim_vector(b, intr->src[1].ssa, 2);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   nir_build_store_deref(b, &deref1->def, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   if (old_components == 3)
      nir_build_store_deref(b, &deref2->def, nir_channel(b, intr->src[1].ssa, 2), 1);
   else
      nir_build_store_deref(b,
                            &deref2->def,
                            nir_channels(b, intr->src[1].ssa, 0xc),
                            3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_def *
LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   auto vars = get_var_pair(old_var);
   unsigned old_components = glsl_get_components(old_var->type);

   nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
   auto *load1 = nir_load_deref(b, deref1);

   nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
   deref2->type = vars.second->type;

   auto *load2 = nir_load_deref(b, deref2);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

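/* Look up (or create on first use) the pair of variables that replaces a
 * 3- or 4-component 64-bit variable: a dvec2 for .xy plus a variable for
 * the remaining components, placed in the next driver location for shader
 * IO. */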
LowerSplit64BitVar::VarSplit
LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
{
   auto split_vars = m_varmap.find(old_var->data.driver_location);

   assert(glsl_get_components(glsl_without_array(old_var->type)) > 2);

   if (split_vars == m_varmap.end()) {
      auto var1 = nir_variable_clone(old_var, b->shader);
      auto var2 = nir_variable_clone(old_var, b->shader);

      var1->type = glsl_dvec_type(2);
      var2->type = glsl_dvec_type(glsl_get_components(glsl_without_array(old_var->type)) - 2);

      if (glsl_type_is_array(old_var->type)) {
         var1->type = glsl_array_type(var1->type, glsl_array_size(old_var->type), 0);
         var2->type = glsl_array_type(var2->type, glsl_array_size(old_var->type), 0);
      }

      if (old_var->data.mode == nir_var_shader_in ||
          old_var->data.mode == nir_var_shader_out) {
         ++var2->data.driver_location;
         ++var2->data.location;
         nir_shader_add_variable(b->shader, var1);
         nir_shader_add_variable(b->shader, var2);
      } else if (old_var->data.mode == nir_var_function_temp) {
         exec_list_push_tail(&b->impl->locals, &var1->node);
         exec_list_push_tail(&b->impl->locals, &var2->node);
      }

      m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
   }
   return m_varmap[old_var->data.driver_location];
}

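/* Split a 3- or 4-component 64-bit input load into one dvec2 load from the
 * original IO slot and a second load from the following slot. */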
nir_def *
LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
{
   unsigned old_components = load1->def.num_components;
   auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
   nir_io_semantics sem = nir_intrinsic_io_semantics(load1);

   load1->def.num_components = 2;
   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(load1, sem);

   load2->def.num_components = old_components - 2;
   sem.location += 1;
   nir_intrinsic_set_io_semantics(load2, sem);
   nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
   nir_builder_instr_insert(b, &load2->instr);

   return merge_64bit_loads(&load1->def, &load2->def, old_components == 3);
}

nir_def *
LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
{
   auto src = store1->src[0];
   unsigned old_components = nir_src_num_components(src);
   nir_io_semantics sem = nir_intrinsic_io_semantics(store1);

   auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
   auto src1 = nir_trim_vector(b, src.ssa, 2);
   auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);

   nir_src_rewrite(&store1->src[0], src1);
   nir_intrinsic_set_write_mask(store1, 3);

   nir_src_rewrite(&store2->src[0], src2);
   nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);

   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(store1, sem);

   sem.location += 1;
   nir_intrinsic_set_io_semantics(store2, sem);
   nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));

   nir_builder_instr_insert(b, &store2->instr);
   return NIR_LOWER_INSTR_PROGRESS;
}

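/* Split a wide 64-bit uniform load; the second load reads the remaining
 * components from the next vec4 slot (offset + 1). */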
nir_def *
LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
{
   unsigned second_components = intr->def.num_components - 2;
   nir_intrinsic_instr *load2 =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
   load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   load2->num_components = second_components;

   nir_def_init(&load2->instr, &load2->def, second_components, 64);
   nir_builder_instr_insert(b, &load2->instr);

   intr->def.num_components = intr->num_components = 2;

   if (second_components == 1)
      return nir_vec3(b,
                      nir_channel(b, &intr->def, 0),
                      nir_channel(b, &intr->def, 1),
                      nir_channel(b, &load2->def, 0));
   else
      return nir_vec4(b,
                      nir_channel(b, &intr->def, 0),
                      nir_channel(b, &intr->def, 1),
                      nir_channel(b, &load2->def, 0),
                      nir_channel(b, &load2->def, 1));
}

nir_def *
LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
{
   unsigned second_components = intr->def.num_components - 2;
   nir_intrinsic_instr *load2 =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));

   nir_src_rewrite(&load2->src[0], nir_iadd_imm(b, intr->src[0].ssa, 1));
   load2->num_components = second_components;
   nir_def_init(&load2->instr, &load2->def, second_components, 64);

   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_builder_instr_insert(b, &load2->instr);

   intr->def.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->def, &load2->def, second_components == 1);
}

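/* Same splitting for UBO loads; here the offset is in bytes, so the second
 * load starts 16 bytes (one dvec2 slot) further in. */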
nir_def *
LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
{
   unsigned second_components = intr->def.num_components - 2;
   nir_intrinsic_instr *load2 =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
   load2->src[0] = intr->src[0];
   load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
   nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
   nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
   nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr));

   load2->num_components = second_components;

   nir_def_init(&load2->instr, &load2->def, second_components, 64);
   nir_builder_instr_insert(b, &load2->instr);

   intr->def.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->def, &load2->def, second_components == 1);
}

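/* Evaluate a horizontal reduction in two halves and combine the partial
 * results, e.g. fdot4(a, b) -> fadd(fdot2(a.xy, b.xy), fdot2(a.zw, b.zw))
 * and fdot3(a, b) -> fadd(fdot2(a.xy, b.xy), fmul(a.z, b.z)). */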
nir_def *
LowerSplit64BitVar::split_reduction(nir_def *src[2][2],
                                    nir_op op1,
                                    nir_op op2,
                                    nir_op reduction)
{
   auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
   auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
   return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
}

nir_def *
LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
                                     nir_op op1,
                                     nir_op op2,
                                     nir_op reduction)
{
   nir_def *src[2][2];

   src[0][0] = nir_trim_vector(b, alu->src[0].src.ssa, 2);
   src[0][1] = nir_trim_vector(b, alu->src[1].src.ssa, 2);

   src[1][0] = nir_channel(b, alu->src[0].src.ssa, 2);
   src[1][1] = nir_channel(b, alu->src[1].src.ssa, 2);

   return split_reduction(src, op1, op2, reduction);
}

nir_def *
LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
                                     nir_op op1,
                                     nir_op op2,
                                     nir_op reduction)
{
   nir_def *src[2][2];

   src[0][0] = nir_trim_vector(b, alu->src[0].src.ssa, 2);
   src[0][1] = nir_trim_vector(b, alu->src[1].src.ssa, 2);

   src[1][0] = nir_channels(b, alu->src[0].src.ssa, 0xc);
   src[1][1] = nir_channels(b, alu->src[1].src.ssa, 0xc);

   return split_reduction(src, op1, op2, reduction);
}

nir_def *
LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
{
   nir_def *dest[4];
   for (unsigned i = 0; i < alu->def.num_components; ++i) {
      dest[i] = nir_bcsel(b,
                          nir_channel(b, alu->src[0].src.ssa, i),
                          nir_channel(b, alu->src[1].src.ssa, i),
                          nir_channel(b, alu->src[2].src.ssa, i));
   }
   return nir_vec(b, dest, alu->def.num_components);
}

nir_def *
LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
{
   nir_def *ir[4];
   for (unsigned i = 0; i < lc->def.num_components; ++i)
      ir[i] = nir_imm_double(b, lc->value[i].f64);

   return nir_vec(b, ir, lc->def.num_components);
}

nir_def *
LowerSplit64BitVar::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return this->split_double_load_deref(intr);
      case nir_intrinsic_load_uniform:
         return split_double_load_uniform(intr);
      case nir_intrinsic_load_ubo:
         return split_double_load_ubo(intr);
      case nir_intrinsic_load_ssbo:
         return split_double_load_ssbo(intr);
      case nir_intrinsic_load_input:
         return split_double_load(intr);
      case nir_intrinsic_store_output:
         return split_store_output(intr);
      case nir_intrinsic_store_deref:
         return split_double_store_deref(intr);
      default:
         assert(0);
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_bany_fnequal3:
         return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
      case nir_op_ball_fequal3:
         return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
      case nir_op_bany_inequal3:
         return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
      case nir_op_ball_iequal3:
         return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
      case nir_op_fdot3:
         return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
      case nir_op_bany_fnequal4:
         return split_reduction4(alu,
                                 nir_op_bany_fnequal2,
                                 nir_op_bany_fnequal2,
                                 nir_op_ior);
      case nir_op_ball_fequal4:
         return split_reduction4(alu,
                                 nir_op_ball_fequal2,
                                 nir_op_ball_fequal2,
                                 nir_op_iand);
      case nir_op_bany_inequal4:
         return split_reduction4(alu,
                                 nir_op_bany_inequal2,
                                 nir_op_bany_inequal2,
                                 nir_op_ior);
      case nir_op_ball_iequal4:
         return split_reduction4(alu,
                                 nir_op_ball_iequal2,
                                 nir_op_ball_iequal2,
                                 nir_op_iand);
      case nir_op_fdot4:
         return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
      case nir_op_bcsel:
         return split_bcsel(alu);
      default:
         assert(0);
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return split_load_const(lc);
   }
   default:
      assert(0);
   }
   return nullptr;
}

/* Split 64-bit instructions so that at most two 64-bit components are
 * used in one instruction. */

bool
r600_nir_split_64bit_io(nir_shader *sh)
{
   return LowerSplit64BitVar().run(sh);
}

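/* Retype the remaining (at most two-component) 64-bit values so that each
 * double is represented as two 32-bit channels: loads and stores double
 * their component count, pack/unpack ALU ops become moves, and constants
 * and undefs are split into 32-bit halves. */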
class Lower64BitToVec2 : public NirLowerInstruction {

private:
   bool filter(const nir_instr *instr) const override;
   nir_def *lower(nir_instr *instr) override;

   nir_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *load_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *store_64_to_vec2(nir_intrinsic_instr *intr);
};

bool
Lower64BitToVec2::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_ubo_vec4:
      case nir_intrinsic_load_ssbo:
         return intr->def.bit_size == 64;
      case nir_intrinsic_store_deref: {
         if (nir_src_bit_size(intr->src[1]) == 64)
            return true;
         auto var = nir_intrinsic_get_var(intr, 0);
         if (glsl_get_bit_size(glsl_without_array(var->type)) == 64)
            return true;
         return (glsl_get_components(glsl_without_array(var->type)) != intr->num_components);
      }
      case nir_intrinsic_store_global:
         return nir_src_bit_size(intr->src[0]) == 64;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      return alu->def.bit_size == 64;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      return phi->def.bit_size == 64;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return lc->def.bit_size == 64;
   }
   case nir_instr_type_undef: {
      auto undef = nir_instr_as_undef(instr);
      return undef->def.bit_size == 64;
   }
   default:
      return false;
   }
}

nir_def *
Lower64BitToVec2::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return load_deref_64_to_vec2(intr);
      case nir_intrinsic_load_uniform:
         return load_uniform_64_to_vec2(intr);
      case nir_intrinsic_load_ssbo:
         return load_ssbo_64_to_vec2(intr);
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ubo_vec4:
         return load_64_to_vec2(intr);
      case nir_intrinsic_store_deref:
         return store_64_to_vec2(intr);
      default:
         return nullptr;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      alu->def.bit_size = 32;
      alu->def.num_components *= 2;
      switch (alu->op) {
      case nir_op_pack_64_2x32_split:
         alu->op = nir_op_vec2;
         break;
      case nir_op_pack_64_2x32:
         alu->op = nir_op_mov;
         break;
      case nir_op_vec2:
         return nir_vec4(b,
                         nir_channel(b, alu->src[0].src.ssa, 0),
                         nir_channel(b, alu->src[0].src.ssa, 1),
                         nir_channel(b, alu->src[1].src.ssa, 0),
                         nir_channel(b, alu->src[1].src.ssa, 1));
      default:
         return NULL;
      }
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      phi->def.bit_size = 32;
      phi->def.num_components = 2;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      assert(lc->def.num_components <= 2);
      nir_const_value val[4];
      for (uint i = 0; i < lc->def.num_components; ++i) {
         uint64_t v = lc->value[i].u64;
         val[i * 2 + 0] = nir_const_value_for_uint(v & 0xffffffff, 32);
         val[i * 2 + 1] = nir_const_value_for_uint(v >> 32, 32);
      }

      return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
   }
   case nir_instr_type_undef: {
      auto undef = nir_instr_as_undef(instr);
      undef->def.num_components *= 2;
      undef->def.bit_size = 32;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   default:
      return nullptr;
   }
}

nir_def *
Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);
   unsigned components = glsl_get_components(glsl_without_array(var->type));
   if (glsl_get_bit_size(glsl_without_array(var->type)) == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {
         var->type =
            glsl_array_type(glsl_vec_type(components), glsl_array_size(var->type), 0);
      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = glsl_without_array(deref_array->type);
   }

   intr->num_components = components;
   intr->def.bit_size = 32;
   intr->def.num_components = components;
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);

   unsigned components = glsl_get_components(glsl_without_array(var->type));
   unsigned wrmask = nir_intrinsic_write_mask(intr);
   if (glsl_get_bit_size(glsl_without_array(var->type)) == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {
         var->type =
            glsl_array_type(glsl_vec_type(components), glsl_array_size(var->type), 0);
      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = glsl_without_array(deref_array->type);
   }
   intr->num_components = components;
   nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->def.bit_size = 32;
   intr->def.num_components *= 2;
   nir_intrinsic_set_dest_type(intr, nir_type_float32);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->def.bit_size = 32;
   intr->def.num_components *= 2;
   if (nir_intrinsic_has_component(intr))
      nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->def.bit_size = 32;
   intr->def.num_components *= 2;
   return NIR_LOWER_INSTR_PROGRESS;
}

static bool
store_64bit_intr(nir_src *src, void *state)
{
   bool *s = (bool *)state;
   *s = nir_src_bit_size(*src) == 64;
   return !*s;
}

static bool
double2vec2(nir_src *src, UNUSED void *state)
{
   if (nir_src_bit_size(*src) != 64)
      return true;

   src->ssa->bit_size = 32;
   src->ssa->num_components *= 2;
   return true;
}

bool
r600_nir_64_to_vec2(nir_shader *sh)
{
   vector<nir_instr *> intr64bit;
   nir_foreach_function_impl(impl, sh)
   {
      nir_foreach_block(block, impl)
      {
         nir_foreach_instr_safe(instr, block)
         {
            switch (instr->type) {
            case nir_instr_type_alu: {
               bool success = false;
               nir_foreach_src(instr, store_64bit_intr, &success);
               if (success)
                  intr64bit.push_back(instr);
               break;
            }
            case nir_instr_type_intrinsic: {
               auto ir = nir_instr_as_intrinsic(instr);
               switch (ir->intrinsic) {
               case nir_intrinsic_store_output:
               case nir_intrinsic_store_global:
               case nir_intrinsic_store_ssbo: {
                  bool success = false;
                  nir_foreach_src(instr, store_64bit_intr, &success);
                  if (success) {
                     auto wm = nir_intrinsic_write_mask(ir);
                     nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
                     ir->num_components *= 2;
                  }
                  break;
               }
               default:;
               }
            }
            default:;
            }
         }
      }
   }

   bool result = Lower64BitToVec2().run(sh);

   if (result || !intr64bit.empty()) {

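      /* Every 64-bit SSA def was retyped above to twice as many 32-bit
       * components, so ALU sources that consumed 64-bit values need their
       * swizzles doubled: component k of the old value now lives in
       * components 2k (low half) and 2k + 1 (high half). */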
      for (auto&& instr : intr64bit) {
         if (instr->type == nir_instr_type_alu) {
            auto alu = nir_instr_as_alu(instr);
            auto alu_info = nir_op_infos[alu->op];
            for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
               int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
                  if (!nir_alu_instr_channel_used(alu, i, k)) {
                     continue;
                  }

                  switch (alu->op) {
                  case nir_op_unpack_64_2x32_split_x:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32_split_y:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32:
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_bcsel:
                     if (i == 0) {
                        swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
                        break;
                     }
                     FALLTHROUGH;
                  default:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
                  }
               }
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
                  alu->src[i].swizzle[k] = swizzle[k];
               }
            }
         } else
            nir_foreach_src(instr, double2vec2, nullptr);
      }
      result = true;
   }

   return result;
}
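/* Merge partial store_output instructions that write different components
 * of the same output slot (keyed by base location, emitted vertex, and GS
 * stream) into a single masked store. */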
class StoreMerger {
public:
   StoreMerger(nir_shader *shader);
   void collect_stores();
   bool combine();
   void combine_one_slot(vector<nir_intrinsic_instr *>& stores);

   using StoreCombos = map<unsigned, vector<nir_intrinsic_instr *>>;

   StoreCombos m_stores;
   nir_shader *sh;
};

StoreMerger::StoreMerger(nir_shader *shader):
    sh(shader)
{
}

void
StoreMerger::collect_stores()
{
   unsigned vertex = 0;
   nir_foreach_function_impl(impl, sh)
   {
      nir_foreach_block(block, impl)
      {
         nir_foreach_instr_safe(instr, block)
         {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            auto ir = nir_instr_as_intrinsic(instr);
            if (ir->intrinsic == nir_intrinsic_emit_vertex ||
                ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
               ++vertex;
               continue;
            }
            if (ir->intrinsic != nir_intrinsic_store_output)
               continue;

            unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
                             8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
            m_stores[index].push_back(ir);
         }
      }
   }
}

bool
StoreMerger::combine()
{
   bool progress = false;
   for (auto&& i : m_stores) {
      if (i.second.size() < 2)
         continue;

      combine_one_slot(i.second);
      progress = true;
   }
   return progress;
}

void
StoreMerger::combine_one_slot(vector<nir_intrinsic_instr *>& stores)
{
   nir_def *srcs[4] = {nullptr};

   auto last_store = *stores.rbegin();

   nir_builder b = nir_builder_at(nir_before_instr(&last_store->instr));

   unsigned comps = 0;
   unsigned writemask = 0;
   unsigned first_comp = 4;
   for (auto&& store : stores) {
      int cmp = nir_intrinsic_component(store);
      for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
         unsigned out_comp = i + cmp;
         srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
         writemask |= 1 << out_comp;
         if (first_comp > out_comp)
            first_comp = out_comp;
      }
   }

   auto new_src = nir_vec(&b, srcs, comps);

   nir_src_rewrite(&last_store->src[0], new_src);
   last_store->num_components = comps;
   nir_intrinsic_set_component(last_store, first_comp);
   nir_intrinsic_set_write_mask(last_store, writemask);

   for (auto i = stores.begin(); i != stores.end() - 1; ++i)
      nir_instr_remove(&(*i)->instr);
}

bool
r600_merge_vec2_stores(nir_shader *shader)
{
   r600::StoreMerger merger(shader);
   merger.collect_stores();
   return merger.combine();
}

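/* Split an IO intrinsic that touches more than two 64-bit components into
 * two clones covering the xy and zw halves, fixing up IO semantics, bases,
 * offsets, and write masks, before the 64-to-vec2 retyping below. */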
static bool
r600_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
{
   b->cursor = nir_after_instr(&instr->instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
   case nir_intrinsic_store_ssbo:
      break;
   default:
      return false;
   }

   if (instr->num_components <= 2)
      return false;

   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
   if (has_dest) {
      if (instr->def.bit_size != 64)
         return false;
   } else {
      if (nir_src_bit_size(instr->src[0]) != 64)
         return false;
   }

   nir_intrinsic_instr *first =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
   nir_intrinsic_instr *second =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
      break;

   default: {
      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
      semantics.location++;
      semantics.num_slots--;
      nir_intrinsic_set_io_semantics(second, semantics);

      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
      break;
   }
   }

   first->num_components = 2;
   second->num_components -= 2;
   if (has_dest) {
      first->def.num_components = 2;
      second->def.num_components -= 2;
   }

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   if (has_dest) {
      /* Merge the two loads' results back into a vector. */
      nir_scalar channels[4] = {
         nir_get_scalar(&first->def, 0),
         nir_get_scalar(&first->def, 1),
         nir_get_scalar(&second->def, 0),
         nir_get_scalar(&second->def, second->num_components > 1 ? 1 : 0),
      };
      nir_def *new_ir = nir_vec_scalars(b, channels, instr->num_components);
      nir_def_rewrite_uses(&instr->def, new_ir);
   } else {
      /* Split the src value across the two stores. */
      b->cursor = nir_before_instr(&instr->instr);

      nir_def *src0 = instr->src[0].ssa;
      nir_scalar channels[4] = {{0}};
      for (int i = 0; i < instr->num_components; i++)
         channels[i] = nir_get_scalar(src0, i);

      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);

      nir_src_rewrite(&first->src[0], nir_vec_scalars(b, channels, 2));
      nir_src_rewrite(&second->src[0],
                      nir_vec_scalars(b, &channels[2], second->num_components));
   }

   int offset_src = -1;
   uint32_t offset_amount = 16;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ubo:
      offset_src = 1;
      break;
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_uniform:
      offset_src = 0;
      offset_amount = 1;
      break;
   case nir_intrinsic_store_ssbo:
      offset_src = 2;
      break;
   default:
      break;
   }
   if (offset_src != -1) {
      b->cursor = nir_before_instr(&second->instr);
      nir_def *second_offset =
         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
      nir_src_rewrite(&second->src[offset_src], second_offset);
   }

   /* DCE stores we generated with no writemask (nothing else does this
    * currently).
    */
   if (!has_dest) {
      if (nir_intrinsic_write_mask(first) == 0)
         nir_instr_remove(&first->instr);
      if (nir_intrinsic_write_mask(second) == 0)
         nir_instr_remove(&second->instr);
   }

   nir_instr_remove(&instr->instr);

   return true;
}

static bool
r600_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (instr->def.bit_size != 64 || num_components <= 2)
      return false;

   b->cursor = nir_before_instr(&instr->instr);

   nir_load_const_instr *first = nir_load_const_instr_create(b->shader, 2, 64);
   nir_load_const_instr *second =
      nir_load_const_instr_create(b->shader, num_components - 2, 64);

   first->value[0] = instr->value[0];
   first->value[1] = instr->value[1];
   second->value[0] = instr->value[2];
   if (num_components == 4)
      second->value[1] = instr->value[3];

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   nir_def *channels[4] = {
      nir_channel(b, &first->def, 0),
      nir_channel(b, &first->def, 1),
      nir_channel(b, &second->def, 0),
      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
   };
   nir_def *new_ir = nir_vec(b, channels, num_components);
   nir_def_replace(&instr->def, new_ir);

   return true;
}

static bool
r600_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
      return r600_lower_64bit_load_const(b, nir_instr_as_load_const(instr));

   case nir_instr_type_intrinsic:
      return r600_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
   default:
      return false;
   }
}

bool
r600_lower_64bit_to_vec2(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       r600_lower_64bit_to_vec2_instr,
                                       nir_metadata_control_flow,
                                       NULL);
}

} // end namespace r600