ir3.c (revision 6104692788411f58d303aa86923a9ff6ecaded22) - OpenGrok cross reference for /aosp_15_r20/external/mesa3d/src/freedreno/ir3/ir3.c

/*
 * Copyright © 2012 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* Simple allocator that hands out zero-initialized memory parented to the
 * shader's ralloc context, so that we can free everything easily in one
 * shot (see ir3_destroy()).
 *
 * Returns a pointer to sz bytes of zeroed memory.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

/* Allocate a fresh, empty ir3 for the given variant.  The new IR is
 * ralloc-parented to the variant, inherits the variant's shader type, and
 * starts out with empty block and array lists.
 */
struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *ir = rzalloc(v, struct ir3);

   list_inithead(&ir->array_list);
   list_inithead(&ir->block_list);

   ir->type = v->type;
   ir->compiler = compiler;

   return ir;
}

/* Free the shader IR; implemented as a single ralloc_free() of the ir3
 * context itself.
 */
void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

/* Check whether reg is a const register that falls inside the shared-consts
 * window, i.e. the half-open range
 *
 *    [shared_consts_base_offset, shared_consts_base_offset + shared_consts_size)
 *
 * expressed in units of regid(). Only applies when the variant's push
 * constants are of the IR3_PUSH_CONSTS_SHARED type; returns false otherwise
 * or for non-const registers.
 */
static bool
is_shared_consts(struct ir3_compiler *compiler,
                 const struct ir3_const_state *const_state,
                 struct ir3_register *reg)
{
   if (const_state->push_consts_type != IR3_PUSH_CONSTS_SHARED ||
       !(reg->flags & IR3_REG_CONST))
      return false;

   uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
   uint32_t max_const_reg =
      regid(compiler->shared_consts_base_offset +
            compiler->shared_consts_size, 0);

   /* Both bounds have to be tested against the register itself; comparing
    * min against max would accept every const at or above the window start.
    */
   return reg->num >= min_const_reg && reg->num < max_const_reg;
}

/* Fold a single register operand into the running high-water marks kept in
 * info (max_const, max_reg, max_half_reg).  Immediates and shared consts are
 * ignored, as are GPRs at or above regid(48, 0).
 */
static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *variant = info->data;

   /* immediates don't occupy a register, nothing to do */
   if (reg->flags & IR3_REG_IMMED)
      return;

   /* Shared consts don't need to be included into constlen. */
   if (is_shared_consts(variant->compiler, ir3_const_state(variant), reg))
      return;

   /* Highest component touched by this operand: */
   unsigned components;
   int16_t max;

   if (!(reg->flags & IR3_REG_RELATIV)) {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + components - 1);
   } else {
      components = reg->size;
      max = (reg->array.base + components - 1);
   }

   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
      return;
   }

   if (max >= regid(48, 0))
      return;

   if (!(reg->flags & IR3_REG_HALF)) {
      info->max_reg = MAX2(info->max_reg, max >> 2);
   } else if (variant->mergedregs) {
      /* starting w/ a6xx, half regs conflict with full regs: */
      info->max_reg = MAX2(info->max_reg, max >> 3);
   } else {
      info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
   }
}

/* Decide whether the variant should run with the doubled threadsize, given
 * the number of (vec4) registers it uses.
 */
bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->compiler;

   /* If the user forced a particular wavesize respect that. */
   switch (v->shader_options.real_wavesize) {
   case IR3_SINGLE_ONLY:
      return false;
   case IR3_DOUBLE_ONLY:
      return true;
   default:
      break;
   }

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   const bool is_compute =
      v->type == MESA_SHADER_KERNEL || v->type == MESA_SHADER_COMPUTE;

   /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
    * stages - the bit doesn't exist. The blob never used it for the VS
    * on earlier gen's anyway.
    */
   if (!is_compute && v->type != MESA_SHADER_FRAGMENT)
      return false;

   if (is_compute) {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable &&
          threads_per_wg <= compiler->threadsize_base)
         return false;
   }

   /* Compute (a6xx+) and fragment shaders: check that doubling the
    * threadsize wouldn't exceed the regfile size.
    */
   return regs_count * 2 <= compiler->reg_size_vec4;
}

/* Upper bound on the number of waves that does not depend on register
 * usage: starts from compiler->max_waves and is narrowed by the branchstack
 * and, for compute/kernel shaders, by shared memory usage.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->compiler;
   unsigned limit = compiler->max_waves;

   /* Branchstack based limit: */
   if (v->branchstack > 0) {
      unsigned by_branchstack = compiler->branchstack_size /
                                v->branchstack *
                                compiler->wave_granularity;
      limit = MIN2(limit, by_branchstack);
   }

   /* Everything below only applies to compute shaders. */
   const bool is_compute = (v->type == MESA_SHADER_COMPUTE) ||
                           (v->type == MESA_SHADER_KERNEL);
   if (!is_compute)
      return limit;

   unsigned threads_per_wg =
      v->local_size[0] * v->local_size[1] * v->local_size[2];
   unsigned waves_per_wg =
      DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                      (double_threadsize ? 2 : 1) *
                                      compiler->wave_granularity);

   /* Shared is allocated in chunks of 1k */
   unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
   if (shared_per_wg > 0 && !v->local_size_variable) {
      unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;

      limit = MIN2(limit, waves_per_wg * wgs_per_core *
                             compiler->wave_granularity);
   }

   /* If we have a compute shader that has a big workgroup, a barrier, and
    * a branchstack which limits max_waves - this may result in a situation
    * when we cannot run concurrently all waves of the workgroup, which
    * would lead to a hang.
    *
    * TODO: Could we spill branchstack or is there other way around?
    * Blob just explodes in such case.
    */
   if (v->has_barrier && (limit < waves_per_wg)) {
      mesa_loge(
         "Compute shader (%s) which has workgroup barrier cannot be used "
         "because it's impossible to have enough concurrent waves.",
         v->name);
      exit(1);
   }

   return limit;
}

/* Upper bound on the number of waves imposed by the register file size.  A
 * shader that uses no registers at all is only limited by
 * compiler->max_waves.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   if (!reg_count)
      return compiler->max_waves;

   return compiler->reg_size_vec4 /
          (reg_count * (double_threadsize ? 2 : 1)) *
          compiler->wave_granularity;
}

/* Walk the instruction stream of variant v and (re)compute v->info from
 * scratch: program size, register/const high-water marks, instruction, nop
 * and mov/cov counts, (ss)/(sy) sync statistics, ldp/stp counts, and finally
 * the threadsize/max_waves decision.  Also sets v->instrlen.
 */
void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   /* -1 means "nothing used yet" for the high-water marks: */
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;

   info->early_preamble = v->early_preamble;

   bool in_preamble = false;
   bool has_eq = false;

   foreach_block (block, &shader->block_list) {
      /* remaining soft delay since the last (ss)/(sy) producer, reset at the
       * start of every block:
       */
      int sfu_delay = 0, mem_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
            unsigned components = instr->srcs[2]->uim_val;

            /* This covers any multi-component access that could straddle
             * across multiple double-words.
             */
            if (components > 1)
               info->multi_dword_ldp_stp = true;

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
             (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         if ((instr->opc == OPC_NOP) && (instr->flags & IR3_INSTR_EQ)) {
            info->last_helper = info->instrs_count;
            has_eq = true;
         }

         if (v->type == MESA_SHADER_FRAGMENT && v->need_pixlod &&
             instr->opc == OPC_END && !v->prefetch_end_of_quad && !has_eq)
            info->last_helper = info->instrs_count;

         if (instr->opc == OPC_SHPS)
            in_preamble = true;

         /* Don't count instructions in the preamble for instruction-count type
          * stats, because their effect should be much smaller.
          * TODO: we should probably have separate stats for preamble
          * instructions, but that would blow up the amount of stats...
          */
         if (!in_preamble) {
            unsigned instrs_count = 1 + instr->repeat + instr->nop;
            unsigned nops_count = instr->nop;

            if (instr->opc == OPC_NOP) {
               nops_count = 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            } else if (!is_meta(instr)) {
               info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            }

            /* same src and dst type is a plain mov, otherwise a conversion: */
            if (instr->opc == OPC_MOV) {
               if (instr->cat1.src_type == instr->cat1.dst_type) {
                  info->mov_count += 1 + instr->repeat;
               } else {
                  info->cov_count += 1 + instr->repeat;
               }
            }

            info->instrs_count += instrs_count;
            info->nops_count += nops_count;

            if (instr->flags & IR3_INSTR_SS) {
               info->ss++;
               info->sstall += sfu_delay;
               sfu_delay = 0;
            }

            if (instr->flags & IR3_INSTR_SY) {
               info->sy++;
               info->systall += mem_delay;
               mem_delay = 0;
            }

            if (is_ss_producer(instr)) {
               sfu_delay = soft_ss_delay(instr);
            } else {
               int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
               sfu_delay -= n;
            }

            if (is_sy_producer(instr)) {
               mem_delay = soft_sy_delay(instr, shader);
            } else {
               int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
               mem_delay -= n;
            }
         }

         if (instr->opc == OPC_SHPE)
            in_preamble = false;
      }
   }

   /* for vertex shader, the inputs are loaded into registers before the shader
    * is executed, so max_regs from the shader instructions might not properly
    * reflect the # of registers actually used, especially in case passthrough
    * varyings.
    *
    * Likewise, for fragment shader, we can have some regs which are passed
    * input values but never touched by the resulting shader (ie. as result
    * of dead code elimination or simply because we don't know how to turn
    * the reg off).
    */
   for (unsigned i = 0; i < v->inputs_count; i++) {
      /* skip frag inputs fetch via bary.f since their reg's are
       * not written by gpu before shader starts (and in fact the
       * regid's might not even be valid)
       */
      if (v->inputs[i].bary)
         continue;

      /* ignore high regs that are global to all threads in a warp
       * (they exist by default) (a5xx+)
       */
      if (v->inputs[i].regid >= regid(48, 0))
         continue;

      if (v->inputs[i].compmask) {
         unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
         int32_t regid = v->inputs[i].regid + n;
         if (v->inputs[i].half) {
            if (!v->mergedregs) {
               v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
            } else {
               v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
            }
         } else {
            v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
         }
      }
   }

   /* Destinations of sampler prefetches count towards register usage too: */
   for (unsigned i = 0; i < v->num_sampler_prefetch; i++) {
      unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
      int32_t regid = v->sampler_prefetch[i].dst + n;
      if (v->sampler_prefetch[i].half_precision) {
         if (!v->mergedregs) {
            v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
         } else {
            v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
         }
      } else {
         v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);

   /* TODO this is different for earlier gens, but earlier gens don't use this */
   info->subgroup_size = v->info.double_threadsize ? 128 : 64;

   /* The final wave limit is the tighter of the two independent bounds: */
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->compiler->max_waves);
}

/* Allocate a register in the shader's context with the given number and
 * flags.  The write mask defaults to a single component.
 */
static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *new_reg = ir3_alloc(shader, sizeof(*new_reg));

   new_reg->num = num;
   new_reg->flags = flags;
   new_reg->wrmask = 1;

   return new_reg;
}

/* Assign the next serial number to instr and link it into the instruction
 * list at the position described by cursor.  Input instructions are
 * additionally recorded in the shader's baryfs array.
 */
static void
insert_instr(struct ir3_cursor cursor, struct ir3_instruction *instr)
{
   struct ir3 *ir = instr->block->shader;
   struct list_head *link = &instr->node;

   ir->instr_count++;
   instr->serialno = ir->instr_count;

   switch (cursor.option) {
   case IR3_CURSOR_BEFORE_INSTR:
      list_addtail(link, &cursor.instr->node);
      break;
   case IR3_CURSOR_AFTER_INSTR:
      list_add(link, &cursor.instr->node);
      break;
   case IR3_CURSOR_BEFORE_BLOCK:
      list_add(link, &cursor.block->instr_list);
      break;
   case IR3_CURSOR_AFTER_BLOCK:
      list_addtail(link, &cursor.block->instr_list);
      break;
   }

   if (is_input(instr)) {
      array_insert(ir, ir->baryfs, instr);
   }
}

/* Allocate a new, empty block owned by shader.  The block is not linked
 * into the shader's block list here; its list node and instruction list are
 * only initialized.
 */
struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *blk = ir3_alloc(shader, sizeof(struct ir3_block));

   blk->shader = shader;
#if MESA_DEBUG
   blk->serialno = ++shader->block_count;
#endif
   list_inithead(&blk->instr_list);
   list_inithead(&blk->node);

   return blk;
}

/* Return the last instruction in block, or NULL when the block is empty. */
static struct ir3_instruction *
block_get_last_instruction(struct ir3_block *block)
{
   struct list_head *instrs = &block->instr_list;

   return list_is_empty(instrs)
             ? NULL
             : list_last_entry(instrs, struct ir3_instruction, node);
}

/* Return the block's terminator, i.e. its last instruction if that one is a
 * terminator, and NULL otherwise (including for an empty block).
 */
struct ir3_instruction *
ir3_block_get_terminator(struct ir3_block *block)
{
   struct ir3_instruction *tail = block_get_last_instruction(block);

   return (tail && is_terminator(tail)) ? tail : NULL;
}

/* Unlink the block's terminator (if any) from the instruction list and
 * hand it back to the caller.  Returns NULL when the block has none.
 */
struct ir3_instruction *
ir3_block_take_terminator(struct ir3_block *block)
{
   struct ir3_instruction *term = ir3_block_get_terminator(block);

   if (!term)
      return NULL;

   list_delinit(&term->node);
   return term;
}

/* Return the last instruction of block that is not the terminator, or NULL
 * if the block is empty or consists of just a terminator.
 */
struct ir3_instruction *
ir3_block_get_last_non_terminator(struct ir3_block *block)
{
   struct ir3_instruction *tail = block_get_last_instruction(block);

   /* Empty block (NULL), or the tail already is what we are looking for: */
   if (!tail || !is_terminator(tail))
      return tail;

   /* Otherwise it's whatever precedes the terminator, unless that is the
    * list head itself.
    */
   struct list_head *before = tail->node.prev;
   if (before == &block->instr_list)
      return NULL;

   return list_entry(before, struct ir3_instruction, node);
}

struct ir3_instruction *
ir3_block_get_last_phi(struct ir3_block *block)
{
   struct ir3_instruction *last_phi = NULL;

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc != OPC_META_PHI)
         break;

      last_phi = instr;
   }

   return last_phi;
}

/* Record pred in block's predecessors array. */
void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

/* Record a physical edge pred -> succ in both directions: succ is added to
 * pred's physical_successors and pred to succ's physical_predecessors.
 */
void
ir3_block_link_physical(struct ir3_block *pred,
                        struct ir3_block *succ)
{
   array_insert(pred, pred->physical_successors, succ);
   array_insert(succ, succ->physical_predecessors, pred);
}

/* Remove the first occurrence of pred from block's predecessors.  The hole
 * is filled with the last entry, so the order of the remaining predecessors
 * is not preserved.  Does nothing if pred is not a predecessor.
 */
void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned idx = 0; idx < block->predecessors_count; idx++) {
      if (block->predecessors[idx] != pred)
         continue;

      unsigned last = block->predecessors_count - 1;
      if (idx != last)
         block->predecessors[idx] = block->predecessors[last];

      block->predecessors_count = last;
      return;
   }
}

/* Return the index of pred in block's predecessors array.  pred must be a
 * predecessor of block; anything else is a caller bug.
 */
unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   unsigned idx = 0;

   while (idx < block->predecessors_count) {
      if (block->predecessors[idx] == pred)
         return idx;
      idx++;
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

/* Allocate a zeroed instruction with room for ndst destinations and nsrc
 * sources.  The instruction is neither given an opcode nor linked into a
 * block here.
 */
static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr;

   /* Add extra sources for array destinations and the address reg */
   if (opc_cat(opc) >= 1)
      nsrc += 2;

   /* The dst and src pointer arrays share a single allocation with the
    * instruction and are laid out directly behind it.
    */
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));

   instr = ir3_alloc(block->shader, sz);
   instr->dsts = (struct ir3_register **)(instr + 1);
   instr->srcs = &instr->dsts[ndst];

#if MESA_DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   list_inithead(&instr->rpt_node);

   return instr;
}

/* Register instr as a user of the address register it reads: a0_users for
 * component 0 of REG_A0, a1_users for component 1.
 */
static void
add_to_address_users(struct ir3_instruction *instr)
{
   assert(instr->address != NULL);

   struct ir3_register *a_reg = instr->address->def;
   struct ir3 *ir = instr->block->shader;

   assert(reg_num(a_reg) == REG_A0);

   unsigned comp = reg_comp(a_reg);
   if (comp != 0) {
      assert(comp == 1);
      array_insert(ir, ir->a1_users, instr);
   } else {
      array_insert(ir, ir->a0_users, instr);
   }
}

/* Resolve the block a cursor points into: either the block itself for
 * block-relative cursors, or the block of the referenced instruction.
 */
static struct ir3_block *
get_block(struct ir3_cursor cursor)
{
   if (cursor.option == IR3_CURSOR_BEFORE_BLOCK ||
       cursor.option == IR3_CURSOR_AFTER_BLOCK)
      return cursor.block;

   if (cursor.option == IR3_CURSOR_BEFORE_INSTR ||
       cursor.option == IR3_CURSOR_AFTER_INSTR)
      return cursor.instr->block;

   unreachable("illegal cursor option");
}

/* Create an instruction with the given opcode and dst/src capacity and
 * insert it at the position described by cursor.
 */
struct ir3_instruction *
ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc, int ndst, int nsrc)
{
   struct ir3_block *target = get_block(cursor);
   struct ir3_instruction *new_instr = instr_create(target, opc, ndst, nsrc);

   new_instr->opc = opc;
   new_instr->block = target;
   insert_instr(cursor, new_instr);

   return new_instr;
}

/* Create an instruction at the builder's cursor, then move the cursor to
 * just after the new instruction.
 */
struct ir3_instruction *
ir3_build_instr(struct ir3_builder *builder, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *built =
      ir3_instr_create_at(builder->cursor, opc, ndst, nsrc);

   /* Advance the cursor past what was just emitted. */
   builder->cursor = ir3_after_instr(built);

   return built;
}

/* Create an instruction in block, positioned at the cursor returned by
 * ir3_before_terminator(block).
 */
struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   return ir3_instr_create_at(ir3_before_terminator(block), opc, ndst, nsrc);
}

/* Create an instruction in block, positioned at the cursor returned by
 * ir3_after_block(block).
 */
struct ir3_instruction *
ir3_instr_create_at_end(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   return ir3_instr_create_at(ir3_after_block(block), opc, ndst, nsrc);
}

/* Create a copy of instr in the same block, inserted at the cursor returned
 * by ir3_before_terminator().  All dst and src registers are duplicated; the
 * clone does not join the original's repeat group.
 *
 * Returns the new instruction.
 */
struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   /* The struct copy below clobbers every field, so stash the freshly
    * allocated dst/src arrays and put them back afterwards.
    *
    * NOTE(review): in MESA_DEBUG builds the copy also replaces the
    * dsts_max/srcs_max values set by instr_create() with those of the
    * original instruction - confirm whether these should be restored too.
    */
   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;
   list_inithead(&new_instr->rpt_node);

   insert_instr(ir3_before_terminator(instr->block), new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      /* a dst that points back at its instruction must point at the clone */
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   if (instr->address) {
      assert(instr->srcs_count > 0);
      /* the clone's address is taken to be its last source */
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
      add_to_address_users(new_instr);
   }

   return new_instr;
}

/* Add a false dependency to instruction, to ensure it is scheduled first.
 * Adding the same dependency twice is a no-op.
 */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   bool already_present = false;

   for (unsigned idx = 0; idx < instr->deps_count; idx++) {
      if (instr->deps[idx] == dep) {
         already_present = true;
         break;
      }
   }

   if (!already_present) {
      array_insert(instr, instr->deps, dep);
   }
}

/* Unlink instr both from the instruction list it is on (via node) and from
 * its repeat group (via rpt_node).
 */
void
ir3_instr_remove(struct ir3_instruction *instr)
{
   list_delinit(&instr->node);
   list_delinit(&instr->rpt_node);
}

/* Link the n instructions in instrs into one repeat group, with instrs[0]
 * as the first member.  None of them may already be part of a group, and
 * their serial numbers must be strictly increasing.
 */
void
ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n)
{
   assert(n > 0 && !ir3_instr_is_rpt(instrs[0]));

   unsigned idx = 1;
   while (idx < n) {
      struct ir3_instruction *member = instrs[idx];

      assert(!ir3_instr_is_rpt(member));
      assert(member->serialno > instrs[idx - 1]->serialno);

      list_addtail(&member->rpt_node, &instrs[0]->rpt_node);
      ++idx;
   }
}

/* An instruction is part of a repeat group when its rpt_node is linked to
 * other nodes, i.e. the list it forms is not empty.
 */
bool
ir3_instr_is_rpt(const struct ir3_instruction *instr)
{
   return !list_is_empty(&instr->rpt_node);
}

/* Is instr the first member of its repeat group?  The group list is
 * circular, so the first member is the one whose predecessor has a higher
 * serial number than itself.  Returns false for instructions that are not
 * in a group at all.
 */
bool
ir3_instr_is_first_rpt(const struct ir3_instruction *instr)
{
   if (ir3_instr_is_rpt(instr)) {
      struct ir3_instruction *before =
         list_entry(instr->rpt_node.prev, struct ir3_instruction, rpt_node);

      return before->serialno > instr->serialno;
   }

   return false;
}

/* Return the member preceding instr in its repeat group, or NULL if instr
 * is the first one.  instr must be part of a repeat group.
 */
struct ir3_instruction *
ir3_instr_prev_rpt(const struct ir3_instruction *instr)
{
   assert(ir3_instr_is_rpt(instr));

   return ir3_instr_is_first_rpt(instr)
             ? NULL
             : list_entry(instr->rpt_node.prev, struct ir3_instruction,
                          rpt_node);
}

/* Walk backwards through instr's repeat group and return its first member.
 * instr must be part of a repeat group.
 */
struct ir3_instruction *
ir3_instr_first_rpt(struct ir3_instruction *instr)
{
   assert(ir3_instr_is_rpt(instr));

   for (;;) {
      if (ir3_instr_is_first_rpt(instr))
         break;

      instr = ir3_instr_prev_rpt(instr);
      assert(instr);
   }

   return instr;
}

/* Number of instructions in the repeat group headed by instr: the other
 * members linked through rpt_node, plus instr itself.  instr must be the
 * first member of its group.
 */
unsigned
ir3_instr_rpt_length(const struct ir3_instruction *instr)
{
   assert(ir3_instr_is_first_rpt(instr));

   return list_length(&instr->rpt_node) + 1;
}

/* Create a new register and append it to instr's sources.  In debug builds
 * this asserts that the source array still has a free slot.
 */
struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
#if MESA_DEBUG
   assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *src = reg_create(instr->block->shader, num, flags);

   instr->srcs[instr->srcs_count] = src;
   instr->srcs_count++;

   return src;
}

/* Create a new register and append it to instr's destinations.  In debug
 * builds this asserts that the destination array still has a free slot.
 */
struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
#if MESA_DEBUG
   assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *dst = reg_create(instr->block->shader, num, flags);

   instr->dsts[instr->dsts_count] = dst;
   instr->dsts_count++;

   return dst;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

/* For an array register reg of instr, append an extra source to instr that
 * is a copy of reg with its def pointing at last_write, and tie the two
 * registers together.
 */
void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);

   struct ir3_register *tied_src = ir3_src_create(instr, 0, 0);

   *tied_src = *reg;
   tied_src->def = last_write;

   ir3_reg_tie(reg, tied_src);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      add_to_address_users(instr);
   } else {
      assert(instr->address->def->instr == addr);
   }
}

/* Does this instruction use the scalar ALU?
 */
bool
is_scalar_alu(struct ir3_instruction *instr,
              const struct ir3_compiler *compiler)
{
   /* MOVMSK seems to always need (ss) even with other scalar ALU instructions
    */
   switch (instr->opc) {
   case OPC_MOVMSK:
   case OPC_SCAN_CLUSTERS_MACRO:
   case OPC_SCAN_MACRO:
      return false;
   default:
      break;
   }

   /* Only ALU instructions writing a shared register qualify at all. */
   if (!is_alu(instr) || !(instr->dsts[0]->flags & IR3_REG_SHARED))
      return false;

   /* scalar->scalar mov instructions (but NOT cov) were supported before the
    * scalar ALU was supported, but they still required (ss) whereas on GPUs
    * that have a scalar ALU they are executed on it and do not require (ss).
    * We have to be careful to return false for these if scalar ALU isn't
    * supported, so that we treat them like vector->scalar mov instructions
    * (such as requiring (ss)).
    */
   if (!compiler->has_scalar_alu)
      return false;

   if (instr->opc != OPC_MOV && !is_subgroup_cond_mov_macro(instr))
      return true;

   /* moves from normal to shared seem to use a separate ALU as before and
    * require a (ss) on dependent instructions.
    */
   return !!(instr->srcs[0]->flags &
             (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST));
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

unsigned
ir3_count_instructions_sched(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         if (!is_terminator(instr))
            instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ip's are assumed to be actual ip's of the final
 * program, so it would be incorrect to use this everywhere.
 */

unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

/* Rebuild the per-instruction `uses` sets: for every SSA source accepted by
 * filter, the consuming instruction is added to the set of the instruction
 * producing that source.  Sets are created lazily out of mem_ctx.
 */
void
ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter)
{
   /* We could do this in a single pass if we can assume instructions
    * are always sorted.  Which currently might not always be true.
    * (In particular after ir3_group pass, but maybe other places.)
    */
   foreach_block (blk, &ir->block_list) {
      foreach_instr (cur, &blk->instr_list) {
         cur->uses = NULL;
      }
   }

   foreach_block (blk, &ir->block_list) {
      foreach_instr (user, &blk->instr_list) {
         foreach_ssa_src_n (producer, src_n, user) {
            if (filter(user, src_n)) {
               if (!producer->uses)
                  producer->uses = _mesa_pointer_set_create(mem_ctx);
               _mesa_set_add(producer->uses, user);
            }
         }
      }
   }
}

/* use_filter_cb that only accepts sources which are not false
 * dependencies.
 */
static bool
no_false_deps(struct ir3_instruction *instr, unsigned src_n)
{
   return !__is_false_dep(instr, src_n);
}

/* use_filter_cb that accepts every source. */
static bool
any_src(struct ir3_instruction *instr, unsigned src_n)
{
   return true;
}

/* Rebuild the SSA `uses` sets for the whole shader.
 *
 * ir:        shader to process
 * mem_ctx:   context the use sets are allocated from
 * falsedeps: when true every source counts as a use, otherwise sources that
 *            are false dependencies are left out
 */
void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* Plain call instead of `return f();`: a return statement with an
    * expression is not allowed in a function returning void.
    */
   ir3_find_ssa_uses_for(ir, mem_ctx, falsedeps ? any_src : no_false_deps);
}

/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types, derived from whether the first
 * source is a half register.  Other than cov's that are folded, an
 * instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   /* nothing to derive the type from */
   if (instr->srcs_count == 0)
      return;

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      instr->cat1.src_type = (instr->srcs[0]->flags & IR3_REG_HALF)
                                ? half_type(instr->cat1.src_type)
                                : full_type(instr->cat1.src_type);
      break;
   case 3:
      instr->opc = (instr->srcs[0]->flags & IR3_REG_HALF)
                      ? cat3_half_opc(instr->opc)
                      : cat3_full_opc(instr->opc);
      break;
   default:
      break;
   }
}

/**
 * Look up a floating point immed in the FLUT (float lookup table).
 * Returns the table index on a match, or a negative value for immediates
 * that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
         { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
         { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
         { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
         { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
         { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
         { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
         { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
         { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
         { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
         { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
         { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
         { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
   };

   /* Note that half-float immeds are already lowered to 16b in nir, so for
    * half registers the immed is matched against the f16 column.
    */
   const bool is_half = !!(src_reg->flags & IR3_REG_HALF);
   const uint32_t imm = src_reg->uim_val;

   for (unsigned idx = 0; idx < ARRAY_SIZE(flut); idx++) {
      const uint32_t entry = is_half ? flut[idx].f16 : flut[idx].f32;

      if (entry == imm)
         return idx;
   }

   return -1;
}

/* Strip a register flag word down to the bits that ir3_valid_flags() looks
 * at; every other bit is cleared.
 */
static unsigned
cp_flags(unsigned flags)
{
   /* only these flags are considered (at least for now): */
   const unsigned considered =
      IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SHARED |
      IR3_REG_FNEG | IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS |
      IR3_REG_BNOT;

   return flags & considered;
}

/**
 * Check whether source number n of instr may carry the given register flags.
 *
 * Only the flag bits kept by cp_flags() are considered; any other bits in
 * flags are ignored. The rules depend on the instruction category
 * (opc_cat()) and, for several categories, on the specific opcode, on the
 * flags of the first destination, and on the flags of the other sources.
 *
 * Returns true if the flags are accepted for that source, false otherwise.
 *
 * NOTE(review): several branches read instr->dsts[0] without checking
 * dsts_count; presumably every instruction that reaches them has a
 * destination -- confirm against callers.
 */
bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   flags = cp_flags(flags);

   /* If destination is indirect, then source cannot be.. at least
    * I don't think so..
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         /* The instruction defining the address must be in instr's block. */
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      /* Except for immed/const sources, source and dest shared-ness must match.
       */
      if (!(flags & (IR3_REG_IMMED | IR3_REG_CONST)) &&
          (flags & IR3_REG_SHARED) != (instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   /* Everything below is per instruction category. */
   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      case OPC_SCAN_MACRO:
         /* src 0 must have no flags, every other src must be exactly shared */
         if (n == 0)
            return flags == 0;
         else
            return flags == IR3_REG_SHARED;
         break;
      case OPC_SCAN_CLUSTERS_MACRO:
         /* the reverse: src 0 must be exactly shared, the rest plain */
         if (n == 0)
            return flags == IR3_REG_SHARED;
         else
            return flags == 0;
         break;
      default: {
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;

         /* floating-point conversions when moving from non-shared to shared
          * seem not to work. We only use floating-point types in ir3 for
          * conversions, so don't bother specially handling the case where the
          * types are equal. Same goes for 8-bit sign extension.
          */
         if ((instr->dsts[0]->flags & IR3_REG_SHARED) &&
             !(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)) &&
             ((full_type(instr->cat1.src_type) == TYPE_F32 ||
               full_type(instr->cat1.dst_type) == TYPE_F32) ||
              (instr->cat1.src_type == TYPE_U8 &&
               full_type(instr->cat1.dst_type) == TYPE_S32)))
            return false;

         /* Conversions seem not to work in shared->shared copies before scalar
          * ALU is supported.
          */
         if (!compiler->has_scalar_alu &&
             (flags & IR3_REG_SHARED) &&
             (instr->dsts[0]->flags & IR3_REG_SHARED) &&
             instr->cat1.src_type != instr->cat1.dst_type)
            return false;
      }
      }
      /* Reached only by the opcodes above that set valid_flags. */
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      /* Allow an immediate src1 for flat.b, since it's ignored */
      if (instr->opc == OPC_FLAT_B &&
          n == 1 && flags == IR3_REG_IMMED)
         return true;

      /* cat2/cat3 scalar ALU instructions must not have regular sources. */
      if (instr->dsts[0]->flags & IR3_REG_SHARED) {
         if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
            return false;
      }

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         /* m is the index of the other source of the src0/src1 pair */
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if (instr->dsts[0]->flags & IR3_REG_SHARED) {
               /* shared dest: only const in both srcs is rejected */
               if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
                  return false;
            } else {
               if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                   (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
                  return false;
            }
            /* an immediate in both srcs is rejected either way */
            if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      switch (instr->opc) {
      case OPC_SHRM:
      case OPC_SHLM:
      case OPC_SHRG:
      case OPC_SHLG:
      case OPC_ANDG: {
         valid_flags |= IR3_REG_IMMED;
         /* Can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
         break;
      }
      case OPC_WMM:
      case OPC_WMM_ACCU: {
         /* Replaces (not extends) the default mask: srcs other than src 2
          * may only be shared, src 2 may only be const.
          */
         valid_flags = IR3_REG_SHARED;
         if (n == 2)
            valid_flags = IR3_REG_CONST;
         break;
      }
      case OPC_DP2ACC:
      case OPC_DP4ACC:
         /* no const sources for these */
         break;
      default:
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_RELATIV) ||
          (!(instr->dsts[0]->flags & IR3_REG_SHARED) &&
           (flags & IR3_REG_SHARED))) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      /* Same scalar ALU restriction as for cat2 above. */
      if (instr->dsts[0]->flags & IR3_REG_SHARED) {
         if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
            return false;
      }

      break;
   case 4:
      /* source and dest shared-ness must match */
      if ((instr->dsts[0]->flags & IR3_REG_SHARED) != (flags & IR3_REG_SHARED))
         return false;
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* For isam with IR3_INSTR_V set, one specific source (src 2 with
       * S2EN, src 1 without) must be exactly an immediate.
       */
      if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
         if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
             (!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
            return flags == IR3_REG_IMMED;
         }
      }
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;

      if (instr->opc == OPC_STC && n == 1)
         valid_flags |= IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      /* The rest of this case restricts which source slots of which
       * opcodes may be an immediate.
       */
      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and not
          * really sure any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_global_a3xx_atomic(instr->opc) && (n != 0))
            return false;

         if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
             is_bindless_atomic(instr->opc))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         if (instr->opc == OPC_LDG && (n == 0))
            return false;

         if (instr->opc == OPC_LDG_A && (n < 2))
            return false;

         if (instr->opc == OPC_STC && n != 0)
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
            if (n != 0 && n != 2)
               return false;
            break;
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   /* Categories not listed above, and anything that passed its checks. */
   return true;
}

/**
 * Check whether the value immed may be used as an immediate source of instr.
 *
 *  - mov and meta instructions accept any value.
 *  - For memory instructions, the opcodes listed in the switch below always
 *    return true (the frontend is responsible for range-checking them);
 *    every other memory opcode accepts only values in [0, 0xff].
 *  - All remaining instructions accept values in [-0x1ff, 0x1ff].
 *
 * Returns true if the value is accepted, false otherwise.
 */
bool
ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
{
   if (instr->opc == OPC_MOV || is_meta(instr))
      return true;

   if (is_mem(instr)) {
      switch (instr->opc) {
      /* Some load/store instructions have a 13-bit offset and size which must
       * always be an immediate and the rest of the sources cannot be
       * immediates, so the frontend is responsible for checking the size:
       */
      case OPC_LDL:
      case OPC_STL:
      case OPC_LDP:
      case OPC_STP:
      case OPC_LDG:
      case OPC_STG:
      case OPC_SPILL_MACRO:
      case OPC_RELOAD_MACRO:
      case OPC_LDG_A:
      case OPC_STG_A:
      case OPC_LDLW:
      case OPC_STLW:
      case OPC_LDLV:
         return true;
      default:
         /* most cat6 src immediates can only encode 8 bits: */
         return !(immed & ~0xff);
      }
   }

   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended.
    *
    * This is a plain range check rather than a bit test on -immed: negating
    * INT32_MIN (bit pattern 0x80000000) overflows, which is undefined
    * behaviour for signed integers. The accepted range is the same as the
    * bit test gives for every other value.
    */
   return immed >= -0x1ff && immed <= 0x1ff;
}

struct ir3_instruction *
ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr)
{
   /* If instr is a negation (likely as a result of an nir_b2n), we can ignore
    * that and use its source, since the nonzero-ness stays the same.
    */
   if (instr->opc == OPC_ABSNEG_S && instr->flags == 0 &&
       (instr->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
          IR3_REG_SNEG) {
      return instr->srcs[0]->def->instr;
   }

   return instr;
}

/* Report whether opcode opc is accepted for (rptN) repeat, decided by
 * instruction category:
 *
 *  - cat0: nop only
 *  - cat1: mov, swz and movmsk only
 *  - cat2: everything, except bary.f when the compiler lacks has_rpt_bary_f
 *  - cat3: everything except dp2acc and dp4acc
 *  - cat4: everything except rcp
 *  - any other category: nothing
 */
bool
ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc)
{
   bool supported = false;

   switch (opc_cat(opc)) {
   case 0:
      supported = (opc == OPC_NOP);
      break;
   case 1:
      supported = (opc == OPC_MOV) || (opc == OPC_SWZ) || (opc == OPC_MOVMSK);
      break;
   case 2:
      /* bary.f depends on a per-compiler capability */
      supported = (opc != OPC_BARY_F) || compiler->has_rpt_bary_f;
      break;
   case 3:
      supported = !(opc == OPC_DP2ACC || opc == OPC_DP4ACC);
      break;
   case 4:
      supported = (opc != OPC_RCP);
      break;
   default:
      break;
   }

   return supported;
}