/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#ifndef TU_AUTOTUNE_H
#define TU_AUTOTUNE_H

#include "tu_common.h"

#include "util/hash_table.h"
#include "util/rwlock.h"

#include "tu_suballoc.h"

struct tu_renderpass_history;

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimable without some additional information:
 *
 *  (1) If you know you are touching every pixel (i.e. there is a clear),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem [1].
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      if a sysmem->GMEM restore pass is needed.
 *
 *  (3) A high draw count is an indication that enough pixels will be
 *      accessed multiple times to benefit from the reduced memory bandwidth
 *      that GMEM brings.
 *
 *  (4) But a high draw count without much overdraw can actually be faster
 *      in bypass mode if it pushes a lot of state change, due to not having
 *      to go through the state changes per-tile [1].
 *
 * The approach taken is to measure the samples-passed for the batch to
 * estimate the amount of overdraw, and detect cases where the number of
 * pixels touched is low.
 *
 * [1] Ignoring early-tile-exit optimizations, but any draw that touches
 *     all/most of the tiles late in the tile pass can defeat those.
 */
struct tu_autotune {

   /* We may have to disable the autotuner if there are too many
    * renderpasses in flight.
    */
   bool enabled;

   struct tu_device *device;

   /**
    * Cache mapping a renderpass key to historical information about
    * rendering to that particular render target.
    */
   struct hash_table *ht;
   struct u_rwlock ht_lock;

   /**
    * List of per-renderpass results that we are waiting for the GPU
    * to finish with before reading back the results.
    */
   struct list_head pending_results;

   /**
    * List of per-submission data that we may want to free after we
    * have processed the submission results.
    * This can happen after the command buffers that were in the
    * submission are destroyed.
    */
   struct list_head pending_submission_data;

   /**
    * List of per-submission data that has finished processing and can be
    * reused.
    */
   struct list_head submission_data_pool;

   uint32_t fence_counter;
   uint32_t idx_counter;
};

/**
 * The captured samples-passed values, recorded from the cmdstream at the
 * start and end of the batch.
 *
 * Note that we do the math on the CPU to avoid a WFI. But pre-emption
 * may force us to revisit that.
 */
struct PACKED tu_renderpass_samples {
   uint64_t samples_start;
   /* HW requires the sample start/stop locations to be 128-bit aligned. */
   uint64_t __pad0;
   uint64_t samples_end;
   uint64_t __pad1;
};

/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);
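
/*
 * For illustration: given the layout above, the CPU-side math mentioned in
 * the comment reduces to a single subtraction once the GPU has written both
 * counters. This helper is a sketch to make that concrete; its name is
 * hypothetical, and the driver's actual readback path lives in the autotune
 * implementation, not here.
 */
static inline uint64_t
tu_renderpass_samples_delta(const struct tu_renderpass_samples *samples)
{
   /* samples-passed accumulated across the batch */
   return samples->samples_end - samples->samples_start;
}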
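
/*
 * Also for illustration: a minimal sketch of the kind of decision described
 * by the tradeoff list at the top of this file. The real policy keys off
 * tu_renderpass_history and additional state in the autotune implementation;
 * the helper name and the samples-per-draw threshold below are hypothetical,
 * not the driver's actual tuning values.
 */
static inline bool
tu_autotune_example_prefer_bypass(uint64_t avg_samples_passed,
                                  uint32_t drawcall_count)
{
   /* Hypothetical threshold: few samples passed per draw implies little
    * overdraw, so GMEM's bandwidth savings are unlikely to amortize the
    * per-tile state replay (see (4) above).
    */
   const uint64_t max_samples_per_draw = 1024;

   return avg_samples_passed < max_samples_per_draw * drawcall_count;
}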

/**
 * Tracks the results from an individual renderpass. Initially created
 * per renderpass, and appended to the tail of at->pending_results. At a
 * later time, when the GPU has finished writing the results, we fill in
 * samples_passed.
 */
struct tu_renderpass_result {
   /* Points into GPU memory */
   struct tu_renderpass_samples *samples;

   struct tu_suballoc_bo bo;

   /*
    * Below here, only used internally within autotune
    */
   uint64_t rp_key;
   struct tu_renderpass_history *history;
   struct list_head node;
   uint32_t fence;
   uint64_t samples_passed;
};

VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);

/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 */
bool tu_autotune_use_bypass(struct tu_autotune *at,
                            struct tu_cmd_buffer *cmd_buffer,
                            struct tu_renderpass_result **autotune_result);
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);

bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                       uint32_t cmd_buffer_count);

struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
                                    struct tu_autotune *at,
                                    struct tu_cmd_buffer **cmd_buffers,
                                    uint32_t cmd_buffer_count);

struct tu_autotune_results_buffer;

template <chip CHIP>
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                  struct tu_cs *cs,
                                  struct tu_renderpass_result *autotune_result);

template <chip CHIP>
void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result);

#endif /* TU_AUTOTUNE_H */