/*
 * Copyright 2020-2022 Matias N. Goldberg
 * Copyright 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#version 310 es

#if defined(GL_ES) && GL_ES == 1
	// Desktop GLSL allows the const keyword for either compile-time or
	// run-time constants. GLSL ES only allows the keyword for compile-time
	// constants. Since we use const on run-time constants, define it to
	// nothing.
	#define const
#endif

// Makes prior shared-memory writes visible, then synchronizes the workgroup.
// The macro already ends in ';', so the ';' at each use site is an empty statement.
#define __sharedOnlyBarrier memoryBarrierShared();barrier();

%s // include "CrossPlatformSettings_piece_all.glsl"

// One (min, max) pair per invocation of the 4x4x4 workgroup.
shared float2 g_minMaxValues[4u * 4u * 4u];
// One 64-bit index mask (as two 32-bit halves) per block; 4x4 blocks per workgroup.
shared uint2 g_mask[4u * 4u];

// params.x: source channel selector (0 -> x, 1 -> y, anything else -> w).
// params.y: non-zero packs the endpoints with packSnorm4x8, zero with packUnorm4x8.
layout( location = 0 ) uniform uint2 params;

#define p_channelIdx params.x
#define p_useSNorm params.y

// GLSL ES gives sampler2D a default precision of lowp, and texture lookups
// return values at the sampler's precision. Request highp explicitly so the
// fetched texels keep full precision before they are scaled by 255.
uniform highp sampler2D srcTex;

// Each texel receives one finished 64-bit block as four 16-bit words.
layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

// x: thread index inside a block (one pixel row each); y/z: which block.
layout( local_size_x = 4,  //
		local_size_y = 4,  //
		local_size_z = 4 ) in;

/// Encodes one 4x4 pixel block of a single channel into a 64-bit block made of
/// two endpoint bytes followed by sixteen 3-bit indices (the DXT5 alpha layout
/// referenced below; presumably used for BC4/RGTC output, confirm with caller).
///
/// Each block is 16 pixels
/// Each thread works on 4 pixels
/// Therefore each block needs 4 threads, generating 8 masks
/// At the end these 8 masks get merged into 2 and results written to output
///
/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?**
///
/// A: It's a sweet spot.
///  - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
///  - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
///    overhead, and also more LDS usage which reduces occupancy.
///  - Long threads (e.g. 1 thread per block) miss parallelism opportunities
void main()
{
	float minVal, maxVal;
	float4 srcPixel;

	const uint blockThreadId = gl_LocalInvocationID.x;

	// Top-left pixel of this block: block coordinates times 4.
	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u;

	// Load this thread's row of the block (row = blockThreadId, column = i).
	for( uint i = 0u; i < 4u; ++i )
	{
		const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i, blockThreadId );

		const float4 value = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyzw;
		srcPixel[i] = p_channelIdx == 0u ? value.x : ( p_channelIdx == 1u ? value.y : value.w );
		srcPixel[i] *= 255.0f;
	}

	// Per-thread min/max over its 4 pixels.
	minVal = min3( srcPixel.x, srcPixel.y, srcPixel.z );
	maxVal = max3( srcPixel.x, srcPixel.y, srcPixel.z );
	minVal = min( minVal, srcPixel.w );
	maxVal = max( maxVal, srcPixel.w );

	// Shared-memory slots for this block: 4 min/max entries and 1 mask entry.
	const uint minMaxIdxBase = ( gl_LocalInvocationID.z << 4u ) + ( gl_LocalInvocationID.y << 2u );
	const uint maskIdxBase = ( gl_LocalInvocationID.z << 2u ) + gl_LocalInvocationID.y;

	g_minMaxValues[minMaxIdxBase + blockThreadId] = float2( minVal, maxVal );
	// All 4 threads of the block store the same zero value here.
	g_mask[maskIdxBase] = uint2( 0u, 0u );

	__sharedOnlyBarrier;

	// Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
	for( uint i = 0u; i < 4u; ++i )
	{
		minVal = min( g_minMaxValues[minMaxIdxBase + i].x, minVal );
		maxVal = max( g_minMaxValues[minMaxIdxBase + i].y, maxVal );
	}

	// determine bias and emit color indices
	// given the choice of maxVal/minVal, these indices are optimal:
	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
	float dist = maxVal - minVal;
	float dist4 = dist * 4.0f;
	float dist2 = dist * 2.0f;
	float bias = ( dist < 8.0f ) ? ( dist - 1.0f ) : ( trunc( dist * 0.5f ) + 2.0f );
	bias -= minVal * 7.0f;

	// Partial halves of the 64-bit block word contributed by this thread.
	uint mask0 = 0u, mask1 = 0u;

	for( uint i = 0u; i < 4u; ++i )
	{
		float a = srcPixel[i] * 7.0f + bias;

		int ind = 0;

		// select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
		if( a >= dist4 )
		{
			ind = 4;
			a -= dist4;
		}

		if( a >= dist2 )
		{
			ind += 2;
			a -= dist2;
		}

		if( a >= dist )
			ind += 1;

		// turn linear scale into DXT index (0/1 are extremal pts)
		ind = -ind & 7;
		ind ^= ( 2 > ind ) ? 1 : 0;

		// write index
		// Pixel k = blockThreadId * 4 + i owns 3 bits starting at bit 16 + 3 * k;
		// the low 16 bits of the word are left for the two endpoint bytes.
		const uint bits = 16u + ( ( blockThreadId << 2u ) + i ) * 3u;
		if( bits < 32u )
		{
			mask0 |= uint( ind ) << bits;
			if( bits + 3u > 32u )
			{
				// The index straddles the 32-bit boundary: spill its high bits.
				mask1 |= uint( ind ) >> ( 32u - bits );
			}
		}
		else
		{
			mask1 |= uint( ind ) << ( bits - 32u );
		}
	}

	// Merge the per-thread partial masks into the block's shared mask.
	if( mask0 != 0u )
		atomicOr( g_mask[maskIdxBase].x, mask0 );
	if( mask1 != 0u )
		atomicOr( g_mask[maskIdxBase].y, mask1 );

	__sharedOnlyBarrier;

	// Only one thread per block writes the finished block.
	if( blockThreadId == 0u )
	{
		// Save data
		uint4 outputBytes;

		// Word 0: max endpoint in the low byte, min endpoint in the next byte.
		// The two unused pack components are 0.0, so the upper 16 bits are zero.
		if( p_useSNorm != 0u )
		{
			outputBytes.x =
				packSnorm4x8( float4( maxVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f,
									  minVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, 0.0f, 0.0f ) );
		}
		else
		{
			outputBytes.x = packUnorm4x8(
				float4( maxVal * ( 1.0f / 255.0f ), minVal * ( 1.0f / 255.0f ), 0.0f, 0.0f ) );
		}
		// Words 1..3: the 48 index bits, split into 16-bit pieces.
		outputBytes.y = g_mask[maskIdxBase].x >> 16u;
		outputBytes.z = g_mask[maskIdxBase].y & 0xFFFFu;
		outputBytes.w = g_mask[maskIdxBase].y >> 16u;

		// One destination texel per block.
		uint2 dstUV = gl_GlobalInvocationID.yz;
		imageStore( dstTexture, int2( dstUV ), outputBytes );
	}
}