// xref: /aosp_15_r20/external/mesa3d/src/compiler/glsl/bc4.glsl (revision 6104692788411f58d303aa86923a9ff6ecaded22)
/*
 * Copyright 2020-2022 Matias N. Goldberg
 * Copyright 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#version 310 es

#if defined(GL_ES) && GL_ES == 1
	// Desktop GLSL allows the const keyword for either compile-time or
	// run-time constants. GLSL ES only allows the keyword for compile-time
	// constants. Since we use const on run-time constants, define it to
	// nothing.
	#define const
#endif

// Shared-memory fence followed by a work-group execution barrier.
// The expansion already ends in a semicolon, so writing
// "__sharedOnlyBarrier;" at a use site just adds a harmless empty statement.
#define __sharedOnlyBarrier memoryBarrierShared();barrier();

// The next line is a placeholder, not GLSL: it has to be replaced with the
// contents of the named settings file before this source is compiled.
// The names float2, float4, uint2, uint4, int2, OGRE_Load2D, min3 and max3
// are used below but never defined in this file, so they are presumably
// supplied by that include -- TODO confirm against the host code.
%s // include "CrossPlatformSettings_piece_all.glsl"

// Per-invocation (min, max) pairs: one slot for each of the 4 * 4 * 4
// invocations of a work group, indexed by (local z << 4) + (local y << 2) + local x.
shared float2 g_minMaxValues[4u * 4u * 4u];
// Per-block index bits: one (low word, high word) pair for each of the 4 * 4
// blocks handled by a work group, indexed by (local z << 2) + local y.
shared uint2 g_mask[4u * 4u];

// params.x selects the source channel, params.y selects the endpoint packing.
layout( location = 0 ) uniform uint2 params;

// 0 reads .x, 1 reads .y, any other value reads .w of the source texel.
#define p_channelIdx params.x
// Non-zero packs the endpoints with packSnorm4x8, zero with packUnorm4x8.
#define p_useSNorm params.y

// Source image, fetched texel by texel through OGRE_Load2D.
uniform sampler2D srcTex;

// Destination image: main() stores one uint4 per block here, at the block's
// coordinates.
layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

// x: the 4 invocations cooperating on one block; y, z: a 4 x 4 tile of blocks.
layout( local_size_x = 4,  //
		local_size_y = 4,  //
		local_size_z = 4 ) in;

/// Encodes one 4x4 block of a single source channel into two 8-bit endpoints
/// plus sixteen 3-bit indices, and stores the result as one uint4 texel.
///
/// Each block is 16 pixels
/// Each thread works on 4 pixels (one row of the block)
/// Therefore each block needs 4 threads, generating 8 masks
/// At the end these 8 masks get merged into 2 and results written to output
///
/// Invocation mapping:
///  - gl_LocalInvocationID.x is the thread (row) index inside the block
///  - gl_GlobalInvocationID.yz is the block coordinate, used both to locate the
///    source pixels (shifted left by 2) and as the destination texel
///  - NOTE(review): the thread index comes from the local X id only, so the
///    dispatch presumably uses a single work group along X -- confirm with caller
///
/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?**
///
/// A: It's a sweet spot.
///  - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
///  - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
///    overhead, and also more LDS usage which reduces occupancy.
///  - Long threads (e.g. 1 thread per block) misses parallelism opportunities
void main()
{
	float minVal, maxVal;
	float4 srcPixel;

	const uint blockThreadId = gl_LocalInvocationID.x;

	// Top-left pixel of this block: block coordinate times 4.
	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u;

	// Load this thread's row: column i, row blockThreadId of the block.
	for( uint i = 0u; i < 4u; ++i )
	{
		const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i, blockThreadId );

		const float4 value = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyzw;
		// Channel select: 0 reads x, 1 reads y, anything else reads w.
		srcPixel[i] = p_channelIdx == 0u ? value.x : ( p_channelIdx == 1u ? value.y : value.w );
		// Work in 8-bit units from here on
		// (assumes the fetched value is normalized to [0, 1] -- TODO confirm).
		srcPixel[i] *= 255.0f;
	}

	// Min/max over this thread's own 4 pixels.
	minVal = min3( srcPixel.x, srcPixel.y, srcPixel.z );
	maxVal = max3( srcPixel.x, srcPixel.y, srcPixel.z );
	minVal = min( minVal, srcPixel.w );
	maxVal = max( maxVal, srcPixel.w );

	// First of the 4 consecutive min/max slots owned by this block, and the
	// single mask slot shared by the block's 4 threads.
	const uint minMaxIdxBase = ( gl_LocalInvocationID.z << 4u ) + ( gl_LocalInvocationID.y << 2u );
	const uint maskIdxBase = ( gl_LocalInvocationID.z << 2u ) + gl_LocalInvocationID.y;

	g_minMaxValues[minMaxIdxBase + blockThreadId] = float2( minVal, maxVal );
	// All 4 threads of the block write the same zero here; the result is the
	// same whichever write lands last.
	g_mask[maskIdxBase] = uint2( 0u, 0u );

	// Every thread's min/max must be published, and the mask cleared, before
	// anyone reads the former or ORs into the latter.
	__sharedOnlyBarrier;

	// Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
	for( uint i = 0u; i < 4u; ++i )
	{
		minVal = min( g_minMaxValues[minMaxIdxBase + i].x, minVal );
		maxVal = max( g_minMaxValues[minMaxIdxBase + i].y, maxVal );
	}

	// determine bias and emit color indices
	// given the choice of maxVal/minVal, these indices are optimal:
	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
	float dist = maxVal - minVal;
	float dist4 = dist * 4.0f;
	float dist2 = dist * 2.0f;
	float bias = ( dist < 8.0f ) ? ( dist - 1.0f ) : ( trunc( dist * 0.5f ) + 2.0f );
	bias -= minVal * 7.0f;

	// This thread's share of the block's 64-bit word: mask0 is the low 32 bits,
	// mask1 the high 32 bits.
	uint mask0 = 0u, mask1 = 0u;

	for( uint i = 0u; i < 4u; ++i )
	{
		float a = srcPixel[i] * 7.0f + bias;

		int ind = 0;

		// select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
		// The three compares below build the 3-bit result one bit at a time
		// (4, then 2, then 1) instead of performing a division.
		if( a >= dist4 )
		{
			ind = 4;
			a -= dist4;
		}

		if( a >= dist2 )
		{
			ind += 2;
			a -= dist2;
		}

		if( a >= dist )
			ind += 1;

		// turn linear scale into DXT index (0/1 are extremal pts)
		// Linear 7 (max) becomes index 0, linear 0 (min) becomes index 1, and
		// linear 6..1 become indices 2..7.
		ind = -ind & 7;
		ind ^= ( 2 > ind ) ? 1 : 0;

		// write index
		// Pixel number within the block is blockThreadId * 4 + i; each pixel
		// takes 3 bits, and the index field starts at bit 16 because the low
		// 16 bits of the block hold the two endpoints.
		const uint bits = 16u + ( ( blockThreadId << 2u ) + i ) * 3u;
		if( bits < 32u )
		{
			mask0 |= uint( ind ) << bits;
			// An index starting at bit 30 or 31 straddles the two words; its
			// upper bits spill into the bottom of the high word.
			if( bits + 3u > 32u )
			{
				mask1 |= uint( ind ) >> ( 32u - bits );
			}
		}
		else
		{
			mask1 |= uint( ind ) << ( bits - 32u );
		}
	}

	// Merge into the block's shared mask; skip the atomic when there is
	// nothing to OR in.
	if( mask0 != 0u )
		atomicOr( g_mask[maskIdxBase].x, mask0 );
	if( mask1 != 0u )
		atomicOr( g_mask[maskIdxBase].y, mask1 );

	// All 4 threads must have merged their bits before thread 0 reads them back.
	__sharedOnlyBarrier;

	// Only one thread per block writes the result.
	if( blockThreadId == 0u )
	{
		// Save data
		uint4 outputBytes;

		// x: the two endpoints, max in the lowest byte and min in the next one.
		// The two upper components are 0, so the value fits in 16 bits.
		if( p_useSNorm != 0u )
		{
			// Remap from [0, 255] to [-1, 1] before the signed pack.
			outputBytes.x =
				packSnorm4x8( float4( maxVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f,
									  minVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, 0.0f, 0.0f ) );
		}
		else
		{
			outputBytes.x = packUnorm4x8(
				float4( maxVal * ( 1.0f / 255.0f ), minVal * ( 1.0f / 255.0f ), 0.0f, 0.0f ) );
		}
		// y, z, w: the 48 index bits, split into three 16-bit words
		// (bits 16..31 of the low mask word, then the two halves of the high one).
		outputBytes.y = g_mask[maskIdxBase].x >> 16u;
		outputBytes.z = g_mask[maskIdxBase].y & 0xFFFFu;
		outputBytes.w = g_mask[maskIdxBase].y >> 16u;

		// One destination texel per block.
		uint2 dstUV = gl_GlobalInvocationID.yz;
		imageStore( dstTexture, int2( dstUV ), outputBytes );
	}
}
