1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20 /**
21 * @brief Functions to compress a symbolic block.
22 */
23
24 #include "astcenc_internal.h"
25 #include "astcenc_diagnostic_trace.h"
26
27 #include <cassert>
28
29 /**
30 * @brief Merge two planes of endpoints into a single vector.
31 *
32 * @param ep_plane1 The endpoints for plane 1.
33 * @param ep_plane2 The endpoints for plane 2.
34 * @param component_plane2 The color component for plane 2.
35 * @param[out] result The merged output.
36 */
merge_endpoints(const endpoints & ep_plane1,const endpoints & ep_plane2,unsigned int component_plane2,endpoints & result)37 static void merge_endpoints(
38 const endpoints& ep_plane1,
39 const endpoints& ep_plane2,
40 unsigned int component_plane2,
41 endpoints& result
42 ) {
43 unsigned int partition_count = ep_plane1.partition_count;
44 assert(partition_count == 1);
45
46 vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
47
48 result.partition_count = partition_count;
49 result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
50 result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
51 }
52
53 /**
54 * @brief Attempt to improve weights given a chosen configuration.
55 *
56 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
57 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
58 * down by one quantization step.
59 *
60 * This is a specialized function which only supports operating on undecimated weight grids,
61 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
62 * is needed less often.
63 *
64 * @param decode_mode The decode mode (LDR, HDR).
65 * @param bsd The block size information.
66 * @param blk The image block color data to compress.
67 * @param[out] scb The symbolic compressed block output.
68 */
realign_weights_undecimated(astcenc_profile decode_mode,const block_size_descriptor & bsd,const image_block & blk,symbolic_compressed_block & scb)69 static bool realign_weights_undecimated(
70 astcenc_profile decode_mode,
71 const block_size_descriptor& bsd,
72 const image_block& blk,
73 symbolic_compressed_block& scb
74 ) {
75 // Get the partition descriptor
76 unsigned int partition_count = scb.partition_count;
77 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
78
79 // Get the quantization table
80 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
81 unsigned int weight_quant_level = bm.quant_mode;
82 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
83
84 unsigned int max_plane = bm.is_dual_plane;
85 int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
86 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
87
88 // Decode the color endpoints
89 bool rgb_hdr;
90 bool alpha_hdr;
91 vint4 endpnt0[BLOCK_MAX_PARTITIONS];
92 vint4 endpnt1[BLOCK_MAX_PARTITIONS];
93 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
94 vfloat4 offset[BLOCK_MAX_PARTITIONS];
95
96 promise(partition_count > 0);
97
98 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
99 {
100 unpack_color_endpoints(decode_mode,
101 scb.color_formats[pa_idx],
102 scb.get_color_quant_mode(),
103 scb.color_values[pa_idx],
104 rgb_hdr, alpha_hdr,
105 endpnt0[pa_idx],
106 endpnt1[pa_idx]);
107 }
108
109 uint8_t* dec_weights_uquant = scb.weights;
110 bool adjustments = false;
111
112 // For each plane and partition ...
113 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
114 {
115 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
116 {
117 // Compute the endpoint delta for all components in current plane
118 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
119 epd = select(epd, vint4::zero(), plane_mask);
120
121 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
122 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
123 }
124
125 // For each weight compute previous, current, and next errors
126 promise(bsd.texel_count > 0);
127 for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
128 {
129 int uqw = dec_weights_uquant[texel];
130
131 uint32_t prev_and_next = qat.prev_next_values[uqw];
132 int uqw_down = prev_and_next & 0xFF;
133 int uqw_up = (prev_and_next >> 8) & 0xFF;
134
135 // Interpolate the colors to create the diffs
136 float weight_base = static_cast<float>(uqw);
137 float weight_down = static_cast<float>(uqw_down - uqw);
138 float weight_up = static_cast<float>(uqw_up - uqw);
139
140 unsigned int partition = pi.partition_of_texel[texel];
141 vfloat4 color_offset = offset[partition];
142 vfloat4 color_base = endpnt0f[partition];
143
144 vfloat4 color = color_base + color_offset * weight_base;
145 vfloat4 orig_color = blk.texel(texel);
146 vfloat4 error_weight = blk.channel_weight;
147
148 vfloat4 color_diff = color - orig_color;
149 vfloat4 color_diff_down = color_diff + color_offset * weight_down;
150 vfloat4 color_diff_up = color_diff + color_offset * weight_up;
151
152 float error_base = dot_s(color_diff * color_diff, error_weight);
153 float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
154 float error_up = dot_s(color_diff_up * color_diff_up, error_weight);
155
156 // Check if the prev or next error is better, and if so use it
157 if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
158 {
159 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
160 adjustments = true;
161 }
162 else if ((error_down < error_base) && (uqw > 0))
163 {
164 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
165 adjustments = true;
166 }
167 }
168
169 // Prepare iteration for plane 2
170 dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
171 plane_mask = ~plane_mask;
172 }
173
174 return adjustments;
175 }
176
177 /**
178 * @brief Attempt to improve weights given a chosen configuration.
179 *
180 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
181 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
182 * down by one quantization step.
183 *
184 * @param decode_mode The decode mode (LDR, HDR).
185 * @param bsd The block size information.
186 * @param blk The image block color data to compress.
187 * @param[out] scb The symbolic compressed block output.
188 */
realign_weights_decimated(astcenc_profile decode_mode,const block_size_descriptor & bsd,const image_block & blk,symbolic_compressed_block & scb)189 static bool realign_weights_decimated(
190 astcenc_profile decode_mode,
191 const block_size_descriptor& bsd,
192 const image_block& blk,
193 symbolic_compressed_block& scb
194 ) {
195 // Get the partition descriptor
196 unsigned int partition_count = scb.partition_count;
197 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
198
199 // Get the quantization table
200 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
201 unsigned int weight_quant_level = bm.quant_mode;
202 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
203
204 // Get the decimation table
205 const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
206 unsigned int weight_count = di.weight_count;
207 assert(weight_count != bsd.texel_count);
208
209 unsigned int max_plane = bm.is_dual_plane;
210 int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
211 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
212
213 // Decode the color endpoints
214 bool rgb_hdr;
215 bool alpha_hdr;
216 vint4 endpnt0[BLOCK_MAX_PARTITIONS];
217 vint4 endpnt1[BLOCK_MAX_PARTITIONS];
218 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
219 vfloat4 offset[BLOCK_MAX_PARTITIONS];
220
221 promise(partition_count > 0);
222 promise(weight_count > 0);
223
224 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
225 {
226 unpack_color_endpoints(decode_mode,
227 scb.color_formats[pa_idx],
228 scb.get_color_quant_mode(),
229 scb.color_values[pa_idx],
230 rgb_hdr, alpha_hdr,
231 endpnt0[pa_idx],
232 endpnt1[pa_idx]);
233 }
234
235 uint8_t* dec_weights_uquant = scb.weights;
236 bool adjustments = false;
237
238 // For each plane and partition ...
239 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
240 {
241 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
242 {
243 // Compute the endpoint delta for all components in current plane
244 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
245 epd = select(epd, vint4::zero(), plane_mask);
246
247 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
248 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
249 }
250
251 // Create an unquantized weight grid for this decimation level
252 alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS];
253 for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
254 {
255 vint unquant_value(dec_weights_uquant + we_idx);
256 vfloat unquant_valuef = int_to_float(unquant_value);
257 storea(unquant_valuef, uq_weightsf + we_idx);
258 }
259
260 // For each weight compute previous, current, and next errors
261 for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
262 {
263 int uqw = dec_weights_uquant[we_idx];
264 uint32_t prev_and_next = qat.prev_next_values[uqw];
265
266 float uqw_base = uq_weightsf[we_idx];
267 float uqw_down = static_cast<float>(prev_and_next & 0xFF);
268 float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
269
270 float uqw_diff_down = uqw_down - uqw_base;
271 float uqw_diff_up = uqw_up - uqw_base;
272
273 vfloat4 error_basev = vfloat4::zero();
274 vfloat4 error_downv = vfloat4::zero();
275 vfloat4 error_upv = vfloat4::zero();
276
277 // Interpolate the colors to create the diffs
278 unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
279 promise(texels_to_evaluate > 0);
280 for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
281 {
282 unsigned int texel = di.weight_texel[te_idx][we_idx];
283
284 const uint8_t *texel_weights = di.texel_weights_texel[we_idx][te_idx];
285 const float *texel_weights_float = di.texel_weights_float_texel[we_idx][te_idx];
286
287 float tw_base = texel_weights_float[0];
288
289 float weight_base = (uqw_base * tw_base
290 + uq_weightsf[texel_weights[1]] * texel_weights_float[1])
291 + (uq_weightsf[texel_weights[2]] * texel_weights_float[2]
292 + uq_weightsf[texel_weights[3]] * texel_weights_float[3]);
293
294 // Ideally this is integer rounded, but IQ gain it isn't worth the overhead
295 // float weight = astc::flt_rd(weight_base + 0.5f);
296 // float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
297 // float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
298 float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
299 float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
300
301 unsigned int partition = pi.partition_of_texel[texel];
302 vfloat4 color_offset = offset[partition];
303 vfloat4 color_base = endpnt0f[partition];
304
305 vfloat4 color = color_base + color_offset * weight_base;
306 vfloat4 orig_color = blk.texel(texel);
307
308 vfloat4 color_diff = color - orig_color;
309 vfloat4 color_down_diff = color_diff + color_offset * weight_down;
310 vfloat4 color_up_diff = color_diff + color_offset * weight_up;
311
312 error_basev += color_diff * color_diff;
313 error_downv += color_down_diff * color_down_diff;
314 error_upv += color_up_diff * color_up_diff;
315 }
316
317 vfloat4 error_weight = blk.channel_weight;
318 float error_base = hadd_s(error_basev * error_weight);
319 float error_down = hadd_s(error_downv * error_weight);
320 float error_up = hadd_s(error_upv * error_weight);
321
322 // Check if the prev or next error is better, and if so use it
323 if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
324 {
325 uq_weightsf[we_idx] = uqw_up;
326 dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
327 adjustments = true;
328 }
329 else if ((error_down < error_base) && (uqw > 0))
330 {
331 uq_weightsf[we_idx] = uqw_down;
332 dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
333 adjustments = true;
334 }
335 }
336
337 // Prepare iteration for plane 2
338 dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
339 plane_mask = ~plane_mask;
340 }
341
342 return adjustments;
343 }
344
345 /**
346 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
347 *
348 * @param config The compressor configuration.
349 * @param bsd The block size information.
350 * @param blk The image block color data to compress.
351 * @param only_always True if we only use "always" percentile block modes.
352 * @param tune_errorval_threshold The error value threshold.
353 * @param partition_count The partition count.
354 * @param partition_index The partition index if @c partition_count is 2-4.
355 * @param[out] scb The symbolic compressed block output.
356 * @param[out] tmpbuf The quantized weights for plane 1.
357 */
compress_symbolic_block_for_partition_1plane(const astcenc_config & config,const block_size_descriptor & bsd,const image_block & blk,bool only_always,float tune_errorval_threshold,unsigned int partition_count,unsigned int partition_index,symbolic_compressed_block & scb,compression_working_buffers & tmpbuf,int quant_limit)358 static float compress_symbolic_block_for_partition_1plane(
359 const astcenc_config& config,
360 const block_size_descriptor& bsd,
361 const image_block& blk,
362 bool only_always,
363 float tune_errorval_threshold,
364 unsigned int partition_count,
365 unsigned int partition_index,
366 symbolic_compressed_block& scb,
367 compression_working_buffers& tmpbuf,
368 int quant_limit
369 ) {
370 promise(partition_count > 0);
371 promise(config.tune_candidate_limit > 0);
372 promise(config.tune_refinement_limit > 0);
373
374 int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
375
376 auto compute_difference = &compute_symbolic_block_difference_1plane;
377 if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
378 {
379 compute_difference = &compute_symbolic_block_difference_1plane_1partition;
380 }
381
382 const auto& pi = bsd.get_partition_info(partition_count, partition_index);
383
384 // Compute ideal weights and endpoint colors, with no quantization or decimation
385 endpoints_and_weights& ei = tmpbuf.ei1;
386 compute_ideal_colors_and_weights_1plane(blk, pi, ei);
387
388 // Compute ideal weights and endpoint colors for every decimation
389 float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
390 uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
391
392 // For each decimation mode, compute an ideal set of weights with no quantization
393 unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
394 : bsd.decimation_mode_count_selected;
395 promise(max_decimation_modes > 0);
396 for (unsigned int i = 0; i < max_decimation_modes; i++)
397 {
398 const auto& dm = bsd.get_decimation_mode(i);
399 if (!dm.is_ref_1_plane(static_cast<quant_method>(max_weight_quant)))
400 {
401 continue;
402 }
403
404 const auto& di = bsd.get_decimation_info(i);
405
406 compute_ideal_weights_for_decimation(
407 ei,
408 di,
409 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
410 }
411
412 // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
413 // weight pair, compute the smallest weight that will result in a color value greater than 1
414 vfloat4 min_ep(10.0f);
415 for (unsigned int i = 0; i < partition_count; i++)
416 {
417 vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
418
419 vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
420 min_ep = select(min_ep, ep, use_ep);
421 }
422
423 float min_wt_cutoff = hmin_s(min_ep);
424
425 // For each mode, use the angular method to compute a shift
426 compute_angular_endpoints_1plane(
427 only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
428
429 float* weight_low_value = tmpbuf.weight_low_value1;
430 float* weight_high_value = tmpbuf.weight_high_value1;
431 int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
432 float* qwt_errors = tmpbuf.qwt_errors;
433
434 // For each mode (which specifies a decimation and a quantization):
435 // * Compute number of bits needed for the quantized weights
436 // * Generate an optimized set of quantized weights
437 // * Compute quantization errors for the mode
438
439
440 static const int8_t free_bits_for_partition_count[4] {
441 115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
442 };
443
444 unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
445 : bsd.block_mode_count_1plane_selected;
446 promise(max_block_modes > 0);
447 for (unsigned int i = 0; i < max_block_modes; i++)
448 {
449 const block_mode& bm = bsd.block_modes[i];
450
451 if (bm.quant_mode > max_weight_quant)
452 {
453 qwt_errors[i] = 1e38f;
454 continue;
455 }
456
457 assert(!bm.is_dual_plane);
458 int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
459 if (bitcount <= 0)
460 {
461 qwt_errors[i] = 1e38f;
462 continue;
463 }
464
465 if (weight_high_value[i] > 1.02f * min_wt_cutoff)
466 {
467 weight_high_value[i] = 1.0f;
468 }
469
470 int decimation_mode = bm.decimation_mode;
471 const auto& di = bsd.get_decimation_info(decimation_mode);
472
473 qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
474
475 alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
476
477 // Generate the optimized set of weights for the weight mode
478 compute_quantized_weights_for_decimation(
479 di,
480 weight_low_value[i], weight_high_value[i],
481 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
482 dec_weights_uquantf,
483 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
484 bm.get_weight_quant_mode());
485
486 // Compute weight quantization errors for the block mode
487 qwt_errors[i] = compute_error_of_weight_set_1plane(
488 ei,
489 di,
490 dec_weights_uquantf);
491 }
492
493 // Decide the optimal combination of color endpoint encodings and weight encodings
494 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
495 int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
496
497 quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
498 quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
499
500 unsigned int candidate_count = compute_ideal_endpoint_formats(
501 pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
502 config.tune_candidate_limit, 0, max_block_modes,
503 partition_format_specifiers, block_mode_index,
504 color_quant_level, color_quant_level_mod, tmpbuf);
505
506 // Iterate over the N believed-to-be-best modes to find out which one is actually best
507 float best_errorval_in_mode = ERROR_CALC_DEFAULT;
508 float best_errorval_in_scb = scb.errorval;
509
510 for (unsigned int i = 0; i < candidate_count; i++)
511 {
512 TRACE_NODE(node0, "candidate");
513
514 const int bm_packed_index = block_mode_index[i];
515 assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
516 const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
517
518 int decimation_mode = qw_bm.decimation_mode;
519 const auto& di = bsd.get_decimation_info(decimation_mode);
520 promise(di.weight_count > 0);
521
522 trace_add_data("weight_x", di.weight_x);
523 trace_add_data("weight_y", di.weight_y);
524 trace_add_data("weight_z", di.weight_z);
525 trace_add_data("weight_quant", qw_bm.quant_mode);
526
527 // Recompute the ideal color endpoints before storing them
528 vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
529 vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
530
531 symbolic_compressed_block workscb;
532 endpoints workep = ei.ep;
533
534 uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
535
536 for (unsigned int j = 0; j < di.weight_count; j++)
537 {
538 workscb.weights[j] = u8_weight_src[j];
539 }
540
541 for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
542 {
543 recompute_ideal_colors_1plane(
544 blk, pi, di, workscb.weights,
545 workep, rgbs_colors, rgbo_colors);
546
547 // Quantize the chosen color, tracking if worth trying the mod value
548 bool all_same = color_quant_level[i] != color_quant_level_mod[i];
549 for (unsigned int j = 0; j < partition_count; j++)
550 {
551 workscb.color_formats[j] = pack_color_endpoints(
552 workep.endpt0[j],
553 workep.endpt1[j],
554 rgbs_colors[j],
555 rgbo_colors[j],
556 partition_format_specifiers[i][j],
557 workscb.color_values[j],
558 color_quant_level[i]);
559
560 all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
561 }
562
563 // If all the color endpoint modes are the same, we get a few more bits to store colors;
564 // let's see if we can take advantage of this: requantize all the colors and see if the
565 // endpoint modes remain the same.
566 workscb.color_formats_matched = 0;
567 if (partition_count >= 2 && all_same)
568 {
569 uint8_t colorvals[BLOCK_MAX_PARTITIONS][12];
570 uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
571 bool all_same_mod = true;
572 for (unsigned int j = 0; j < partition_count; j++)
573 {
574 color_formats_mod[j] = pack_color_endpoints(
575 workep.endpt0[j],
576 workep.endpt1[j],
577 rgbs_colors[j],
578 rgbo_colors[j],
579 partition_format_specifiers[i][j],
580 colorvals[j],
581 color_quant_level_mod[i]);
582
583 // Early out as soon as it's no longer possible to use mod
584 if (color_formats_mod[j] != color_formats_mod[0])
585 {
586 all_same_mod = false;
587 break;
588 }
589 }
590
591 if (all_same_mod)
592 {
593 workscb.color_formats_matched = 1;
594 for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
595 {
596 for (unsigned int k = 0; k < 8; k++)
597 {
598 workscb.color_values[j][k] = colorvals[j][k];
599 }
600
601 workscb.color_formats[j] = color_formats_mod[j];
602 }
603 }
604 }
605
606 // Store header fields
607 workscb.partition_count = static_cast<uint8_t>(partition_count);
608 workscb.partition_index = static_cast<uint16_t>(partition_index);
609 workscb.plane2_component = -1;
610 workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
611 workscb.block_mode = qw_bm.mode_index;
612 workscb.block_type = SYM_BTYPE_NONCONST;
613
614 // Pre-realign test
615 if (l == 0)
616 {
617 float errorval = compute_difference(config, bsd, workscb, blk);
618 if (errorval == -ERROR_CALC_DEFAULT)
619 {
620 errorval = -errorval;
621 workscb.block_type = SYM_BTYPE_ERROR;
622 }
623
624 trace_add_data("error_prerealign", errorval);
625 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
626
627 // Average refinement improvement is 3.5% per iteration (allow 5%), but the first
628 // iteration can help more so we give it a extra 10% leeway. Use this knowledge to
629 // drive a heuristic to skip blocks that are unlikely to catch up with the best
630 // block we have already.
631 unsigned int iters_remaining = config.tune_refinement_limit - l;
632 float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.1f;
633 if (errorval > (threshold * best_errorval_in_scb))
634 {
635 break;
636 }
637
638 if (errorval < best_errorval_in_scb)
639 {
640 best_errorval_in_scb = errorval;
641 workscb.errorval = errorval;
642 scb = workscb;
643
644 if (errorval < tune_errorval_threshold)
645 {
646 // Skip remaining candidates - this is "good enough"
647 i = candidate_count;
648 break;
649 }
650 }
651 }
652
653 bool adjustments;
654 if (di.weight_count != bsd.texel_count)
655 {
656 adjustments = realign_weights_decimated(
657 config.profile, bsd, blk, workscb);
658 }
659 else
660 {
661 adjustments = realign_weights_undecimated(
662 config.profile, bsd, blk, workscb);
663 }
664
665 // Post-realign test
666 float errorval = compute_difference(config, bsd, workscb, blk);
667 if (errorval == -ERROR_CALC_DEFAULT)
668 {
669 errorval = -errorval;
670 workscb.block_type = SYM_BTYPE_ERROR;
671 }
672
673 trace_add_data("error_postrealign", errorval);
674 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
675
676 // Average refinement improvement is 3.5% per iteration, so skip blocks that are
677 // unlikely to catch up with the best block we have already. Assume a 5% per step to
678 // give benefit of the doubt ...
679 unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
680 float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.0f;
681 if (errorval > (threshold * best_errorval_in_scb))
682 {
683 break;
684 }
685
686 if (errorval < best_errorval_in_scb)
687 {
688 best_errorval_in_scb = errorval;
689 workscb.errorval = errorval;
690 scb = workscb;
691
692 if (errorval < tune_errorval_threshold)
693 {
694 // Skip remaining candidates - this is "good enough"
695 i = candidate_count;
696 break;
697 }
698 }
699
700 if (!adjustments)
701 {
702 break;
703 }
704 }
705 }
706
707 return best_errorval_in_mode;
708 }
709
710 /**
711 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
712 *
713 * @param config The compressor configuration.
714 * @param bsd The block size information.
715 * @param blk The image block color data to compress.
716 * @param tune_errorval_threshold The error value threshold.
717 * @param plane2_component The component index for the second plane of weights.
718 * @param[out] scb The symbolic compressed block output.
719 * @param[out] tmpbuf The quantized weights for plane 1.
720 */
compress_symbolic_block_for_partition_2planes(const astcenc_config & config,const block_size_descriptor & bsd,const image_block & blk,float tune_errorval_threshold,unsigned int plane2_component,symbolic_compressed_block & scb,compression_working_buffers & tmpbuf,int quant_limit)721 static float compress_symbolic_block_for_partition_2planes(
722 const astcenc_config& config,
723 const block_size_descriptor& bsd,
724 const image_block& blk,
725 float tune_errorval_threshold,
726 unsigned int plane2_component,
727 symbolic_compressed_block& scb,
728 compression_working_buffers& tmpbuf,
729 int quant_limit
730 ) {
731 promise(config.tune_candidate_limit > 0);
732 promise(config.tune_refinement_limit > 0);
733 promise(bsd.decimation_mode_count_selected > 0);
734
735 int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
736
737 // Compute ideal weights and endpoint colors, with no quantization or decimation
738 endpoints_and_weights& ei1 = tmpbuf.ei1;
739 endpoints_and_weights& ei2 = tmpbuf.ei2;
740
741 compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
742
743 // Compute ideal weights and endpoint colors for every decimation
744 float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
745 uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
746
747 // For each decimation mode, compute an ideal set of weights with no quantization
748 for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
749 {
750 const auto& dm = bsd.get_decimation_mode(i);
751 if (!dm.is_ref_2_plane(static_cast<quant_method>(max_weight_quant)))
752 {
753 continue;
754 }
755
756 const auto& di = bsd.get_decimation_info(i);
757
758 compute_ideal_weights_for_decimation(
759 ei1,
760 di,
761 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
762
763 compute_ideal_weights_for_decimation(
764 ei2,
765 di,
766 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
767 }
768
769 // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
770 // weight pair, compute the smallest weight that will result in a color value greater than 1
771 vfloat4 min_ep1(10.0f);
772 vfloat4 min_ep2(10.0f);
773
774 vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
775 vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
776 min_ep1 = select(min_ep1, ep1, use_ep1);
777
778 vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
779 vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
780 min_ep2 = select(min_ep2, ep2, use_ep2);
781
782 vfloat4 err_max(ERROR_CALC_DEFAULT);
783 vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
784
785 // Set the plane2 component to max error in ep1
786 min_ep1 = select(min_ep1, err_max, err_mask);
787
788 float min_wt_cutoff1 = hmin_s(min_ep1);
789
790 // Set the minwt2 to the plane2 component min in ep2
791 float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
792
793 compute_angular_endpoints_2planes(
794 bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
795
796 // For each mode (which specifies a decimation and a quantization):
797 // * Compute number of bits needed for the quantized weights
798 // * Generate an optimized set of quantized weights
799 // * Compute quantization errors for the mode
800
801 float* weight_low_value1 = tmpbuf.weight_low_value1;
802 float* weight_high_value1 = tmpbuf.weight_high_value1;
803 float* weight_low_value2 = tmpbuf.weight_low_value2;
804 float* weight_high_value2 = tmpbuf.weight_high_value2;
805
806 int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
807 float* qwt_errors = tmpbuf.qwt_errors;
808
809 unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
810 unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
811
812 for (unsigned int i = start_2plane; i < end_2plane; i++)
813 {
814 const block_mode& bm = bsd.block_modes[i];
815 assert(bm.is_dual_plane);
816
817 if (bm.quant_mode > max_weight_quant)
818 {
819 qwt_errors[i] = 1e38f;
820 continue;
821 }
822
823 qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
824
825 if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
826 {
827 weight_high_value1[i] = 1.0f;
828 }
829
830 if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
831 {
832 weight_high_value2[i] = 1.0f;
833 }
834
835 unsigned int decimation_mode = bm.decimation_mode;
836 const auto& di = bsd.get_decimation_info(decimation_mode);
837
838 alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
839
840 // Generate the optimized set of weights for the mode
841 compute_quantized_weights_for_decimation(
842 di,
843 weight_low_value1[i],
844 weight_high_value1[i],
845 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
846 dec_weights_uquantf,
847 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
848 bm.get_weight_quant_mode());
849
850 compute_quantized_weights_for_decimation(
851 di,
852 weight_low_value2[i],
853 weight_high_value2[i],
854 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
855 dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
856 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
857 bm.get_weight_quant_mode());
858
859 // Compute weight quantization errors for the block mode
860 qwt_errors[i] = compute_error_of_weight_set_2planes(
861 ei1,
862 ei2,
863 di,
864 dec_weights_uquantf,
865 dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
866 }
867
868 // Decide the optimal combination of color endpoint encodings and weight encodings
869 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
870 int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
871
872 quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
873 quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
874
875 endpoints epm;
876 merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
877
878 const auto& pi = bsd.get_partition_info(1, 0);
879 unsigned int candidate_count = compute_ideal_endpoint_formats(
880 pi, blk, epm, qwt_bitcounts, qwt_errors,
881 config.tune_candidate_limit,
882 bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
883 partition_format_specifiers, block_mode_index,
884 color_quant_level, color_quant_level_mod, tmpbuf);
885
886 // Iterate over the N believed-to-be-best modes to find out which one is actually best
887 float best_errorval_in_mode = ERROR_CALC_DEFAULT;
888 float best_errorval_in_scb = scb.errorval;
889
890 for (unsigned int i = 0; i < candidate_count; i++)
891 {
892 TRACE_NODE(node0, "candidate");
893
894 const int bm_packed_index = block_mode_index[i];
895 assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
896 bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
897 const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
898
899 int decimation_mode = qw_bm.decimation_mode;
900 const auto& di = bsd.get_decimation_info(decimation_mode);
901 promise(di.weight_count > 0);
902
903 trace_add_data("weight_x", di.weight_x);
904 trace_add_data("weight_y", di.weight_y);
905 trace_add_data("weight_z", di.weight_z);
906 trace_add_data("weight_quant", qw_bm.quant_mode);
907
908 vfloat4 rgbs_color;
909 vfloat4 rgbo_color;
910
911 symbolic_compressed_block workscb;
912 endpoints workep = epm;
913
914 uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
915 uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
916
917 for (int j = 0; j < di.weight_count; j++)
918 {
919 workscb.weights[j] = u8_weight1_src[j];
920 workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
921 }
922
923 for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
924 {
925 recompute_ideal_colors_2planes(
926 blk, bsd, di,
927 workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
928 workep, rgbs_color, rgbo_color, plane2_component);
929
930 // Quantize the chosen color
931 workscb.color_formats[0] = pack_color_endpoints(
932 workep.endpt0[0],
933 workep.endpt1[0],
934 rgbs_color, rgbo_color,
935 partition_format_specifiers[i][0],
936 workscb.color_values[0],
937 color_quant_level[i]);
938
939 // Store header fields
940 workscb.partition_count = 1;
941 workscb.partition_index = 0;
942 workscb.quant_mode = color_quant_level[i];
943 workscb.color_formats_matched = 0;
944 workscb.block_mode = qw_bm.mode_index;
945 workscb.plane2_component = static_cast<int8_t>(plane2_component);
946 workscb.block_type = SYM_BTYPE_NONCONST;
947
948 // Pre-realign test
949 if (l == 0)
950 {
951 float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
952 if (errorval == -ERROR_CALC_DEFAULT)
953 {
954 errorval = -errorval;
955 workscb.block_type = SYM_BTYPE_ERROR;
956 }
957
958 trace_add_data("error_prerealign", errorval);
959 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
960
961 // Average refinement improvement is 3.5% per iteration (allow 5%), but the first
962 // iteration can help more so we give it a extra 10% leeway. Use this knowledge to
963 // drive a heuristic to skip blocks that are unlikely to catch up with the best
964 // block we have already.
965 unsigned int iters_remaining = config.tune_refinement_limit - l;
966 float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.1f;
967 if (errorval > (threshold * best_errorval_in_scb))
968 {
969 break;
970 }
971
972 if (errorval < best_errorval_in_scb)
973 {
974 best_errorval_in_scb = errorval;
975 workscb.errorval = errorval;
976 scb = workscb;
977
978 if (errorval < tune_errorval_threshold)
979 {
980 // Skip remaining candidates - this is "good enough"
981 i = candidate_count;
982 break;
983 }
984 }
985 }
986
987 // Perform a final pass over the weights to try to improve them.
988 bool adjustments;
989 if (di.weight_count != bsd.texel_count)
990 {
991 adjustments = realign_weights_decimated(
992 config.profile, bsd, blk, workscb);
993 }
994 else
995 {
996 adjustments = realign_weights_undecimated(
997 config.profile, bsd, blk, workscb);
998 }
999
1000 // Post-realign test
1001 float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1002 if (errorval == -ERROR_CALC_DEFAULT)
1003 {
1004 errorval = -errorval;
1005 workscb.block_type = SYM_BTYPE_ERROR;
1006 }
1007
1008 trace_add_data("error_postrealign", errorval);
1009 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1010
1011 // Average refinement improvement is 3.5% per iteration, so skip blocks that are
1012 // unlikely to catch up with the best block we have already. Assume a 5% per step to
1013 // give benefit of the doubt ...
1014 unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1015 float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.0f;
1016 if (errorval > (threshold * best_errorval_in_scb))
1017 {
1018 break;
1019 }
1020
1021 if (errorval < best_errorval_in_scb)
1022 {
1023 best_errorval_in_scb = errorval;
1024 workscb.errorval = errorval;
1025 scb = workscb;
1026
1027 if (errorval < tune_errorval_threshold)
1028 {
1029 // Skip remaining candidates - this is "good enough"
1030 i = candidate_count;
1031 break;
1032 }
1033 }
1034
1035 if (!adjustments)
1036 {
1037 break;
1038 }
1039 }
1040 }
1041
1042 return best_errorval_in_mode;
1043 }
1044
1045 /**
1046 * @brief Determine the lowest cross-channel correlation factor.
1047 *
1048 * @param texels_per_block The number of texels in a block.
1049 * @param blk The image block color data to compress.
1050 *
1051 * @return Return the lowest correlation factor.
1052 */
prepare_block_statistics(int texels_per_block,const image_block & blk)1053 static float prepare_block_statistics(
1054 int texels_per_block,
1055 const image_block& blk
1056 ) {
1057 // Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1058 // of the matrix. The matrix is symmetric, so this is all we need for this use case.
1059 float rs = 0.0f;
1060 float gs = 0.0f;
1061 float bs = 0.0f;
1062 float as = 0.0f;
1063 float rr_var = 0.0f;
1064 float gg_var = 0.0f;
1065 float bb_var = 0.0f;
1066 float aa_var = 0.0f;
1067 float rg_cov = 0.0f;
1068 float rb_cov = 0.0f;
1069 float ra_cov = 0.0f;
1070 float gb_cov = 0.0f;
1071 float ga_cov = 0.0f;
1072 float ba_cov = 0.0f;
1073
1074 float weight_sum = 0.0f;
1075
1076 promise(texels_per_block > 0);
1077 for (int i = 0; i < texels_per_block; i++)
1078 {
1079 float weight = hadd_s(blk.channel_weight) / 4.0f;
1080 assert(weight >= 0.0f);
1081 weight_sum += weight;
1082
1083 float r = blk.data_r[i];
1084 float g = blk.data_g[i];
1085 float b = blk.data_b[i];
1086 float a = blk.data_a[i];
1087
1088 float rw = r * weight;
1089 rs += rw;
1090 rr_var += r * rw;
1091 rg_cov += g * rw;
1092 rb_cov += b * rw;
1093 ra_cov += a * rw;
1094
1095 float gw = g * weight;
1096 gs += gw;
1097 gg_var += g * gw;
1098 gb_cov += b * gw;
1099 ga_cov += a * gw;
1100
1101 float bw = b * weight;
1102 bs += bw;
1103 bb_var += b * bw;
1104 ba_cov += a * bw;
1105
1106 float aw = a * weight;
1107 as += aw;
1108 aa_var += a * aw;
1109 }
1110
1111 float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1112
1113 rr_var -= rs * (rs * rpt);
1114 rg_cov -= gs * (rs * rpt);
1115 rb_cov -= bs * (rs * rpt);
1116 ra_cov -= as * (rs * rpt);
1117
1118 gg_var -= gs * (gs * rpt);
1119 gb_cov -= bs * (gs * rpt);
1120 ga_cov -= as * (gs * rpt);
1121
1122 bb_var -= bs * (bs * rpt);
1123 ba_cov -= as * (bs * rpt);
1124
1125 aa_var -= as * (as * rpt);
1126
1127 // These will give a NaN if a channel is constant - these are fixed up in the next step
1128 rg_cov *= astc::rsqrt(rr_var * gg_var);
1129 rb_cov *= astc::rsqrt(rr_var * bb_var);
1130 ra_cov *= astc::rsqrt(rr_var * aa_var);
1131 gb_cov *= astc::rsqrt(gg_var * bb_var);
1132 ga_cov *= astc::rsqrt(gg_var * aa_var);
1133 ba_cov *= astc::rsqrt(bb_var * aa_var);
1134
1135 if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1136 if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1137 if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1138 if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1139 if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1140 if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1141
1142 float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
1143 lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
1144 lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
1145 lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
1146 lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov));
1147
1148 // Diagnostic trace points
1149 trace_add_data("min_r", blk.data_min.lane<0>());
1150 trace_add_data("max_r", blk.data_max.lane<0>());
1151 trace_add_data("min_g", blk.data_min.lane<1>());
1152 trace_add_data("max_g", blk.data_max.lane<1>());
1153 trace_add_data("min_b", blk.data_min.lane<2>());
1154 trace_add_data("max_b", blk.data_max.lane<2>());
1155 trace_add_data("min_a", blk.data_min.lane<3>());
1156 trace_add_data("max_a", blk.data_max.lane<3>());
1157 trace_add_data("cov_rg", fabsf(rg_cov));
1158 trace_add_data("cov_rb", fabsf(rb_cov));
1159 trace_add_data("cov_ra", fabsf(ra_cov));
1160 trace_add_data("cov_gb", fabsf(gb_cov));
1161 trace_add_data("cov_ga", fabsf(ga_cov));
1162 trace_add_data("cov_ba", fabsf(ba_cov));
1163
1164 return lowest_correlation;
1165 }
1166
1167 /* See header for documentation. */
compress_block(const astcenc_contexti & ctx,const image_block & blk,physical_compressed_block & pcb,compression_working_buffers & tmpbuf)1168 void compress_block(
1169 const astcenc_contexti& ctx,
1170 const image_block& blk,
1171 physical_compressed_block& pcb,
1172 compression_working_buffers& tmpbuf)
1173 {
1174 astcenc_profile decode_mode = ctx.config.profile;
1175 symbolic_compressed_block scb;
1176 const block_size_descriptor& bsd = *ctx.bsd;
1177 float lowest_correl;
1178
1179 TRACE_NODE(node0, "block");
1180 trace_add_data("pos_x", blk.xpos);
1181 trace_add_data("pos_y", blk.ypos);
1182 trace_add_data("pos_z", blk.zpos);
1183
1184 // Set stricter block targets for luminance data as we have more bits to play with
1185 bool block_is_l = blk.is_luminance();
1186 float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1187
1188 // Set slightly stricter block targets for lumalpha data as we have more bits to play with
1189 bool block_is_la = blk.is_luminancealpha();
1190 float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1191
1192 bool block_skip_two_plane = false;
1193 int max_partitions = ctx.config.tune_partition_count_limit;
1194
1195 unsigned int requested_partition_indices[3] {
1196 ctx.config.tune_2partition_index_limit,
1197 ctx.config.tune_3partition_index_limit,
1198 ctx.config.tune_4partition_index_limit
1199 };
1200
1201 unsigned int requested_partition_trials[3] {
1202 ctx.config.tune_2partitioning_candidate_limit,
1203 ctx.config.tune_3partitioning_candidate_limit,
1204 ctx.config.tune_4partitioning_candidate_limit
1205 };
1206
1207 #if defined(ASTCENC_DIAGNOSTICS)
1208 // Do this early in diagnostic builds so we can dump uniform metrics
1209 // for every block. Do it later in release builds to avoid redundant work!
1210 float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1211 float error_threshold = ctx.config.tune_db_limit
1212 * error_weight_sum
1213 * block_is_l_scale
1214 * block_is_la_scale;
1215
1216 lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1217 trace_add_data("lowest_correl", lowest_correl);
1218 trace_add_data("tune_error_threshold", error_threshold);
1219 #endif
1220
1221 // Detected a constant-color block
1222 if (all(blk.data_min == blk.data_max))
1223 {
1224 TRACE_NODE(node1, "pass");
1225 trace_add_data("partition_count", 0);
1226 trace_add_data("plane_count", 1);
1227
1228 scb.partition_count = 0;
1229
1230 // Encode as FP16 if using HDR
1231 if ((decode_mode == ASTCENC_PRF_HDR) ||
1232 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1233 {
1234 scb.block_type = SYM_BTYPE_CONST_F16;
1235 vint4 color_f16 = float_to_float16(blk.origin_texel);
1236 store(color_f16, scb.constant_color);
1237 }
1238 // Encode as UNORM16 if NOT using HDR
1239 else
1240 {
1241 scb.block_type = SYM_BTYPE_CONST_U16;
1242 vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1243 vint4 color_u16 = float_to_int_rtn(color_f32);
1244 store(color_u16, scb.constant_color);
1245 }
1246
1247 trace_add_data("exit", "quality hit");
1248
1249 symbolic_to_physical(bsd, scb, pcb);
1250 return;
1251 }
1252
1253 #if !defined(ASTCENC_DIAGNOSTICS)
1254 float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1255 float error_threshold = ctx.config.tune_db_limit
1256 * error_weight_sum
1257 * block_is_l_scale
1258 * block_is_la_scale;
1259 #endif
1260
1261 // Set SCB and mode errors to a very high error value
1262 scb.errorval = ERROR_CALC_DEFAULT;
1263 scb.block_type = SYM_BTYPE_ERROR;
1264
1265 float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1266 ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1267 };
1268
1269 float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1270 0.0f,
1271 ctx.config.tune_2_partition_early_out_limit_factor,
1272 ctx.config.tune_3_partition_early_out_limit_factor,
1273 0.0f
1274 };
1275
1276 // Trial using 1 plane of weights and 1 partition.
1277
1278 // Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1279 // mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1280 // optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1281 // compression and slightly reduces image quality.
1282
1283 float errorval_mult[2] {
1284 1.0f / ctx.config.tune_mode0_mse_overshoot,
1285 1.0f
1286 };
1287
1288 static const float errorval_overshoot = 1.0f / ctx.config.tune_refinement_mse_overshoot;
1289
1290 // Only enable MODE0 fast path (trial 0) if 2D and more than 25 texels
1291 int start_trial = 1;
1292 if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
1293 {
1294 start_trial = 0;
1295 }
1296
1297 int quant_limit = QUANT_32;
1298 for (int i = start_trial; i < 2; i++)
1299 {
1300 TRACE_NODE(node1, "pass");
1301 trace_add_data("partition_count", 1);
1302 trace_add_data("plane_count", 1);
1303 trace_add_data("search_mode", i);
1304
1305 float errorval = compress_symbolic_block_for_partition_1plane(
1306 ctx.config, bsd, blk, i == 0,
1307 error_threshold * errorval_mult[i] * errorval_overshoot,
1308 1, 0, scb, tmpbuf, QUANT_32);
1309
1310 // Record the quant level so we can use the filter later searches
1311 const auto& bm = bsd.get_block_mode(scb.block_mode);
1312 quant_limit = bm.get_weight_quant_mode();
1313
1314 best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1315 if (errorval < (error_threshold * errorval_mult[i]))
1316 {
1317 trace_add_data("exit", "quality hit");
1318 goto END_OF_TESTS;
1319 }
1320 }
1321
1322 #if !defined(ASTCENC_DIAGNOSTICS)
1323 lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1324 #endif
1325
1326 block_skip_two_plane = lowest_correl > ctx.config.tune_2_plane_early_out_limit_correlation;
1327
1328 // Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1329 // alpha is the most likely to be non-correlated if it is present in the data.
1330 for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1331 {
1332 TRACE_NODE(node1, "pass");
1333 trace_add_data("partition_count", 1);
1334 trace_add_data("plane_count", 2);
1335 trace_add_data("plane_component", i);
1336
1337 if (block_skip_two_plane)
1338 {
1339 trace_add_data("skip", "tune_2_plane_early_out_limit_correlation");
1340 continue;
1341 }
1342
1343 if (blk.grayscale && i != 3)
1344 {
1345 trace_add_data("skip", "grayscale block");
1346 continue;
1347 }
1348
1349 if (blk.is_constant_channel(i))
1350 {
1351 trace_add_data("skip", "constant component");
1352 continue;
1353 }
1354
1355 float errorval = compress_symbolic_block_for_partition_2planes(
1356 ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1357 i, scb, tmpbuf, quant_limit);
1358
1359 // If attempting two planes is much worse than the best one plane result
1360 // then further two plane searches are unlikely to help so move on ...
1361 if (errorval > (best_errorvals_for_pcount[0] * 2.0f))
1362 {
1363 break;
1364 }
1365
1366 if (errorval < error_threshold)
1367 {
1368 trace_add_data("exit", "quality hit");
1369 goto END_OF_TESTS;
1370 }
1371 }
1372
1373 // Find best blocks for 2, 3 and 4 partitions
1374 for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1375 {
1376 unsigned int partition_indices[TUNE_MAX_PARTITIIONING_CANDIDATES];
1377
1378 unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1379
1380 unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1381 requested_trials = astc::min(requested_trials, requested_indices);
1382
1383 unsigned int actual_trials = find_best_partition_candidates(
1384 bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1385
1386 float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1387
1388 for (unsigned int i = 0; i < actual_trials; i++)
1389 {
1390 TRACE_NODE(node1, "pass");
1391 trace_add_data("partition_count", partition_count);
1392 trace_add_data("partition_index", partition_indices[i]);
1393 trace_add_data("plane_count", 1);
1394 trace_add_data("search_mode", i);
1395
1396 float errorval = compress_symbolic_block_for_partition_1plane(
1397 ctx.config, bsd, blk, false,
1398 error_threshold * errorval_overshoot,
1399 partition_count, partition_indices[i],
1400 scb, tmpbuf, quant_limit);
1401
1402 best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1403
1404 // If using N partitions doesn't improve much over using N-1 partitions then skip trying
1405 // N+1. Error can dramatically improve if the data is correlated or non-correlated and
1406 // aligns with a partitioning that suits that encoding, so for this inner loop check add
1407 // a large error scale because the "other" trial could be a lot better. In total the
1408 // error must be at least 2x worse than the best existing error to early-out.
1409 float best_error = best_errorvals_for_pcount[partition_count - 1];
1410 float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 2.0f;
1411 if (best_error > (best_error_in_prev * best_error_scale))
1412 {
1413 trace_add_data("skip", "tune_partition_early_out_limit_factor");
1414 goto END_OF_TESTS;
1415 }
1416
1417 if (errorval < error_threshold)
1418 {
1419 trace_add_data("exit", "quality hit");
1420 goto END_OF_TESTS;
1421 }
1422 }
1423
1424 // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1425 float best_error = best_errorvals_for_pcount[partition_count - 1];
1426 float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1427 if (best_error > (best_error_in_prev * best_error_scale))
1428 {
1429 trace_add_data("skip", "tune_partition_early_out_limit_factor");
1430 goto END_OF_TESTS;
1431 }
1432 }
1433
1434 trace_add_data("exit", "quality not hit");
1435
1436 END_OF_TESTS:
1437 // If we still have an error block then convert to something we can encode
1438 // TODO: Do something more sensible here, such as average color block
1439 if (scb.block_type == SYM_BTYPE_ERROR)
1440 {
1441 #if defined(ASTCENC_DIAGNOSTICS)
1442 static bool printed_once = false;
1443 if (!printed_once)
1444 {
1445 printed_once = true;
1446 printf("WARN: At least one block failed to find a valid encoding.\n"
1447 " Try increasing compression quality settings.\n\n");
1448 }
1449 #endif
1450
1451 scb.block_type = SYM_BTYPE_CONST_U16;
1452 vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1453 vint4 color_u16 = float_to_int_rtn(color_f32);
1454 store(color_u16, scb.constant_color);
1455 }
1456
1457 // Compress to a physical block
1458 symbolic_to_physical(bsd, scb, pcb);
1459 }
1460
1461 #endif
1462