// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal ([email protected])

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "src/dsp/quant.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"

#define DO_TRELLIS_I4 1
#define DO_TRELLIS_I16 1  // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV 0   // disable trellis for UV: risky, and not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64   // neutral value for susceptibility
#define MIN_ALPHA 30   // lowest usable value for susceptibility
#define MAX_ALPHA 100  // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9  // Scaling constant between the sns value and the QP
                       // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0  // I16 mode (special case)
#define FLATNESS_LIMIT_I4 3   // I4 mode
#define FLATNESS_LIMIT_UV 2   // UV mode
#define FLATNESS_PENALTY 140  // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
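
// Note: MULT_8B() is a rounding fixed-point multiply, treating 'b' as a
// fraction of 256. Quick worked examples (illustrative only):
//   MULT_8B(100, 128) == (100 * 128 + 128) >> 8 == 50   // i.e. b ~ 0.5
//   MULT_8B(100,  64) == (100 *  64 + 128) >> 8 == 25   // i.e. b ~ 0.25
// It is used below to scale the TDisto term by 'tlambda'.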

#define RD_DISTO_MULT 256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");  // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif  // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
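
// For instance, kZigzag[2] == 4: the third coefficient in zigzag scan order
// sits at raster position 4, i.e. row 1, column 0 of the 4x4 block. The
// trellis code below walks coefficients in this order, with n = scan position
// and j = kZigzag[n] = raster index.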

static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}
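
// Worked example (illustrative only), assuming QFIX == 17,
// BIAS(b) == (b) << (QFIX - 8) and QUANTDIV(n, iQ, B) == (n * iQ + B) >> QFIX
// (see vp8i_enc.h):
//   q_[0] = 8        ->  iq_[0] = 131072 / 8 = 16384
//   bias byte = 96   ->  bias_[0] = 96 << 9 = 49152
//   zthresh_[0] = (131071 - 49152) / 16384 = 4
// so QUANTDIV(4, 16384, 49152) == 114688 >> 17 == 0 (quantized to zero),
// while QUANTDIV(5, 16384, 49152) == 131072 >> 17 == 1, matching the
// zthresh_ definition above.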

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4 = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_ = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_ = (3 * q_i16 * q_i16);
    m->lambda_uv_ = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_ = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_ = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_ = (q_uv * q_uv) << 1;
    m->tlambda_ = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];  // quantization-aware min disto
    m->max_edge_ = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}
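
// Note on the lambda constants above: the quantization error of a coefficient
// is roughly uniform in [-q/2, q/2], so distortion grows as ~q^2. Scaling each
// lambda as ~q^2 therefore keeps the rate and distortion terms of the RD
// scores computed below balanced across the whole quantizer range; the
// individual multipliers and shifts are empirical tunings.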

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly, using a piece-wise linear function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
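
// Worked example (illustrative only): for quality 75, c = 0.75, so
// linear_c = 2. * 0.75 - 1. = 0.5 and v = 0.5^(1/3) ~= 0.794. Note that the
// two linear pieces agree at the breakpoint (0.75 * (2/3) == 2 * 0.75 - 1),
// so the mapping is continuous at c = 0.75.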

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {  // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetic).
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are much more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);  // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;  // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);  // initialize the segments' filtering, if needed

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);  // finalize quantization matrices
}
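
// Worked example (illustrative only), assuming emulate_jpeg_size is off and a
// segment with alpha_ == 0 (so expn == 1. and c == c_base): for quality == 75,
// c_base = QualityToCompression(0.75) ~= 0.794, and the segment quantizer
// becomes q = (int)(127. * (1. - 0.794)) == 26, which then indexes
// kDcTable[]/kAcTable[] in SetupMatrices() above.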

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const uint16_t VP8Scan[16] = {  // Luma
  0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
  0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
  0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
  0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS,    // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS   // V
};
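
// For instance, luma sub-block #5 (second row, second column in the layout
// above) starts at pixel (4, 4), hence VP8Scan[5] == 4 + 4 * BPS: the scan
// tables turn a 4x4 block index into a byte offset inside the BPS-strided
// yuv_in_/yuv_out_ caches.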

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12, 8,
  11, 10, 8, 6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D = 0;
  rd->SD = 0;
  rd->R = 0;
  rd->H = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
                      const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D = src->D;
  dst->SD = src->SD;
  dst->R = src->R;
  dst->H = src->H;
  dst->nz = src->nz;  // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
                     const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D += src->D;
  dst->SD += src->SD;
  dst->R += src->R;
  dst->H += src->H;
  dst->nz |= src->nz;  // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;    // best previous node
  int8_t sign;    // sign of coeff_i
  int16_t level;  // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities in the range [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values, though.
#define MIN_DELTA 0  // how much lower level to try
#define MAX_DELTA 1  // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
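
// With the current MIN_DELTA/MAX_DELTA settings, NUM_NODES == 2: for each
// scan position n the trellis keeps two candidate levels, level0 and
// level0 + 1, where level0 is the neutrally-biased quantization computed
// below. NODE(n, m) and SCORE_STATE(n, m) simply shift the 'm' delta into
// the [0, NUM_NODES) array range.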

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}

// Coefficient type.
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };

static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};  // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);  // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {  // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error: the change in distortion from coding this
        // level instead of dropping the coefficient entirely (coding zero).
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // Since base_score is common to all candidate predecessors, it is added
      // only once, to the final value after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;          // best eob position
          best_path[1] = m;          // best node index
          best_path[2] = best_prev;  // best predecessor
        }
      }
    }
  }

  // Fresh start
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;  // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];  // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
            dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid the 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7  // fraction of error sent to the 4x4 block below
#define C2 8  // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1  // storage descaling, needed to make the error fit int8_t

// Quantize as usual, but also compute and return the quantization error.
// Error is already divided by DSHIFT.
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
  if (V > (int)mtx->zthresh_[0]) {
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}
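
// Worked example (illustrative only), reusing the ExpandMatrix() numbers
// above (q_[0] = 8, iq_[0] = 16384, bias_[0] = 49152, zthresh_[0] = 4):
// for *v == 20, QUANTDIV(20, 16384, 49152) == 2, so qV == 16; the stored
// value becomes 16 and the returned error is (20 - 16) >> DSCALE == 2.
// CorrectDCValues() below feeds this error forward into neighboring blocks.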

static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->  err0 err1
  // left[1] | tmp[2]   tmp[3]       err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE >= 1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
    int8_t* const left = it->left_derr_[ch];
    left[0] = rd->derr[ch][0];            // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    top[0] = rd->derr[ch][1];             //     ... err2
    top[1] = rd->derr[ch][2] - left[1];   //     ... 1/4th of err3.
  }
}
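
// For instance (illustrative only), with err3 == 10 the split above gives
// left[1] = (3 * 10) >> 2 == 7 and top[1] = 10 - 7 == 3. Since top[1] is
// computed as the remainder, the two shares always sum back to err3 exactly:
// no error is lost when CorrectDCValues() picks the halves up again.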

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
              dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost, where
// RD-cost = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to estimate the average
  // delta between the sub-4x4 blocks.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);  // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// Returns the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;  // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add a flatness penalty, to avoid a flat area being mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;  // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;  // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;  // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {  // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {  // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as we go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {  // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just
    // go ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}
