xref: /aosp_15_r20/external/webp/src/enc/quant_enc.c (revision b2055c353e87c8814eb2b6b1b11112a1562253bd)
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 //   Quantization
11 //
12 // Author: Skal ([email protected])
13 
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h>  // for abs()
17 
18 #include "src/dsp/quant.h"
19 #include "src/enc/vp8i_enc.h"
20 #include "src/enc/cost_enc.h"
21 
22 #define DO_TRELLIS_I4  1
23 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
24 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
25 #define USE_TDISTO 1
26 
27 #define MID_ALPHA 64      // neutral value for susceptibility
28 #define MIN_ALPHA 30      // lowest usable value for susceptibility
29 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
30 
31 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
32                           // power-law modulation. Must be strictly less than 1.
33 
34 // number of non-zero coeffs below which we consider the block very flat
35 // (and apply a penalty to complex predictions)
36 #define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
37 #define FLATNESS_LIMIT_I4  3       // I4 mode
38 #define FLATNESS_LIMIT_UV  2       // UV mode
39 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
40 
41 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
42 
43 #define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
44 
45 // #define DEBUG_BLOCK
46 
47 //------------------------------------------------------------------------------
48 
49 #if defined(DEBUG_BLOCK)
50 
51 #include <stdio.h>
52 #include <stdlib.h>
53 
PrintBlockInfo(const VP8EncIterator * const it,const VP8ModeScore * const rd)54 static void PrintBlockInfo(const VP8EncIterator* const it,
55                            const VP8ModeScore* const rd) {
56   int i, j;
57   const int is_i16 = (it->mb_->type_ == 1);
58   const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
59   const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
60   const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
61   const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
62   printf("SOURCE / OUTPUT / ABS DELTA\n");
63   for (j = 0; j < 16; ++j) {
64     for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
65     printf("     ");
66     for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
67     printf("     ");
68     for (i = 0; i < 16; ++i) {
69       printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
70     }
71     printf("\n");
72   }
73   printf("\n");   // newline before the U/V block
74   for (j = 0; j < 8; ++j) {
75     for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
76     printf(" ");
77     for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
78     printf("    ");
79     for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
80     printf(" ");
81     for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
82     printf("   ");
83     for (i = 0; i < 8; ++i) {
84       printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
85     }
86     printf(" ");
87     for (i = 8; i < 16; ++i) {
88       printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
89     }
90     printf("\n");
91   }
92   printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
93     (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
94     (int)rd->score);
95   if (is_i16) {
96     printf("Mode: %d\n", rd->mode_i16);
97     printf("y_dc_levels:");
98     for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
99     printf("\n");
100   } else {
101     printf("Modes[16]: ");
102     for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
103     printf("\n");
104   }
105   printf("y_ac_levels:\n");
106   for (j = 0; j < 16; ++j) {
107     for (i = is_i16 ? 1 : 0; i < 16; ++i) {
108       printf("%4d ", rd->y_ac_levels[j][i]);
109     }
110     printf("\n");
111   }
112   printf("\n");
113   printf("uv_levels (mode=%d):\n", rd->mode_uv);
114   for (j = 0; j < 8; ++j) {
115     for (i = 0; i < 16; ++i) {
116       printf("%4d ", rd->uv_levels[j][i]);
117     }
118     printf("\n");
119   }
120 }
121 
122 #endif   // DEBUG_BLOCK
123 
124 //------------------------------------------------------------------------------
125 
clip(int v,int m,int M)126 static WEBP_INLINE int clip(int v, int m, int M) {
127   return v < m ? m : v > M ? M : v;
128 }
129 
130 static const uint8_t kZigzag[16] = {
131   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
132 };
133 
134 static const uint8_t kDcTable[128] = {
135   4,     5,   6,   7,   8,   9,  10,  10,
136   11,   12,  13,  14,  15,  16,  17,  17,
137   18,   19,  20,  20,  21,  21,  22,  22,
138   23,   23,  24,  25,  25,  26,  27,  28,
139   29,   30,  31,  32,  33,  34,  35,  36,
140   37,   37,  38,  39,  40,  41,  42,  43,
141   44,   45,  46,  46,  47,  48,  49,  50,
142   51,   52,  53,  54,  55,  56,  57,  58,
143   59,   60,  61,  62,  63,  64,  65,  66,
144   67,   68,  69,  70,  71,  72,  73,  74,
145   75,   76,  76,  77,  78,  79,  80,  81,
146   82,   83,  84,  85,  86,  87,  88,  89,
147   91,   93,  95,  96,  98, 100, 101, 102,
148   104, 106, 108, 110, 112, 114, 116, 118,
149   122, 124, 126, 128, 130, 132, 134, 136,
150   138, 140, 143, 145, 148, 151, 154, 157
151 };
152 
153 static const uint16_t kAcTable[128] = {
154   4,     5,   6,   7,   8,   9,  10,  11,
155   12,   13,  14,  15,  16,  17,  18,  19,
156   20,   21,  22,  23,  24,  25,  26,  27,
157   28,   29,  30,  31,  32,  33,  34,  35,
158   36,   37,  38,  39,  40,  41,  42,  43,
159   44,   45,  46,  47,  48,  49,  50,  51,
160   52,   53,  54,  55,  56,  57,  58,  60,
161   62,   64,  66,  68,  70,  72,  74,  76,
162   78,   80,  82,  84,  86,  88,  90,  92,
163   94,   96,  98, 100, 102, 104, 106, 108,
164   110, 112, 114, 116, 119, 122, 125, 128,
165   131, 134, 137, 140, 143, 146, 149, 152,
166   155, 158, 161, 164, 167, 170, 173, 177,
167   181, 185, 189, 193, 197, 201, 205, 209,
168   213, 217, 221, 225, 229, 234, 239, 245,
169   249, 254, 259, 264, 269, 274, 279, 284
170 };
171 
172 static const uint16_t kAcTable2[128] = {
173   8,     8,   9,  10,  12,  13,  15,  17,
174   18,   20,  21,  23,  24,  26,  27,  29,
175   31,   32,  34,  35,  37,  38,  40,  41,
176   43,   44,  46,  48,  49,  51,  52,  54,
177   55,   57,  58,  60,  62,  63,  65,  66,
178   68,   69,  71,  72,  74,  75,  77,  79,
179   80,   82,  83,  85,  86,  88,  89,  93,
180   96,   99, 102, 105, 108, 111, 114, 117,
181   120, 124, 127, 130, 133, 136, 139, 142,
182   145, 148, 151, 155, 158, 161, 164, 167,
183   170, 173, 176, 179, 184, 189, 193, 198,
184   203, 207, 212, 217, 221, 226, 230, 235,
185   240, 244, 249, 254, 258, 263, 268, 274,
186   280, 286, 292, 299, 305, 311, 317, 323,
187   330, 336, 342, 348, 354, 362, 370, 379,
188   385, 393, 401, 409, 416, 424, 432, 440
189 };
190 
191 static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
192   { 96, 110 }, { 96, 108 }, { 110, 115 }
193 };
194 
195 // Sharpening by (slightly) raising the hi-frequency coeffs.
196 // Hack-ish but helpful for mid-bitrate range. Use with care.
197 #define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
198 static const uint8_t kFreqSharpening[16] = {
199   0,  30, 60, 90,
200   30, 60, 90, 90,
201   60, 90, 90, 90,
202   90, 90, 90, 90
203 };
204 
205 //------------------------------------------------------------------------------
206 // Initialize quantization parameters in VP8Matrix
207 
208 // Returns the average quantizer
ExpandMatrix(VP8Matrix * const m,int type)209 static int ExpandMatrix(VP8Matrix* const m, int type) {
210   int i, sum;
211   for (i = 0; i < 2; ++i) {
212     const int is_ac_coeff = (i > 0);
213     const int bias = kBiasMatrices[type][is_ac_coeff];
214     m->iq_[i] = (1 << QFIX) / m->q_[i];
215     m->bias_[i] = BIAS(bias);
216     // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
217     //   * zero if coeff <= zthresh
218     //   * non-zero if coeff > zthresh
219     m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
220   }
221   for (i = 2; i < 16; ++i) {
222     m->q_[i] = m->q_[1];
223     m->iq_[i] = m->iq_[1];
224     m->bias_[i] = m->bias_[1];
225     m->zthresh_[i] = m->zthresh_[1];
226   }
227   for (sum = 0, i = 0; i < 16; ++i) {
228     if (type == 0) {  // we only use sharpening for AC luma coeffs
229       m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
230     } else {
231       m->sharpen_[i] = 0;
232     }
233     sum += m->q_[i];
234   }
235   return (sum + 8) >> 4;
236 }
237 
CheckLambdaValue(int * const v)238 static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
239 
SetupMatrices(VP8Encoder * enc)240 static void SetupMatrices(VP8Encoder* enc) {
241   int i;
242   const int tlambda_scale =
243     (enc->method_ >= 4) ? enc->config_->sns_strength
244                         : 0;
245   const int num_segments = enc->segment_hdr_.num_segments_;
246   for (i = 0; i < num_segments; ++i) {
247     VP8SegmentInfo* const m = &enc->dqm_[i];
248     const int q = m->quant_;
249     int q_i4, q_i16, q_uv;
250     m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
251     m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
252 
253     m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
254     m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
255 
256     m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
257     m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
258 
259     q_i4  = ExpandMatrix(&m->y1_, 0);
260     q_i16 = ExpandMatrix(&m->y2_, 1);
261     q_uv  = ExpandMatrix(&m->uv_, 2);
262 
263     m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
264     m->lambda_i16_         = (3 * q_i16 * q_i16);
265     m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
266     m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
267     m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
268     m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
269     m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
270     m->tlambda_            = (tlambda_scale * q_i4) >> 5;
271 
272     // none of these constants should be < 1
273     CheckLambdaValue(&m->lambda_i4_);
274     CheckLambdaValue(&m->lambda_i16_);
275     CheckLambdaValue(&m->lambda_uv_);
276     CheckLambdaValue(&m->lambda_mode_);
277     CheckLambdaValue(&m->lambda_trellis_i4_);
278     CheckLambdaValue(&m->lambda_trellis_i16_);
279     CheckLambdaValue(&m->lambda_trellis_uv_);
280     CheckLambdaValue(&m->tlambda_);
281 
282     m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
283     m->max_edge_  = 0;
284 
285     m->i4_penalty_ = 1000 * q_i4 * q_i4;
286   }
287 }
288 
289 //------------------------------------------------------------------------------
290 // Initialize filtering parameters
291 
292 // Very small filter-strength values have close to no visual effect. So we can
293 // save a little decoding-CPU by turning filtering off for these.
294 #define FSTRENGTH_CUTOFF 2
295 
SetupFilterStrength(VP8Encoder * const enc)296 static void SetupFilterStrength(VP8Encoder* const enc) {
297   int i;
298   // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
299   const int level0 = 5 * enc->config_->filter_strength;
300   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
301     VP8SegmentInfo* const m = &enc->dqm_[i];
302     // We focus on the quantization of AC coeffs.
303     const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
304     const int base_strength =
305         VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
306     // Segments with lower complexity ('beta') will be less filtered.
307     const int f = base_strength * level0 / (256 + m->beta_);
308     m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
309   }
310   // We record the initial strength (mainly for the case of 1-segment only).
311   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
312   enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
313   enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
314 }
315 
316 //------------------------------------------------------------------------------
317 
318 // Note: if you change the values below, remember that the max range
319 // allowed by the syntax for DQ_UV is [-16,16].
320 #define MAX_DQ_UV (6)
321 #define MIN_DQ_UV (-4)
322 
323 // We want to emulate jpeg-like behaviour where the expected "good" quality
324 // is around q=75. Internally, our "good" middle is around c=50. So we
325 // map accordingly using linear piece-wise function
QualityToCompression(double c)326 static double QualityToCompression(double c) {
327   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
328   // The file size roughly scales as pow(quantizer, 3.). Actually, the
329   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
330   // in the mid-quant range. So we scale the compressibility inversely to
331   // this power-law: quant ~= compression ^ 1/3. This law holds well for
332   // low quant. Finer modeling for high-quant would make use of kAcTable[]
333   // more explicitly.
334   const double v = pow(linear_c, 1 / 3.);
335   return v;
336 }
337 
QualityToJPEGCompression(double c,double alpha)338 static double QualityToJPEGCompression(double c, double alpha) {
339   // We map the complexity 'alpha' and quality setting 'c' to a compression
340   // exponent empirically matched to the compression curve of libjpeg6b.
341   // On average, the WebP output size will be roughly similar to that of a
342   // JPEG file compressed with same quality factor.
343   const double amin = 0.30;
344   const double amax = 0.85;
345   const double exp_min = 0.4;
346   const double exp_max = 0.9;
347   const double slope = (exp_min - exp_max) / (amax - amin);
348   // Linearly interpolate 'expn' from exp_min to exp_max
349   // in the [amin, amax] range.
350   const double expn = (alpha > amax) ? exp_min
351                     : (alpha < amin) ? exp_max
352                     : exp_max + slope * (alpha - amin);
353   const double v = pow(c, expn);
354   return v;
355 }
356 
SegmentsAreEquivalent(const VP8SegmentInfo * const S1,const VP8SegmentInfo * const S2)357 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
358                                  const VP8SegmentInfo* const S2) {
359   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
360 }
361 
SimplifySegments(VP8Encoder * const enc)362 static void SimplifySegments(VP8Encoder* const enc) {
363   int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
364   // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
365   // explicit check is needed to avoid a spurious warning about 'i' exceeding
366   // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
367   const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
368                                ? enc->segment_hdr_.num_segments_
369                                : NUM_MB_SEGMENTS;
370   int num_final_segments = 1;
371   int s1, s2;
372   for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
373     const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
374     int found = 0;
375     // check if we already have similar segment
376     for (s2 = 0; s2 < num_final_segments; ++s2) {
377       const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
378       if (SegmentsAreEquivalent(S1, S2)) {
379         found = 1;
380         break;
381       }
382     }
383     map[s1] = s2;
384     if (!found) {
385       if (num_final_segments != s1) {
386         enc->dqm_[num_final_segments] = enc->dqm_[s1];
387       }
388       ++num_final_segments;
389     }
390   }
391   if (num_final_segments < num_segments) {  // Remap
392     int i = enc->mb_w_ * enc->mb_h_;
393     while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
394     enc->segment_hdr_.num_segments_ = num_final_segments;
395     // Replicate the trailing segment infos (it's mostly cosmetics)
396     for (i = num_final_segments; i < num_segments; ++i) {
397       enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
398     }
399   }
400 }
401 
VP8SetSegmentParams(VP8Encoder * const enc,float quality)402 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
403   int i;
404   int dq_uv_ac, dq_uv_dc;
405   const int num_segments = enc->segment_hdr_.num_segments_;
406   const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
407   const double Q = quality / 100.;
408   const double c_base = enc->config_->emulate_jpeg_size ?
409       QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
410       QualityToCompression(Q);
411   for (i = 0; i < num_segments; ++i) {
412     // We modulate the base coefficient to accommodate for the quantization
413     // susceptibility and allow denser segments to be quantized more.
414     const double expn = 1. - amp * enc->dqm_[i].alpha_;
415     const double c = pow(c_base, expn);
416     const int q = (int)(127. * (1. - c));
417     assert(expn > 0.);
418     enc->dqm_[i].quant_ = clip(q, 0, 127);
419   }
420 
421   // purely indicative in the bitstream (except for the 1-segment case)
422   enc->base_quant_ = enc->dqm_[0].quant_;
423 
424   // fill-in values for the unused segments (required by the syntax)
425   for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
426     enc->dqm_[i].quant_ = enc->base_quant_;
427   }
428 
429   // uv_alpha_ is normally spread around ~60. The useful range is
430   // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
431   // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
432   dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
433                                           / (MAX_ALPHA - MIN_ALPHA);
434   // we rescale by the user-defined strength of adaptation
435   dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
436   // and make it safe.
437   dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
438   // We also boost the dc-uv-quant a little, based on sns-strength, since
439   // U/V channels are quite more reactive to high quants (flat DC-blocks
440   // tend to appear, and are unpleasant).
441   dq_uv_dc = -4 * enc->config_->sns_strength / 100;
442   dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
443 
444   enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
445   enc->dq_y2_dc_ = 0;
446   enc->dq_y2_ac_ = 0;
447   enc->dq_uv_dc_ = dq_uv_dc;
448   enc->dq_uv_ac_ = dq_uv_ac;
449 
450   SetupFilterStrength(enc);   // initialize segments' filtering, eventually
451 
452   if (num_segments > 1) SimplifySegments(enc);
453 
454   SetupMatrices(enc);         // finalize quantization matrices
455 }
456 
457 //------------------------------------------------------------------------------
458 // Form the predictions in cache
459 
460 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
461 const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
462 const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
463 
464 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
465 const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
466   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
467 };
468 
VP8MakeLuma16Preds(const VP8EncIterator * const it)469 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
470   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
471   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
472   VP8EncPredLuma16(it->yuv_p_, left, top);
473 }
474 
VP8MakeChroma8Preds(const VP8EncIterator * const it)475 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
476   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
477   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
478   VP8EncPredChroma8(it->yuv_p_, left, top);
479 }
480 
VP8MakeIntra4Preds(const VP8EncIterator * const it)481 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
482   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
483 }
484 
485 //------------------------------------------------------------------------------
486 // Quantize
487 
488 // Layout:
489 // +----+----+
490 // |YYYY|UUVV| 0
491 // |YYYY|UUVV| 4
492 // |YYYY|....| 8
493 // |YYYY|....| 12
494 // +----+----+
495 
496 const uint16_t VP8Scan[16] = {  // Luma
497   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
498   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
499   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
500   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
501 };
502 
503 static const uint16_t VP8ScanUV[4 + 4] = {
504   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
505   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
506 };
507 
508 //------------------------------------------------------------------------------
509 // Distortion measurement
510 
511 static const uint16_t kWeightY[16] = {
512   38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
513 };
514 
515 static const uint16_t kWeightTrellis[16] = {
516 #if USE_TDISTO == 0
517   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
518 #else
519   30, 27, 19, 11,
520   27, 24, 17, 10,
521   19, 17, 12,  8,
522   11, 10,  8,  6
523 #endif
524 };
525 
526 // Init/Copy the common fields in score.
InitScore(VP8ModeScore * const rd)527 static void InitScore(VP8ModeScore* const rd) {
528   rd->D  = 0;
529   rd->SD = 0;
530   rd->R  = 0;
531   rd->H  = 0;
532   rd->nz = 0;
533   rd->score = MAX_COST;
534 }
535 
CopyScore(VP8ModeScore * WEBP_RESTRICT const dst,const VP8ModeScore * WEBP_RESTRICT const src)536 static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
537                       const VP8ModeScore* WEBP_RESTRICT const src) {
538   dst->D  = src->D;
539   dst->SD = src->SD;
540   dst->R  = src->R;
541   dst->H  = src->H;
542   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
543   dst->score = src->score;
544 }
545 
AddScore(VP8ModeScore * WEBP_RESTRICT const dst,const VP8ModeScore * WEBP_RESTRICT const src)546 static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
547                      const VP8ModeScore* WEBP_RESTRICT const src) {
548   dst->D  += src->D;
549   dst->SD += src->SD;
550   dst->R  += src->R;
551   dst->H  += src->H;
552   dst->nz |= src->nz;     // here, new nz bits are accumulated.
553   dst->score += src->score;
554 }
555 
556 //------------------------------------------------------------------------------
557 // Performs trellis-optimized quantization.
558 
559 // Trellis node
560 typedef struct {
561   int8_t prev;            // best previous node
562   int8_t sign;            // sign of coeff_i
563   int16_t level;          // level
564 } Node;
565 
566 // Score state
567 typedef struct {
568   score_t score;          // partial RD score
569   const uint16_t* costs;  // shortcut to cost tables
570 } ScoreState;
571 
572 // If a coefficient was quantized to a value Q (using a neutral bias),
573 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
574 // We don't test negative values though.
575 #define MIN_DELTA 0   // how much lower level to try
576 #define MAX_DELTA 1   // how much higher
577 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
578 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
579 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
580 
SetRDScore(int lambda,VP8ModeScore * const rd)581 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
582   rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
583 }
584 
RDScoreTrellis(int lambda,score_t rate,score_t distortion)585 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
586                                           score_t distortion) {
587   return rate * lambda + RD_DISTO_MULT * distortion;
588 }
589 
590 // Coefficient type.
591 enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
592 
TrellisQuantizeBlock(const VP8Encoder * WEBP_RESTRICT const enc,int16_t in[16],int16_t out[16],int ctx0,int coeff_type,const VP8Matrix * WEBP_RESTRICT const mtx,int lambda)593 static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
594                                 int16_t in[16], int16_t out[16],
595                                 int ctx0, int coeff_type,
596                                 const VP8Matrix* WEBP_RESTRICT const mtx,
597                                 int lambda) {
598   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
599   CostArrayPtr const costs =
600       (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
601   const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
602   Node nodes[16][NUM_NODES];
603   ScoreState score_states[2][NUM_NODES];
604   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
605   ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
606   int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
607   score_t best_score;
608   int n, m, p, last;
609 
610   {
611     score_t cost;
612     const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
613     const int last_proba = probas[VP8EncBands[first]][ctx0][0];
614 
615     // compute the position of the last interesting coefficient
616     last = first - 1;
617     for (n = 15; n >= first; --n) {
618       const int j = kZigzag[n];
619       const int err = in[j] * in[j];
620       if (err > thresh) {
621         last = n;
622         break;
623       }
624     }
625     // we don't need to go inspect up to n = 16 coeffs. We can just go up
626     // to last + 1 (inclusive) without losing much.
627     if (last < 15) ++last;
628 
629     // compute 'skip' score. This is the max score one can do.
630     cost = VP8BitCost(0, last_proba);
631     best_score = RDScoreTrellis(lambda, cost, 0);
632 
633     // initialize source node.
634     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
635       const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
636       ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
637       ss_cur[m].costs = costs[first][ctx0];
638     }
639   }
640 
641   // traverse trellis.
642   for (n = first; n <= last; ++n) {
643     const int j = kZigzag[n];
644     const uint32_t Q  = mtx->q_[j];
645     const uint32_t iQ = mtx->iq_[j];
646     const uint32_t B = BIAS(0x00);     // neutral bias
647     // note: it's important to take sign of the _original_ coeff,
648     // so we don't have to consider level < 0 afterward.
649     const int sign = (in[j] < 0);
650     const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
651     int level0 = QUANTDIV(coeff0, iQ, B);
652     int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
653     if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
654     if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
655 
656     {   // Swap current and previous score states
657       ScoreState* const tmp = ss_cur;
658       ss_cur = ss_prev;
659       ss_prev = tmp;
660     }
661 
662     // test all alternate level values around level0.
663     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
664       Node* const cur = &NODE(n, m);
665       const int level = level0 + m;
666       const int ctx = (level > 2) ? 2 : level;
667       const int band = VP8EncBands[n + 1];
668       score_t base_score;
669       score_t best_cur_score;
670       int best_prev;
671       score_t cost, score;
672 
673       ss_cur[m].costs = costs[n + 1][ctx];
674       if (level < 0 || level > thresh_level) {
675         ss_cur[m].score = MAX_COST;
676         // Node is dead.
677         continue;
678       }
679 
680       {
681         // Compute delta_error = how much coding this level will
682         // subtract to max_error as distortion.
683         // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
684         const int new_error = coeff0 - level * Q;
685         const int delta_error =
686             kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
687         base_score = RDScoreTrellis(lambda, 0, delta_error);
688       }
689 
690       // Inspect all possible non-dead predecessors. Retain only the best one.
691       // The base_score is added to all scores so it is only added for the final
692       // value after the loop.
693       cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
694       best_cur_score =
695           ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
696       best_prev = -MIN_DELTA;
697       for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
698         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
699         // eliminated since their score can't be better than the current best.
700         cost = VP8LevelCost(ss_prev[p].costs, level);
701         // Examine node assuming it's a non-terminal one.
702         score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
703         if (score < best_cur_score) {
704           best_cur_score = score;
705           best_prev = p;
706         }
707       }
708       best_cur_score += base_score;
709       // Store best finding in current node.
710       cur->sign = sign;
711       cur->level = level;
712       cur->prev = best_prev;
713       ss_cur[m].score = best_cur_score;
714 
715       // Now, record best terminal node (and thus best entry in the graph).
716       if (level != 0 && best_cur_score < best_score) {
717         const score_t last_pos_cost =
718             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
719         const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
720         score = best_cur_score + last_pos_score;
721         if (score < best_score) {
722           best_score = score;
723           best_path[0] = n;                     // best eob position
724           best_path[1] = m;                     // best node index
725           best_path[2] = best_prev;             // best predecessor
726         }
727       }
728     }
729   }
730 
731   // Fresh start
732   // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
733   if (coeff_type == TYPE_I16_AC) {
734     memset(in + 1, 0, 15 * sizeof(*in));
735     memset(out + 1, 0, 15 * sizeof(*out));
736   } else {
737     memset(in, 0, 16 * sizeof(*in));
738     memset(out, 0, 16 * sizeof(*out));
739   }
740   if (best_path[0] == -1) {
741     return 0;  // skip!
742   }
743 
744   {
745     // Unwind the best path.
746     // Note: best-prev on terminal node is not necessarily equal to the
747     // best_prev for non-terminal. So we patch best_path[2] in.
748     int nz = 0;
749     int best_node = best_path[1];
750     n = best_path[0];
751     NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
752 
753     for (; n >= first; --n) {
754       const Node* const node = &NODE(n, best_node);
755       const int j = kZigzag[n];
756       out[n] = node->sign ? -node->level : node->level;
757       nz |= node->level;
758       in[j] = out[n] * mtx->q_[j];
759       best_node = node->prev;
760     }
761     return (nz != 0);
762   }
763 }
764 
765 #undef NODE
766 
767 //------------------------------------------------------------------------------
768 // Performs: difference, transform, quantize, back-transform, add
769 // all at once. Output is the reconstructed block in *yuv_out, and the
770 // quantized levels in *levels.
771 
ReconstructIntra16(VP8EncIterator * WEBP_RESTRICT const it,VP8ModeScore * WEBP_RESTRICT const rd,uint8_t * WEBP_RESTRICT const yuv_out,int mode)772 static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
773                               VP8ModeScore* WEBP_RESTRICT const rd,
774                               uint8_t* WEBP_RESTRICT const yuv_out,
775                               int mode) {
776   const VP8Encoder* const enc = it->enc_;
777   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
778   const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
779   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
780   int nz = 0;
781   int n;
782   int16_t tmp[16][16], dc_tmp[16];
783 
784   for (n = 0; n < 16; n += 2) {
785     VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
786   }
787   VP8FTransformWHT(tmp[0], dc_tmp);
788   nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
789 
790   if (DO_TRELLIS_I16 && it->do_trellis_) {
791     int x, y;
792     VP8IteratorNzToBytes(it);
793     for (y = 0, n = 0; y < 4; ++y) {
794       for (x = 0; x < 4; ++x, ++n) {
795         const int ctx = it->top_nz_[x] + it->left_nz_[y];
796         const int non_zero = TrellisQuantizeBlock(
797             enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
798             dqm->lambda_trellis_i16_);
799         it->top_nz_[x] = it->left_nz_[y] = non_zero;
800         rd->y_ac_levels[n][0] = 0;
801         nz |= non_zero << n;
802       }
803     }
804   } else {
805     for (n = 0; n < 16; n += 2) {
806       // Zero-out the first coeff, so that: a) nz is correct below, and
807       // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
808       tmp[n][0] = tmp[n + 1][0] = 0;
809       nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
810       assert(rd->y_ac_levels[n + 0][0] == 0);
811       assert(rd->y_ac_levels[n + 1][0] == 0);
812     }
813   }
814 
815   // Transform back
816   VP8TransformWHT(dc_tmp, tmp[0]);
817   for (n = 0; n < 16; n += 2) {
818     VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
819   }
820 
821   return nz;
822 }
823 
ReconstructIntra4(VP8EncIterator * WEBP_RESTRICT const it,int16_t levels[16],const uint8_t * WEBP_RESTRICT const src,uint8_t * WEBP_RESTRICT const yuv_out,int mode)824 static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
825                              int16_t levels[16],
826                              const uint8_t* WEBP_RESTRICT const src,
827                              uint8_t* WEBP_RESTRICT const yuv_out,
828                              int mode) {
829   const VP8Encoder* const enc = it->enc_;
830   const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
831   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
832   int nz = 0;
833   int16_t tmp[16];
834 
835   VP8FTransform(src, ref, tmp);
836   if (DO_TRELLIS_I4 && it->do_trellis_) {
837     const int x = it->i4_ & 3, y = it->i4_ >> 2;
838     const int ctx = it->top_nz_[x] + it->left_nz_[y];
839     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
840                               dqm->lambda_trellis_i4_);
841   } else {
842     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
843   }
844   VP8ITransform(ref, tmp, yuv_out, 0);
845   return nz;
846 }
847 
848 //------------------------------------------------------------------------------
849 // DC-error diffusion
850 
851 // Diffusion weights. We under-correct a bit (15/16th of the error is actually
852 // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
853 #define C1 7    // fraction of error sent to the 4x4 block below
854 #define C2 8    // fraction of error sent to the 4x4 block on the right
855 #define DSHIFT 4
856 #define DSCALE 1   // storage descaling, needed to make the error fit int8_t
857 
858 // Quantize as usual, but also compute and return the quantization error.
859 // Error is already divided by DSHIFT.
QuantizeSingle(int16_t * WEBP_RESTRICT const v,const VP8Matrix * WEBP_RESTRICT const mtx)860 static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
861                           const VP8Matrix* WEBP_RESTRICT const mtx) {
862   int V = *v;
863   const int sign = (V < 0);
864   if (sign) V = -V;
865   if (V > (int)mtx->zthresh_[0]) {
866     const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
867     const int err = (V - qV);
868     *v = sign ? -qV : qV;
869     return (sign ? -err : err) >> DSCALE;
870   }
871   *v = 0;
872   return (sign ? -V : V) >> DSCALE;
873 }
874 
CorrectDCValues(const VP8EncIterator * WEBP_RESTRICT const it,const VP8Matrix * WEBP_RESTRICT const mtx,int16_t tmp[][16],VP8ModeScore * WEBP_RESTRICT const rd)875 static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
876                             const VP8Matrix* WEBP_RESTRICT const mtx,
877                             int16_t tmp[][16],
878                             VP8ModeScore* WEBP_RESTRICT const rd) {
879   //         | top[0] | top[1]
880   // --------+--------+---------
881   // left[0] | tmp[0]   tmp[1]  <->   err0 err1
882   // left[1] | tmp[2]   tmp[3]        err2 err3
883   //
884   // Final errors {err1,err2,err3} are preserved and later restored
885   // as top[]/left[] on the next block.
886   int ch;
887   for (ch = 0; ch <= 1; ++ch) {
888     const int8_t* const top = it->top_derr_[it->x_][ch];
889     const int8_t* const left = it->left_derr_[ch];
890     int16_t (* const c)[16] = &tmp[ch * 4];
891     int err0, err1, err2, err3;
892     c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
893     err0 = QuantizeSingle(&c[0][0], mtx);
894     c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
895     err1 = QuantizeSingle(&c[1][0], mtx);
896     c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
897     err2 = QuantizeSingle(&c[2][0], mtx);
898     c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
899     err3 = QuantizeSingle(&c[3][0], mtx);
900     // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
901     // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
902     assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
903     rd->derr[ch][0] = (int8_t)err1;
904     rd->derr[ch][1] = (int8_t)err2;
905     rd->derr[ch][2] = (int8_t)err3;
906   }
907 }
908 
StoreDiffusionErrors(VP8EncIterator * WEBP_RESTRICT const it,const VP8ModeScore * WEBP_RESTRICT const rd)909 static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
910                                  const VP8ModeScore* WEBP_RESTRICT const rd) {
911   int ch;
912   for (ch = 0; ch <= 1; ++ch) {
913     int8_t* const top = it->top_derr_[it->x_][ch];
914     int8_t* const left = it->left_derr_[ch];
915     left[0] = rd->derr[ch][0];            // restore err1
916     left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
917     top[0]  = rd->derr[ch][1];            //     ... err2
918     top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
919   }
920 }
921 
922 #undef C1
923 #undef C2
924 #undef DSHIFT
925 #undef DSCALE
926 
927 //------------------------------------------------------------------------------
928 
ReconstructUV(VP8EncIterator * WEBP_RESTRICT const it,VP8ModeScore * WEBP_RESTRICT const rd,uint8_t * WEBP_RESTRICT const yuv_out,int mode)929 static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
930                          VP8ModeScore* WEBP_RESTRICT const rd,
931                          uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
932   const VP8Encoder* const enc = it->enc_;
933   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
934   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
935   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
936   int nz = 0;
937   int n;
938   int16_t tmp[8][16];
939 
940   for (n = 0; n < 8; n += 2) {
941     VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
942   }
943   if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
944 
945   if (DO_TRELLIS_UV && it->do_trellis_) {
946     int ch, x, y;
947     for (ch = 0, n = 0; ch <= 2; ch += 2) {
948       for (y = 0; y < 2; ++y) {
949         for (x = 0; x < 2; ++x, ++n) {
950           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
951           const int non_zero = TrellisQuantizeBlock(
952               enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
953               dqm->lambda_trellis_uv_);
954           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
955           nz |= non_zero << n;
956         }
957       }
958     }
959   } else {
960     for (n = 0; n < 8; n += 2) {
961       nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
962     }
963   }
964 
965   for (n = 0; n < 8; n += 2) {
966     VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
967   }
968   return (nz << 16);
969 }
970 
971 //------------------------------------------------------------------------------
972 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
973 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
974 
StoreMaxDelta(VP8SegmentInfo * const dqm,const int16_t DCs[16])975 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
976   // We look at the first three AC coefficients to determine what is the average
977   // delta between each sub-4x4 block.
978   const int v0 = abs(DCs[1]);
979   const int v1 = abs(DCs[2]);
980   const int v2 = abs(DCs[4]);
981   int max_v = (v1 > v0) ? v1 : v0;
982   max_v = (v2 > max_v) ? v2 : max_v;
983   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
984 }
985 
SwapModeScore(VP8ModeScore ** a,VP8ModeScore ** b)986 static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
987   VP8ModeScore* const tmp = *a;
988   *a = *b;
989   *b = tmp;
990 }
991 
SwapPtr(uint8_t ** a,uint8_t ** b)992 static void SwapPtr(uint8_t** a, uint8_t** b) {
993   uint8_t* const tmp = *a;
994   *a = *b;
995   *b = tmp;
996 }
997 
SwapOut(VP8EncIterator * const it)998 static void SwapOut(VP8EncIterator* const it) {
999   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
1000 }
1001 
PickBestIntra16(VP8EncIterator * WEBP_RESTRICT const it,VP8ModeScore * WEBP_RESTRICT rd)1002 static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
1003                             VP8ModeScore* WEBP_RESTRICT rd) {
1004   const int kNumBlocks = 16;
1005   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1006   const int lambda = dqm->lambda_i16_;
1007   const int tlambda = dqm->tlambda_;
1008   const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1009   VP8ModeScore rd_tmp;
1010   VP8ModeScore* rd_cur = &rd_tmp;
1011   VP8ModeScore* rd_best = rd;
1012   int mode;
1013   int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);
1014 
1015   rd->mode_i16 = -1;
1016   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1017     uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
1018     rd_cur->mode_i16 = mode;
1019 
1020     // Reconstruct
1021     rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
1022 
1023     // Measure RD-score
1024     rd_cur->D = VP8SSE16x16(src, tmp_dst);
1025     rd_cur->SD =
1026         tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
1027     rd_cur->H = VP8FixedCostsI16[mode];
1028     rd_cur->R = VP8GetCostLuma16(it, rd_cur);
1029     if (is_flat) {
1030       // refine the first impression (which was in pixel space)
1031       is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
1032       if (is_flat) {
1033         // Block is very flat. We put emphasis on the distortion being very low!
1034         rd_cur->D *= 2;
1035         rd_cur->SD *= 2;
1036       }
1037     }
1038 
1039     // Since we always examine Intra16 first, we can overwrite *rd directly.
1040     SetRDScore(lambda, rd_cur);
1041     if (mode == 0 || rd_cur->score < rd_best->score) {
1042       SwapModeScore(&rd_cur, &rd_best);
1043       SwapOut(it);
1044     }
1045   }
1046   if (rd_best != rd) {
1047     memcpy(rd, rd_best, sizeof(*rd));
1048   }
1049   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
1050   VP8SetIntra16Mode(it, rd->mode_i16);
1051 
1052   // we have a blocky macroblock (only DCs are non-zero) with fairly high
1053   // distortion, record max delta so we can later adjust the minimal filtering
1054   // strength needed to smooth these blocks out.
1055   if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
1056     StoreMaxDelta(dqm, rd->y_dc_levels);
1057   }
1058 }
1059 
1060 //------------------------------------------------------------------------------
1061 
1062 // return the cost array corresponding to the surrounding prediction modes.
GetCostModeI4(VP8EncIterator * WEBP_RESTRICT const it,const uint8_t modes[16])1063 static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
1064                                      const uint8_t modes[16]) {
1065   const int preds_w = it->enc_->preds_w_;
1066   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
1067   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
1068   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
1069   return VP8FixedCostsI4[top][left];
1070 }
1071 
PickBestIntra4(VP8EncIterator * WEBP_RESTRICT const it,VP8ModeScore * WEBP_RESTRICT const rd)1072 static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
1073                           VP8ModeScore* WEBP_RESTRICT const rd) {
1074   const VP8Encoder* const enc = it->enc_;
1075   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
1076   const int lambda = dqm->lambda_i4_;
1077   const int tlambda = dqm->tlambda_;
1078   const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
1079   uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
1080   int total_header_bits = 0;
1081   VP8ModeScore rd_best;
1082 
1083   if (enc->max_i4_header_bits_ == 0) {
1084     return 0;
1085   }
1086 
1087   InitScore(&rd_best);
1088   rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
1089   SetRDScore(dqm->lambda_mode_, &rd_best);
1090   VP8IteratorStartI4(it);
1091   do {
1092     const int kNumBlocks = 1;
1093     VP8ModeScore rd_i4;
1094     int mode;
1095     int best_mode = -1;
1096     const uint8_t* const src = src0 + VP8Scan[it->i4_];
1097     const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1098     uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
1099     uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
1100 
1101     InitScore(&rd_i4);
1102     VP8MakeIntra4Preds(it);
1103     for (mode = 0; mode < NUM_BMODES; ++mode) {
1104       VP8ModeScore rd_tmp;
1105       int16_t tmp_levels[16];
1106 
1107       // Reconstruct
1108       rd_tmp.nz =
1109           ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
1110 
1111       // Compute RD-score
1112       rd_tmp.D = VP8SSE4x4(src, tmp_dst);
1113       rd_tmp.SD =
1114           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
1115                   : 0;
1116       rd_tmp.H = mode_costs[mode];
1117 
1118       // Add flatness penalty, to avoid flat area to be mispredicted
1119       // by a complex mode.
1120       if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
1121         rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
1122       } else {
1123         rd_tmp.R = 0;
1124       }
1125 
1126       // early-out check
1127       SetRDScore(lambda, &rd_tmp);
1128       if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
1129 
1130       // finish computing score
1131       rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
1132       SetRDScore(lambda, &rd_tmp);
1133 
1134       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
1135         CopyScore(&rd_i4, &rd_tmp);
1136         best_mode = mode;
1137         SwapPtr(&tmp_dst, &best_block);
1138         memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
1139                sizeof(rd_best.y_ac_levels[it->i4_]));
1140       }
1141     }
1142     SetRDScore(dqm->lambda_mode_, &rd_i4);
1143     AddScore(&rd_best, &rd_i4);
1144     if (rd_best.score >= rd->score) {
1145       return 0;
1146     }
1147     total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
1148     if (total_header_bits > enc->max_i4_header_bits_) {
1149       return 0;
1150     }
1151     // Copy selected samples if not in the right place already.
1152     if (best_block != best_blocks + VP8Scan[it->i4_]) {
1153       VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1154     }
1155     rd->modes_i4[it->i4_] = best_mode;
1156     it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
1157   } while (VP8IteratorRotateI4(it, best_blocks));
1158 
1159   // finalize state
1160   CopyScore(rd, &rd_best);
1161   VP8SetIntra4Mode(it, rd->modes_i4);
1162   SwapOut(it);
1163   memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1164   return 1;   // select intra4x4 over intra16x16
1165 }
1166 
1167 //------------------------------------------------------------------------------
1168 
PickBestUV(VP8EncIterator * WEBP_RESTRICT const it,VP8ModeScore * WEBP_RESTRICT const rd)1169 static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
1170                        VP8ModeScore* WEBP_RESTRICT const rd) {
1171   const int kNumBlocks = 8;
1172   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1173   const int lambda = dqm->lambda_uv_;
1174   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1175   uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
1176   uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
1177   uint8_t* dst = dst0;
1178   VP8ModeScore rd_best;
1179   int mode;
1180 
1181   rd->mode_uv = -1;
1182   InitScore(&rd_best);
1183   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1184     VP8ModeScore rd_uv;
1185 
1186     // Reconstruct
1187     rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1188 
1189     // Compute RD-score
1190     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
1191     rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
1192     rd_uv.H  = VP8FixedCostsUV[mode];
1193     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
1194     if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1195       rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1196     }
1197 
1198     SetRDScore(lambda, &rd_uv);
1199     if (mode == 0 || rd_uv.score < rd_best.score) {
1200       CopyScore(&rd_best, &rd_uv);
1201       rd->mode_uv = mode;
1202       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1203       if (it->top_derr_ != NULL) {
1204         memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
1205       }
1206       SwapPtr(&dst, &tmp_dst);
1207     }
1208   }
1209   VP8SetIntraUVMode(it, rd->mode_uv);
1210   AddScore(rd, &rd_best);
1211   if (dst != dst0) {   // copy 16x8 block if needed
1212     VP8Copy16x8(dst, dst0);
1213   }
1214   if (it->top_derr_ != NULL) {  // store diffusion errors for next block
1215     StoreDiffusionErrors(it, rd);
1216   }
1217 }
1218 
1219 //------------------------------------------------------------------------------
1220 // Final reconstruction and quantization.
1221 
SimpleQuantize(VP8EncIterator * WEBP_RESTRICT const it,VP8ModeScore * WEBP_RESTRICT const rd)1222 static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
1223                            VP8ModeScore* WEBP_RESTRICT const rd) {
1224   const VP8Encoder* const enc = it->enc_;
1225   const int is_i16 = (it->mb_->type_ == 1);
1226   int nz = 0;
1227 
1228   if (is_i16) {
1229     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1230   } else {
1231     VP8IteratorStartI4(it);
1232     do {
1233       const int mode =
1234           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1235       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1236       uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
1237       VP8MakeIntra4Preds(it);
1238       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1239                               src, dst, mode) << it->i4_;
1240     } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
1241   }
1242 
1243   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1244   rd->nz = nz;
1245 }
1246 
1247 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
RefineUsingDistortion(VP8EncIterator * WEBP_RESTRICT const it,int try_both_modes,int refine_uv_mode,VP8ModeScore * WEBP_RESTRICT const rd)1248 static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
1249                                   int try_both_modes, int refine_uv_mode,
1250                                   VP8ModeScore* WEBP_RESTRICT const rd) {
1251   score_t best_score = MAX_COST;
1252   int nz = 0;
1253   int mode;
1254   int is_i16 = try_both_modes || (it->mb_->type_ == 1);
1255 
1256   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1257   // Some empiric constants, of approximate order of magnitude.
1258   const int lambda_d_i16 = 106;
1259   const int lambda_d_i4 = 11;
1260   const int lambda_d_uv = 120;
1261   score_t score_i4 = dqm->i4_penalty_;
1262   score_t i4_bit_sum = 0;
1263   const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
1264                                            : MAX_COST;  // no early-out allowed
1265 
1266   if (is_i16) {   // First, evaluate Intra16 distortion
1267     int best_mode = -1;
1268     const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1269     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1270       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1271       const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
1272                           + VP8FixedCostsI16[mode] * lambda_d_i16;
1273       if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
1274         continue;
1275       }
1276 
1277       if (score < best_score) {
1278         best_mode = mode;
1279         best_score = score;
1280       }
1281     }
1282     if (it->x_ == 0 || it->y_ == 0) {
1283       // avoid starting a checkerboard resonance from the border. See bug #432.
1284       if (IsFlatSource16(src)) {
1285         best_mode = (it->x_ == 0) ? 0 : 2;
1286         try_both_modes = 0;  // stick to i16
1287       }
1288     }
1289     VP8SetIntra16Mode(it, best_mode);
1290     // we'll reconstruct later, if i16 mode actually gets selected
1291   }
1292 
1293   // Next, evaluate Intra4
1294   if (try_both_modes || !is_i16) {
1295     // We don't evaluate the rate here, but just account for it through a
1296     // constant penalty (i4 mode usually needs more bits compared to i16).
1297     is_i16 = 0;
1298     VP8IteratorStartI4(it);
1299     do {
1300       int best_i4_mode = -1;
1301       score_t best_i4_score = MAX_COST;
1302       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1303       const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1304 
1305       VP8MakeIntra4Preds(it);
1306       for (mode = 0; mode < NUM_BMODES; ++mode) {
1307         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1308         const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
1309                             + mode_costs[mode] * lambda_d_i4;
1310         if (score < best_i4_score) {
1311           best_i4_mode = mode;
1312           best_i4_score = score;
1313         }
1314       }
1315       i4_bit_sum += mode_costs[best_i4_mode];
1316       rd->modes_i4[it->i4_] = best_i4_mode;
1317       score_i4 += best_i4_score;
1318       if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
1319         // Intra4 won't be better than Intra16. Bail out and pick Intra16.
1320         is_i16 = 1;
1321         break;
1322       } else {  // reconstruct partial block inside yuv_out2_ buffer
1323         uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
1324         nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1325                                 src, tmp_dst, best_i4_mode) << it->i4_;
1326       }
1327     } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
1328   }
1329 
1330   // Final reconstruction, depending on which mode is selected.
1331   if (!is_i16) {
1332     VP8SetIntra4Mode(it, rd->modes_i4);
1333     SwapOut(it);
1334     best_score = score_i4;
1335   } else {
1336     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1337   }
1338 
1339   // ... and UV!
1340   if (refine_uv_mode) {
1341     int best_mode = -1;
1342     score_t best_uv_score = MAX_COST;
1343     const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1344     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1345       const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
1346       const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
1347                           + VP8FixedCostsUV[mode] * lambda_d_uv;
1348       if (score < best_uv_score) {
1349         best_mode = mode;
1350         best_uv_score = score;
1351       }
1352     }
1353     VP8SetIntraUVMode(it, best_mode);
1354   }
1355   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1356 
1357   rd->nz = nz;
1358   rd->score = best_score;
1359 }
1360 
1361 //------------------------------------------------------------------------------
1362 // Entry point
1363 
VP8Decimate(VP8EncIterator * WEBP_RESTRICT const it,VP8ModeScore * WEBP_RESTRICT const rd,VP8RDLevel rd_opt)1364 int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
1365                 VP8ModeScore* WEBP_RESTRICT const rd,
1366                 VP8RDLevel rd_opt) {
1367   int is_skipped;
1368   const int method = it->enc_->method_;
1369 
1370   InitScore(rd);
1371 
1372   // We can perform predictions for Luma16x16 and Chroma8x8 already.
1373   // Luma4x4 predictions needs to be done as-we-go.
1374   VP8MakeLuma16Preds(it);
1375   VP8MakeChroma8Preds(it);
1376 
1377   if (rd_opt > RD_OPT_NONE) {
1378     it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1379     PickBestIntra16(it, rd);
1380     if (method >= 2) {
1381       PickBestIntra4(it, rd);
1382     }
1383     PickBestUV(it, rd);
1384     if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1385       it->do_trellis_ = 1;
1386       SimpleQuantize(it, rd);
1387     }
1388   } else {
1389     // At this point we have heuristically decided intra16 / intra4.
1390     // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
1391     // For method <= 1, we don't re-examine the decision but just go ahead with
1392     // quantization/reconstruction.
1393     RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
1394   }
1395   is_skipped = (rd->nz == 0);
1396   VP8SetSkip(it, is_skipped);
1397   return is_skipped;
1398 }
1399