1*b2055c35SXin Li // Copyright 2022 Google Inc. All Rights Reserved.
2*b2055c35SXin Li //
3*b2055c35SXin Li // Use of this source code is governed by a BSD-style license
4*b2055c35SXin Li // that can be found in the COPYING file in the root of the source
5*b2055c35SXin Li // tree. An additional intellectual property rights grant can be found
6*b2055c35SXin Li // in the file PATENTS. All contributing project authors may
7*b2055c35SXin Li // be found in the AUTHORS file in the root of the source tree.
8*b2055c35SXin Li // -----------------------------------------------------------------------------
9*b2055c35SXin Li //
10*b2055c35SXin Li // Sharp RGB to YUV conversion.
11*b2055c35SXin Li //
12*b2055c35SXin Li // Author: Skal ([email protected])
13*b2055c35SXin Li
14*b2055c35SXin Li #include "sharpyuv/sharpyuv.h"
15*b2055c35SXin Li
16*b2055c35SXin Li #include <assert.h>
17*b2055c35SXin Li #include <limits.h>
18*b2055c35SXin Li #include <stddef.h>
19*b2055c35SXin Li #include <stdlib.h>
20*b2055c35SXin Li #include <string.h>
21*b2055c35SXin Li
22*b2055c35SXin Li #include "src/webp/types.h"
23*b2055c35SXin Li #include "sharpyuv/sharpyuv_cpu.h"
24*b2055c35SXin Li #include "sharpyuv/sharpyuv_dsp.h"
25*b2055c35SXin Li #include "sharpyuv/sharpyuv_gamma.h"
26*b2055c35SXin Li
27*b2055c35SXin Li //------------------------------------------------------------------------------
28*b2055c35SXin Li
SharpYuvGetVersion(void)29*b2055c35SXin Li int SharpYuvGetVersion(void) {
30*b2055c35SXin Li return SHARPYUV_VERSION;
31*b2055c35SXin Li }
32*b2055c35SXin Li
33*b2055c35SXin Li //------------------------------------------------------------------------------
34*b2055c35SXin Li // Sharp RGB->YUV conversion
35*b2055c35SXin Li
// Upper bound on the number of refinement passes run by the converter.
static const int kNumIterations = 4;

// Number of fractional bits used by the fixed-point RGB->YUV arithmetic.
#define YUV_FIX 16
// Rounding term: one half expressed with YUV_FIX fractional bits.
static const int kYuvHalf = 1 << (YUV_FIX - 1);

// Max bit depth so that intermediate calculations fit in 16 bits.
static const int kMaxBitDepth = 14;
43*b2055c35SXin Li
44*b2055c35SXin Li // Returns the precision shift to use based on the input rgb_bit_depth.
GetPrecisionShift(int rgb_bit_depth)45*b2055c35SXin Li static int GetPrecisionShift(int rgb_bit_depth) {
46*b2055c35SXin Li // Try to add 2 bits of precision if it fits in kMaxBitDepth. Otherwise remove
47*b2055c35SXin Li // bits if needed.
48*b2055c35SXin Li return ((rgb_bit_depth + 2) <= kMaxBitDepth) ? 2
49*b2055c35SXin Li : (kMaxBitDepth - rgb_bit_depth);
50*b2055c35SXin Li }
51*b2055c35SXin Li
// Signed 16-bit type with extra precision, used for the UV (R-W, G-W, B-W)
// samples.
typedef int16_t fixed_t;
// Unsigned 16-bit type with extra precision, used for the W (luma-like) and
// RGB samples.
typedef uint16_t fixed_y_t;
54*b2055c35SXin Li
55*b2055c35SXin Li //------------------------------------------------------------------------------
56*b2055c35SXin Li
// Clamps 'v' to the [0, 255] range and returns it as an 8-bit value.
static uint8_t clip_8b(fixed_t v) {
  if (v < 0) {
    return 0u;
  }
  if (v > 255) {
    return 255u;
  }
  return (uint8_t)v;
}
60*b2055c35SXin Li
// Clamps 'v' to the [0, max] range and returns it as a 16-bit value.
static uint16_t clip(fixed_t v, int max) {
  if (v < 0) {
    return 0;
  }
  if (v > max) {
    return max;
  }
  return (uint16_t)v;
}
64*b2055c35SXin Li
clip_bit_depth(int y,int bit_depth)65*b2055c35SXin Li static fixed_y_t clip_bit_depth(int y, int bit_depth) {
66*b2055c35SXin Li const int max = (1 << bit_depth) - 1;
67*b2055c35SXin Li return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 0 : max;
68*b2055c35SXin Li }
69*b2055c35SXin Li
70*b2055c35SXin Li //------------------------------------------------------------------------------
71*b2055c35SXin Li
RGBToGray(int64_t r,int64_t g,int64_t b)72*b2055c35SXin Li static int RGBToGray(int64_t r, int64_t g, int64_t b) {
73*b2055c35SXin Li const int64_t luma = 13933 * r + 46871 * g + 4732 * b + kYuvHalf;
74*b2055c35SXin Li return (int)(luma >> YUV_FIX);
75*b2055c35SXin Li }
76*b2055c35SXin Li
// Averages the four samples a, b, c and d in linear space: each one is passed
// through SharpYuvGammaToLinear(), the rounded mean of the results is taken,
// and that mean is converted back with SharpYuvLinearToGamma().
static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
                          int rgb_bit_depth,
                          SharpYuvTransferFunctionType transfer_type) {
  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
  uint32_t linear_sum = SharpYuvGammaToLinear(a, bit_depth, transfer_type);
  linear_sum += SharpYuvGammaToLinear(b, bit_depth, transfer_type);
  linear_sum += SharpYuvGammaToLinear(c, bit_depth, transfer_type);
  linear_sum += SharpYuvGammaToLinear(d, bit_depth, transfer_type);
  linear_sum += 2;  // rounding term for the division by 4 below
  return SharpYuvLinearToGamma(linear_sum >> 2, bit_depth, transfer_type);
}
88*b2055c35SXin Li
UpdateW(const fixed_y_t * src,fixed_y_t * dst,int w,int rgb_bit_depth,SharpYuvTransferFunctionType transfer_type)89*b2055c35SXin Li static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
90*b2055c35SXin Li int rgb_bit_depth,
91*b2055c35SXin Li SharpYuvTransferFunctionType transfer_type) {
92*b2055c35SXin Li const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
93*b2055c35SXin Li int i = 0;
94*b2055c35SXin Li do {
95*b2055c35SXin Li const uint32_t R =
96*b2055c35SXin Li SharpYuvGammaToLinear(src[0 * w + i], bit_depth, transfer_type);
97*b2055c35SXin Li const uint32_t G =
98*b2055c35SXin Li SharpYuvGammaToLinear(src[1 * w + i], bit_depth, transfer_type);
99*b2055c35SXin Li const uint32_t B =
100*b2055c35SXin Li SharpYuvGammaToLinear(src[2 * w + i], bit_depth, transfer_type);
101*b2055c35SXin Li const uint32_t Y = RGBToGray(R, G, B);
102*b2055c35SXin Li dst[i] = (fixed_y_t)SharpYuvLinearToGamma(Y, bit_depth, transfer_type);
103*b2055c35SXin Li } while (++i < w);
104*b2055c35SXin Li }
105*b2055c35SXin Li
// Computes one row of 'uv_w' chroma-like triplets from two rows of planar RGB.
// 'src1' and 'src2' each hold three planes of 2 * uv_w samples (R, G, B).
// For every 2x2 block, the four samples of each channel are averaged with
// ScaleDown(), W is derived from the averaged r/g/b with RGBToGray(), and
// r - W, g - W and b - W are stored in the three 'uv_w'-wide planes of 'dst'.
// At least one block is always processed.
static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
                         fixed_t* dst, int uv_w, int rgb_bit_depth,
                         SharpYuvTransferFunctionType transfer_type) {
  // Offsets of the G and B planes inside a source row (each plane is
  // 2 * uv_w samples wide).
  const int g_off = 2 * uv_w;
  const int b_off = 4 * uv_w;
  int x = 0;
  do {
    const int left = 2 * x;
    const int right = left + 1;
    const int r = ScaleDown(src1[left], src1[right], src2[left], src2[right],
                            rgb_bit_depth, transfer_type);
    const int g = ScaleDown(src1[g_off + left], src1[g_off + right],
                            src2[g_off + left], src2[g_off + right],
                            rgb_bit_depth, transfer_type);
    const int b = ScaleDown(src1[b_off + left], src1[b_off + right],
                            src2[b_off + left], src2[b_off + right],
                            rgb_bit_depth, transfer_type);
    const int W = RGBToGray(r, g, b);
    dst[0 * uv_w + x] = (fixed_t)(r - W);
    dst[1 * uv_w + x] = (fixed_t)(g - W);
    dst[2 * uv_w + x] = (fixed_t)(b - W);
    ++x;
  } while (x < uv_w);
}
129*b2055c35SXin Li
// Writes RGBToGray() of each of the 'w' pixels of 'rgb' into 'y'.
// 'rgb' holds three consecutive planes of 'w' samples (R, then G, then B).
// 'w' must be strictly positive.
static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
  const fixed_y_t* const r_plane = rgb + 0 * w;
  const fixed_y_t* const g_plane = rgb + 1 * w;
  const fixed_y_t* const b_plane = rgb + 2 * w;
  int x = 0;
  assert(w > 0);
  do {
    y[x] = RGBToGray(r_plane[x], g_plane[x], b_plane[x]);
    ++x;
  } while (x < w);
}
137*b2055c35SXin Li
138*b2055c35SXin Li //------------------------------------------------------------------------------
139*b2055c35SXin Li
Filter2(int A,int B,int W0,int bit_depth)140*b2055c35SXin Li static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) {
141*b2055c35SXin Li const int v0 = (A * 3 + B + 2) >> 2;
142*b2055c35SXin Li return clip_bit_depth(v0 + W0, bit_depth);
143*b2055c35SXin Li }
144*b2055c35SXin Li
145*b2055c35SXin Li //------------------------------------------------------------------------------
146*b2055c35SXin Li
Shift(int v,int shift)147*b2055c35SXin Li static WEBP_INLINE int Shift(int v, int shift) {
148*b2055c35SXin Li return (shift >= 0) ? (v << shift) : (v >> -shift);
149*b2055c35SXin Li }
150*b2055c35SXin Li
// Imports one row of 'pic_width' pixels into the planar working buffer 'dst'.
//   r_ptr, g_ptr, b_ptr: start of the row for each channel. They are read as
//                        uint8_t samples when rgb_bit_depth is 8 and as
//                        uint16_t samples otherwise.
//   rgb_step:            distance in bytes between two consecutive pixels.
//   rgb_bit_depth:       bit depth of the input samples.
//   pic_width:           number of pixels to import (at least one pixel is
//                        always read).
//   dst:                 receives three planes (R, G, B), each as wide as
//                        pic_width rounded up to an even value. Samples are
//                        rescaled by GetPrecisionShift(rgb_bit_depth). When
//                        pic_width is odd, the rightmost pixel is replicated
//                        to fill the padding column.
static void ImportOneRow(const uint8_t* const r_ptr,
                         const uint8_t* const g_ptr,
                         const uint8_t* const b_ptr,
                         int rgb_step,
                         int rgb_bit_depth,
                         int pic_width,
                         fixed_y_t* const dst) {
  // Convert the rgb_step from a number of bytes to a number of uint8_t or
  // uint16_t values depending on the bit depth.
  const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
  const int w = (pic_width + 1) & ~1;
  // The shift only depends on the bit depth, so compute it once per row.
  const int shift = GetPrecisionShift(rgb_bit_depth);
  int i = 0;
  do {
    // Use a pointer-sized offset: 'i * step' evaluated as int can overflow
    // for very wide rows with a multi-sample step.
    const ptrdiff_t off = (ptrdiff_t)i * step;
    if (rgb_bit_depth == 8) {
      dst[i + 0 * w] = Shift(r_ptr[off], shift);
      dst[i + 1 * w] = Shift(g_ptr[off], shift);
      dst[i + 2 * w] = Shift(b_ptr[off], shift);
    } else {
      dst[i + 0 * w] = Shift(((const uint16_t*)r_ptr)[off], shift);
      dst[i + 1 * w] = Shift(((const uint16_t*)g_ptr)[off], shift);
      dst[i + 2 * w] = Shift(((const uint16_t*)b_ptr)[off], shift);
    }
  } while (++i < pic_width);
  if (pic_width & 1) {  // replicate rightmost pixel
    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
  }
}
182*b2055c35SXin Li
// Reconstructs two full-resolution rows of planar RGB into 'out1' and 'out2'.
//   best_y:   two consecutive rows of 'w' W samples (row 0, then row 1).
//   prev_uv, cur_uv, next_uv: previous, current and next rows of
//             half-resolution samples, each made of three planes of w / 2
//             values.
//   w:        width of the output rows.
//   out1/2:   each receives three planes of 'w' samples.
// 'out1' blends the current row with the previous one and adds row 0 of
// best_y; 'out2' blends the current row with the next one and adds row 1.
// The same best_y rows are used for all three planes.
static void InterpolateTwoRows(const fixed_y_t* const best_y,
                               const fixed_t* prev_uv,
                               const fixed_t* cur_uv,
                               const fixed_t* next_uv,
                               int w,
                               fixed_y_t* out1,
                               fixed_y_t* out2,
                               int rgb_bit_depth) {
  const int uv_w = w >> 1;
  const int len = (w - 1) >> 1;  // length to filter
  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
  const int width_is_even = !(w & 1);
  const int last = w - 1;
  const fixed_y_t* const y_row0 = best_y;
  const fixed_y_t* const y_row1 = best_y + w;
  int plane;
  for (plane = 0; plane < 3; ++plane) {  // R, G and B segments in turn
    // The leftmost sample has no left neighbour and is handled on its own.
    out1[0] = Filter2(cur_uv[0], prev_uv[0], y_row0[0], bit_depth);
    out2[0] = Filter2(cur_uv[0], next_uv[0], y_row1[0], bit_depth);

    // Interior samples.
    SharpYuvFilterRow(cur_uv, prev_uv, len, y_row0 + 1, out1 + 1, bit_depth);
    SharpYuvFilterRow(cur_uv, next_uv, len, y_row1 + 1, out2 + 1, bit_depth);

    // The rightmost sample needs the same special handling when w is even.
    if (width_is_even) {
      out1[last] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1], y_row0[last],
                           bit_depth);
      out2[last] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1], y_row1[last],
                           bit_depth);
    }

    // Move on to the next plane.
    out1 += w;
    out2 += w;
    prev_uv += uv_w;
    cur_uv += uv_w;
    next_uv += uv_w;
  }
}
219*b2055c35SXin Li
RGBToYUVComponent(int r,int g,int b,const int coeffs[4],int sfix)220*b2055c35SXin Li static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b,
221*b2055c35SXin Li const int coeffs[4], int sfix) {
222*b2055c35SXin Li const int srounder = 1 << (YUV_FIX + sfix - 1);
223*b2055c35SXin Li const int luma = coeffs[0] * r + coeffs[1] * g + coeffs[2] * b +
224*b2055c35SXin Li coeffs[3] + srounder;
225*b2055c35SXin Li return (luma >> (YUV_FIX + sfix));
226*b2055c35SXin Li }
227*b2055c35SXin Li
ConvertWRGBToYUV(const fixed_y_t * best_y,const fixed_t * best_uv,uint8_t * y_ptr,int y_stride,uint8_t * u_ptr,int u_stride,uint8_t * v_ptr,int v_stride,int rgb_bit_depth,int yuv_bit_depth,int width,int height,const SharpYuvConversionMatrix * yuv_matrix)228*b2055c35SXin Li static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
229*b2055c35SXin Li uint8_t* y_ptr, int y_stride, uint8_t* u_ptr,
230*b2055c35SXin Li int u_stride, uint8_t* v_ptr, int v_stride,
231*b2055c35SXin Li int rgb_bit_depth,
232*b2055c35SXin Li int yuv_bit_depth, int width, int height,
233*b2055c35SXin Li const SharpYuvConversionMatrix* yuv_matrix) {
234*b2055c35SXin Li int i, j;
235*b2055c35SXin Li const fixed_t* const best_uv_base = best_uv;
236*b2055c35SXin Li const int w = (width + 1) & ~1;
237*b2055c35SXin Li const int h = (height + 1) & ~1;
238*b2055c35SXin Li const int uv_w = w >> 1;
239*b2055c35SXin Li const int uv_h = h >> 1;
240*b2055c35SXin Li const int sfix = GetPrecisionShift(rgb_bit_depth);
241*b2055c35SXin Li const int yuv_max = (1 << yuv_bit_depth) - 1;
242*b2055c35SXin Li
243*b2055c35SXin Li best_uv = best_uv_base;
244*b2055c35SXin Li j = 0;
245*b2055c35SXin Li do {
246*b2055c35SXin Li i = 0;
247*b2055c35SXin Li do {
248*b2055c35SXin Li const int off = (i >> 1);
249*b2055c35SXin Li const int W = best_y[i];
250*b2055c35SXin Li const int r = best_uv[off + 0 * uv_w] + W;
251*b2055c35SXin Li const int g = best_uv[off + 1 * uv_w] + W;
252*b2055c35SXin Li const int b = best_uv[off + 2 * uv_w] + W;
253*b2055c35SXin Li const int y = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix);
254*b2055c35SXin Li if (yuv_bit_depth <= 8) {
255*b2055c35SXin Li y_ptr[i] = clip_8b(y);
256*b2055c35SXin Li } else {
257*b2055c35SXin Li ((uint16_t*)y_ptr)[i] = clip(y, yuv_max);
258*b2055c35SXin Li }
259*b2055c35SXin Li } while (++i < width);
260*b2055c35SXin Li best_y += w;
261*b2055c35SXin Li best_uv += (j & 1) * 3 * uv_w;
262*b2055c35SXin Li y_ptr += y_stride;
263*b2055c35SXin Li } while (++j < height);
264*b2055c35SXin Li
265*b2055c35SXin Li best_uv = best_uv_base;
266*b2055c35SXin Li j = 0;
267*b2055c35SXin Li do {
268*b2055c35SXin Li i = 0;
269*b2055c35SXin Li do {
270*b2055c35SXin Li // Note r, g and b values here are off by W, but a constant offset on all
271*b2055c35SXin Li // 3 components doesn't change the value of u and v with a YCbCr matrix.
272*b2055c35SXin Li const int r = best_uv[i + 0 * uv_w];
273*b2055c35SXin Li const int g = best_uv[i + 1 * uv_w];
274*b2055c35SXin Li const int b = best_uv[i + 2 * uv_w];
275*b2055c35SXin Li const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
276*b2055c35SXin Li const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
277*b2055c35SXin Li if (yuv_bit_depth <= 8) {
278*b2055c35SXin Li u_ptr[i] = clip_8b(u);
279*b2055c35SXin Li v_ptr[i] = clip_8b(v);
280*b2055c35SXin Li } else {
281*b2055c35SXin Li ((uint16_t*)u_ptr)[i] = clip(u, yuv_max);
282*b2055c35SXin Li ((uint16_t*)v_ptr)[i] = clip(v, yuv_max);
283*b2055c35SXin Li }
284*b2055c35SXin Li } while (++i < uv_w);
285*b2055c35SXin Li best_uv += 3 * uv_w;
286*b2055c35SXin Li u_ptr += u_stride;
287*b2055c35SXin Li v_ptr += v_stride;
288*b2055c35SXin Li } while (++j < uv_h);
289*b2055c35SXin Li return 1;
290*b2055c35SXin Li }
291*b2055c35SXin Li
292*b2055c35SXin Li //------------------------------------------------------------------------------
293*b2055c35SXin Li // Main function
294*b2055c35SXin Li
// Allocates nmemb * size bytes with malloc().
// Returns NULL when the product does not fit in 64 bits, when it does not fit
// in a size_t, or when malloc() itself fails. The caller owns the returned
// memory and must release it with free().
static void* SafeMalloc(uint64_t nmemb, size_t size) {
  const uint64_t max_u64 = ~(uint64_t)0;
  uint64_t total_size;
  // Reject products that would wrap around in 64-bit arithmetic. Without this
  // check a wrapped product could pass the size_t test below and produce an
  // allocation that is far too small.
  if (size != 0 && nmemb > max_u64 / size) return NULL;
  total_size = nmemb * (uint64_t)size;
  if (total_size != (size_t)total_size) return NULL;
  return malloc((size_t)total_size);
}

// Allocates an array of W * H elements of type T. The element count is
// computed in 64 bits.
#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((uint64_t)(W) * (H), sizeof(T)))
302*b2055c35SXin Li
// Core of the conversion: turns 'width' x 'height' RGB samples into Y, U and V
// planes.
// The work is done in three stages:
//   1. The input is imported two rows at a time into a W / (R-W, G-W, B-W)
//      representation. This fills both the "target" buffers and the initial
//      "best" estimate.
//   2. Up to kNumIterations refinement passes are run. Each pass reconstructs
//      full-resolution rows from the current estimate, recomputes W and
//      chroma from them, and updates the estimate via SharpYuvUpdateY() and
//      SharpYuvUpdateRGB().
//   3. The final estimate is converted with ConvertWRGBToYUV().
// Returns 0 if a working buffer cannot be allocated, otherwise the value
// returned by ConvertWRGBToYUV(). All working buffers are released before
// returning.
static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
                            const uint8_t* b_ptr, int rgb_step, int rgb_stride,
                            int rgb_bit_depth, uint8_t* y_ptr, int y_stride,
                            uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
                            int v_stride, int yuv_bit_depth, int width,
                            int height,
                            const SharpYuvConversionMatrix* yuv_matrix,
                            SharpYuvTransferFunctionType transfer_type) {
  // Working dimensions: the right/bottom border is expanded to even sizes.
  const int w = (width + 1) & ~1;
  const int h = (height + 1) & ~1;
  const int uv_w = w >> 1;
  const int uv_h = h >> 1;
  const int y_bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
  // A pass whose accumulated luma difference is below this ends the loop.
  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
  uint64_t prev_diff_y_sum = ~0;
  int row, pass;
  int ok = 0;

  // TODO(skal): allocate one big memory chunk. But for now, it's easier
  //             for valgrind debugging to have several chunks.
  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);  // scratch
  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
  // Cursors walking through the full-size buffers.
  fixed_y_t* y_cur = best_y_base;
  fixed_y_t* y_ref = target_y_base;
  fixed_t* uv_cur = best_uv_base;
  fixed_t* uv_ref = target_uv_base;
  assert(w > 0);
  assert(h > 0);

  if (tmp_buffer == NULL || best_y_base == NULL || target_y_base == NULL ||
      best_rgb_y == NULL || best_uv_base == NULL || target_uv_base == NULL ||
      best_rgb_uv == NULL) {
    goto End;
  }

  // Stage 1: import RGB samples to W/RGB representation.
  for (row = 0; row < height; row += 2) {
    const int is_last_row = (row == height - 1);
    // The scratch buffer holds two rows of three planes each.
    fixed_y_t* const src1 = tmp_buffer;
    fixed_y_t* const src2 = tmp_buffer + 3 * w;

    // Prepare two rows of input. When the picture height is odd, the last
    // row is duplicated.
    ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width, src1);
    if (is_last_row) {
      memcpy(src2, src1, 3 * w * sizeof(*src2));
    } else {
      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
                   rgb_step, rgb_bit_depth, width, src2);
    }
    StoreGray(src1, y_cur, w);
    StoreGray(src2, y_cur + w, w);

    UpdateW(src1, y_ref, w, rgb_bit_depth, transfer_type);
    UpdateW(src2, y_ref + w, w, rgb_bit_depth, transfer_type);
    UpdateChroma(src1, src2, uv_ref, uv_w, rgb_bit_depth, transfer_type);
    // The initial chroma estimate is the target itself.
    memcpy(uv_cur, uv_ref, 3 * uv_w * sizeof(*uv_cur));

    y_cur += 2 * w;
    y_ref += 2 * w;
    uv_cur += 3 * uv_w;
    uv_ref += 3 * uv_w;
    r_ptr += 2 * rgb_stride;
    g_ptr += 2 * rgb_stride;
    b_ptr += 2 * rgb_stride;
  }

  // Stage 2: iterate and resolve clipping conflicts.
  for (pass = 0; pass < kNumIterations; ++pass) {
    const fixed_t* cur_uv = best_uv_base;
    const fixed_t* prev_uv = best_uv_base;
    uint64_t diff_y_sum = 0;

    y_cur = best_y_base;
    y_ref = target_y_base;
    uv_cur = best_uv_base;
    uv_ref = target_uv_base;
    row = 0;
    do {
      fixed_y_t* const src1 = tmp_buffer;
      fixed_y_t* const src2 = tmp_buffer + 3 * w;
      // On the last pair of rows there is no next chroma row: reuse the
      // current one.
      const fixed_t* const next_uv = cur_uv + ((row < h - 2) ? 3 * uv_w : 0);

      InterpolateTwoRows(y_cur, prev_uv, cur_uv, next_uv, w, src1, src2,
                         rgb_bit_depth);
      prev_uv = cur_uv;
      cur_uv = next_uv;

      UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth, transfer_type);
      UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth, transfer_type);
      UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth,
                   transfer_type);

      // Update two rows of Y and one row of RGB.
      diff_y_sum +=
          SharpYuvUpdateY(y_ref, best_rgb_y, y_cur, 2 * w, y_bit_depth);
      SharpYuvUpdateRGB(uv_ref, best_rgb_uv, uv_cur, 3 * uv_w);

      y_cur += 2 * w;
      y_ref += 2 * w;
      uv_cur += 3 * uv_w;
      uv_ref += 3 * uv_w;
      row += 2;
    } while (row < h);

    // Exit test, skipped on the first pass: stop when the difference is small
    // enough or when it got larger than on the previous pass.
    if (pass > 0 &&
        (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum)) {
      break;
    }
    prev_diff_y_sum = diff_y_sum;
  }

  // Stage 3: final reconstruction.
  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr,
                        u_stride, v_ptr, v_stride, rgb_bit_depth,
                        yuv_bit_depth, width, height, yuv_matrix);

 End:
  free(best_y_base);
  free(best_uv_base);
  free(target_y_base);
  free(target_uv_base);
  free(best_rgb_y);
  free(best_rgb_uv);
  free(tmp_buffer);
  return ok;
}

#undef SAFE_ALLOC
439*b2055c35SXin Li
// Locking helpers for one-time initialization.
// With pthread support (WEBP_USE_THREAD defined, not on _WIN32):
//   LOCK_ACCESS declares a function-local static mutex and locks it; the
//   enclosing function returns if locking fails.
//   UNLOCK_ACCESS_AND_RETURN unlocks that mutex and returns.
// Otherwise LOCK_ACCESS does nothing and UNLOCK_ACCESS_AND_RETURN is a plain
// 'return'. Both are meant to be used inside a function returning void.
#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
#include <pthread.h>  // NOLINT

#define LOCK_ACCESS                                                  \
  static pthread_mutex_t sharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; \
  if (pthread_mutex_lock(&sharpyuv_lock)) return
#define UNLOCK_ACCESS_AND_RETURN                  \
  do {                                            \
    (void)pthread_mutex_unlock(&sharpyuv_lock);   \
    return;                                       \
  } while (0)
#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
#define LOCK_ACCESS do {} while (0)
#define UNLOCK_ACCESS_AND_RETURN return
#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
455*b2055c35SXin Li
456*b2055c35SXin Li // Hidden exported init function.
457*b2055c35SXin Li // By default SharpYuvConvert calls it with SharpYuvGetCPUInfo. If needed,
458*b2055c35SXin Li // users can declare it as extern and call it with an alternate VP8CPUInfo
459*b2055c35SXin Li // function.
460*b2055c35SXin Li extern VP8CPUInfo SharpYuvGetCPUInfo;
461*b2055c35SXin Li SHARPYUV_EXTERN void SharpYuvInit(VP8CPUInfo cpu_info_func);
SharpYuvInit(VP8CPUInfo cpu_info_func)462*b2055c35SXin Li void SharpYuvInit(VP8CPUInfo cpu_info_func) {
463*b2055c35SXin Li static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used =
464*b2055c35SXin Li (VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
465*b2055c35SXin Li LOCK_ACCESS;
466*b2055c35SXin Li // Only update SharpYuvGetCPUInfo when called from external code to avoid a
467*b2055c35SXin Li // race on reading the value in SharpYuvConvert().
468*b2055c35SXin Li if (cpu_info_func != (VP8CPUInfo)&SharpYuvGetCPUInfo) {
469*b2055c35SXin Li SharpYuvGetCPUInfo = cpu_info_func;
470*b2055c35SXin Li }
471*b2055c35SXin Li if (sharpyuv_last_cpuinfo_used == SharpYuvGetCPUInfo) {
472*b2055c35SXin Li UNLOCK_ACCESS_AND_RETURN;
473*b2055c35SXin Li }
474*b2055c35SXin Li
475*b2055c35SXin Li SharpYuvInitDsp();
476*b2055c35SXin Li SharpYuvInitGammaTables();
477*b2055c35SXin Li
478*b2055c35SXin Li sharpyuv_last_cpuinfo_used = SharpYuvGetCPUInfo;
479*b2055c35SXin Li UNLOCK_ACCESS_AND_RETURN;
480*b2055c35SXin Li }
481*b2055c35SXin Li
SharpYuvConvert(const void * r_ptr,const void * g_ptr,const void * b_ptr,int rgb_step,int rgb_stride,int rgb_bit_depth,void * y_ptr,int y_stride,void * u_ptr,int u_stride,void * v_ptr,int v_stride,int yuv_bit_depth,int width,int height,const SharpYuvConversionMatrix * yuv_matrix)482*b2055c35SXin Li int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr,
483*b2055c35SXin Li int rgb_step, int rgb_stride, int rgb_bit_depth,
484*b2055c35SXin Li void* y_ptr, int y_stride, void* u_ptr, int u_stride,
485*b2055c35SXin Li void* v_ptr, int v_stride, int yuv_bit_depth, int width,
486*b2055c35SXin Li int height, const SharpYuvConversionMatrix* yuv_matrix) {
487*b2055c35SXin Li SharpYuvOptions options;
488*b2055c35SXin Li options.yuv_matrix = yuv_matrix;
489*b2055c35SXin Li options.transfer_type = kSharpYuvTransferFunctionSrgb;
490*b2055c35SXin Li return SharpYuvConvertWithOptions(
491*b2055c35SXin Li r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride, rgb_bit_depth, y_ptr, y_stride,
492*b2055c35SXin Li u_ptr, u_stride, v_ptr, v_stride, yuv_bit_depth, width, height, &options);
493*b2055c35SXin Li }
494*b2055c35SXin Li
// Initializes 'options' with 'yuv_matrix' and the sRGB transfer function.
// 'version' carries the caller's major version in bits 24 and up and its
// minor version in bits 16..23.
// Returns 0 without touching 'options' when a pointer is NULL, when the major
// version differs from SHARPYUV_VERSION_MAJOR, or when the major version is 0
// and the minor version differs from SHARPYUV_VERSION_MINOR. Returns 1 on
// success.
int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix* yuv_matrix,
                                SharpYuvOptions* options, int version) {
  const int major = (version >> 24);
  const int minor = (version >> 16) & 0xff;
  if (options == NULL || yuv_matrix == NULL) {
    return 0;
  }
  if (major != SHARPYUV_VERSION_MAJOR) {
    return 0;
  }
  // For major version 0 the minor version must match too.
  if (major == 0 && minor != SHARPYUV_VERSION_MINOR) {
    return 0;
  }
  options->yuv_matrix = yuv_matrix;
  options->transfer_type = kSharpYuvTransferFunctionSrgb;
  return 1;
}
509*b2055c35SXin Li
SharpYuvConvertWithOptions(const void * r_ptr,const void * g_ptr,const void * b_ptr,int rgb_step,int rgb_stride,int rgb_bit_depth,void * y_ptr,int y_stride,void * u_ptr,int u_stride,void * v_ptr,int v_stride,int yuv_bit_depth,int width,int height,const SharpYuvOptions * options)510*b2055c35SXin Li int SharpYuvConvertWithOptions(const void* r_ptr, const void* g_ptr,
511*b2055c35SXin Li const void* b_ptr, int rgb_step, int rgb_stride,
512*b2055c35SXin Li int rgb_bit_depth, void* y_ptr, int y_stride,
513*b2055c35SXin Li void* u_ptr, int u_stride, void* v_ptr,
514*b2055c35SXin Li int v_stride, int yuv_bit_depth, int width,
515*b2055c35SXin Li int height, const SharpYuvOptions* options) {
516*b2055c35SXin Li const SharpYuvConversionMatrix* yuv_matrix = options->yuv_matrix;
517*b2055c35SXin Li SharpYuvTransferFunctionType transfer_type = options->transfer_type;
518*b2055c35SXin Li SharpYuvConversionMatrix scaled_matrix;
519*b2055c35SXin Li const int rgb_max = (1 << rgb_bit_depth) - 1;
520*b2055c35SXin Li const int rgb_round = 1 << (rgb_bit_depth - 1);
521*b2055c35SXin Li const int yuv_max = (1 << yuv_bit_depth) - 1;
522*b2055c35SXin Li const int sfix = GetPrecisionShift(rgb_bit_depth);
523*b2055c35SXin Li
524*b2055c35SXin Li if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX ||
525*b2055c35SXin Li r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL ||
526*b2055c35SXin Li u_ptr == NULL || v_ptr == NULL) {
527*b2055c35SXin Li return 0;
528*b2055c35SXin Li }
529*b2055c35SXin Li if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 &&
530*b2055c35SXin Li rgb_bit_depth != 16) {
531*b2055c35SXin Li return 0;
532*b2055c35SXin Li }
533*b2055c35SXin Li if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
534*b2055c35SXin Li return 0;
535*b2055c35SXin Li }
536*b2055c35SXin Li if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride % 2 != 0)) {
537*b2055c35SXin Li // Step/stride should be even for uint16_t buffers.
538*b2055c35SXin Li return 0;
539*b2055c35SXin Li }
540*b2055c35SXin Li if (yuv_bit_depth > 8 &&
541*b2055c35SXin Li (y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) {
542*b2055c35SXin Li // Stride should be even for uint16_t buffers.
543*b2055c35SXin Li return 0;
544*b2055c35SXin Li }
545*b2055c35SXin Li // The address of the function pointer is used to avoid a read race.
546*b2055c35SXin Li SharpYuvInit((VP8CPUInfo)&SharpYuvGetCPUInfo);
547*b2055c35SXin Li
548*b2055c35SXin Li // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
549*b2055c35SXin Li // rgb->yuv conversion matrix.
550*b2055c35SXin Li if (rgb_bit_depth == yuv_bit_depth) {
551*b2055c35SXin Li memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix));
552*b2055c35SXin Li } else {
553*b2055c35SXin Li int i;
554*b2055c35SXin Li for (i = 0; i < 3; ++i) {
555*b2055c35SXin Li scaled_matrix.rgb_to_y[i] =
556*b2055c35SXin Li (yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max;
557*b2055c35SXin Li scaled_matrix.rgb_to_u[i] =
558*b2055c35SXin Li (yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max;
559*b2055c35SXin Li scaled_matrix.rgb_to_v[i] =
560*b2055c35SXin Li (yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max;
561*b2055c35SXin Li }
562*b2055c35SXin Li }
563*b2055c35SXin Li // Also incorporate precision change scaling.
564*b2055c35SXin Li scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix);
565*b2055c35SXin Li scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
566*b2055c35SXin Li scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
567*b2055c35SXin Li
568*b2055c35SXin Li return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
569*b2055c35SXin Li rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
570*b2055c35SXin Li v_ptr, v_stride, yuv_bit_depth, width, height,
571*b2055c35SXin Li &scaled_matrix, transfer_type);
572*b2055c35SXin Li }
573*b2055c35SXin Li
574*b2055c35SXin Li //------------------------------------------------------------------------------
575