xref: /aosp_15_r20/external/webp/src/dsp/yuv_sse2.c (revision b2055c353e87c8814eb2b6b1b11112a1562253bd)
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal ([email protected])
13 
14 #include "src/dsp/yuv.h"
15 
16 #if defined(WEBP_USE_SSE2)
17 
18 #include <stdlib.h>
19 #include <emmintrin.h>
20 
21 #include "src/dsp/common_sse2.h"
22 #include "src/utils/utils.h"
23 
24 //-----------------------------------------------------------------------------
25 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
26 
// Converts eight YUV444 samples to R/G/B.
//
// The constants are a 14b fixed-point version of the ITU-R BT.601 ones:
//   R = (19077 * y             + 26149 * v - 14234) >> 6
//   G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
//   B = (19077 * y + 33050 * u             - 17685) >> 6
//
// Y0, U0, V0: eight unsigned 16b words each. The loaders in this file place
//             the 8b sample in the upper byte of each word (sample << 8).
// R, G, B:    receive eight 16b results each. The values are NOT clamped to
//             [0, 255] here; the packing step is expected to saturate them.
static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
                                    const __m128i* const U0,
                                    const __m128i* const V0,
                                    __m128i* const R,
                                    __m128i* const G,
                                    __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  // Luma contribution, shared by the three channels.
  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  // R = Y1 - 14234 + 26149 * v
  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  // G = Y1 + 8708 - (6419 * u + 13320 * v)
  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // R and G are signed, so they use an arithmetic shift.
  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
69 
70 // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
Load_HI_16_SSE2(const uint8_t * src)71 static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
72   const __m128i zero = _mm_setzero_si128();
73   return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
74 }
75 
76 // Load and replicate the U/V samples
Load_UV_HI_8_SSE2(const uint8_t * src)77 static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
78   const __m128i zero = _mm_setzero_si128();
79   const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src));
80   const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
81   return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
82 }
83 
84 // Convert 32 samples of YUV444 to R/G/B
YUV444ToRGB_SSE2(const uint8_t * const y,const uint8_t * const u,const uint8_t * const v,__m128i * const R,__m128i * const G,__m128i * const B)85 static void YUV444ToRGB_SSE2(const uint8_t* const y,
86                              const uint8_t* const u,
87                              const uint8_t* const v,
88                              __m128i* const R, __m128i* const G,
89                              __m128i* const B) {
90   const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
91                 V0 = Load_HI_16_SSE2(v);
92   ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
93 }
94 
95 // Convert 32 samples of YUV420 to R/G/B
YUV420ToRGB_SSE2(const uint8_t * const y,const uint8_t * const u,const uint8_t * const v,__m128i * const R,__m128i * const G,__m128i * const B)96 static void YUV420ToRGB_SSE2(const uint8_t* const y,
97                              const uint8_t* const u,
98                              const uint8_t* const v,
99                              __m128i* const R, __m128i* const G,
100                              __m128i* const B) {
101   const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
102                 V0 = Load_UV_HI_8_SSE2(v);
103   ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
104 }
105 
106 // Pack R/G/B/A results into 32b output.
PackAndStore4_SSE2(const __m128i * const R,const __m128i * const G,const __m128i * const B,const __m128i * const A,uint8_t * const dst)107 static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
108                                            const __m128i* const G,
109                                            const __m128i* const B,
110                                            const __m128i* const A,
111                                            uint8_t* const dst) {
112   const __m128i rb = _mm_packus_epi16(*R, *B);
113   const __m128i ga = _mm_packus_epi16(*G, *A);
114   const __m128i rg = _mm_unpacklo_epi8(rb, ga);
115   const __m128i ba = _mm_unpackhi_epi8(rb, ga);
116   const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
117   const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
118   _mm_storeu_si128((__m128i*)(dst +  0), RGBA_lo);
119   _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi);
120 }
121 
122 // Pack R/G/B/A results into 16b output.
PackAndStore4444_SSE2(const __m128i * const R,const __m128i * const G,const __m128i * const B,const __m128i * const A,uint8_t * const dst)123 static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
124                                               const __m128i* const G,
125                                               const __m128i* const B,
126                                               const __m128i* const A,
127                                               uint8_t* const dst) {
128 #if (WEBP_SWAP_16BIT_CSP == 0)
129   const __m128i rg0 = _mm_packus_epi16(*R, *G);
130   const __m128i ba0 = _mm_packus_epi16(*B, *A);
131 #else
132   const __m128i rg0 = _mm_packus_epi16(*B, *A);
133   const __m128i ba0 = _mm_packus_epi16(*R, *G);
134 #endif
135   const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
136   const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0);  // rbrbrbrbrb...
137   const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0);  // gagagagaga...
138   const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
139   const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4);
140   const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
141   _mm_storeu_si128((__m128i*)dst, rgba4444);
142 }
143 
144 // Pack R/G/B results into 16b output.
PackAndStore565_SSE2(const __m128i * const R,const __m128i * const G,const __m128i * const B,uint8_t * const dst)145 static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
146                                              const __m128i* const G,
147                                              const __m128i* const B,
148                                              uint8_t* const dst) {
149   const __m128i r0 = _mm_packus_epi16(*R, *R);
150   const __m128i g0 = _mm_packus_epi16(*G, *G);
151   const __m128i b0 = _mm_packus_epi16(*B, *B);
152   const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8((char)0xf8));
153   const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
154   const __m128i g1 =
155       _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8((char)0xe0)), 5);
156   const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
157   const __m128i rg = _mm_or_si128(r1, g1);
158   const __m128i gb = _mm_or_si128(g2, b1);
159 #if (WEBP_SWAP_16BIT_CSP == 0)
160   const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
161 #else
162   const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
163 #endif
164   _mm_storeu_si128((__m128i*)dst, rgb565);
165 }
166 
167 // Pack the planar buffers
168 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
169 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
PlanarTo24b_SSE2(__m128i * const in0,__m128i * const in1,__m128i * const in2,__m128i * const in3,__m128i * const in4,__m128i * const in5,uint8_t * const rgb)170 static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
171                                          __m128i* const in2, __m128i* const in3,
172                                          __m128i* const in4, __m128i* const in5,
173                                          uint8_t* const rgb) {
174   // The input is 6 registers of sixteen 8b but for the sake of explanation,
175   // let's take 6 registers of four 8b values.
176   // To pack, we will keep taking one every two 8b integer and move it
177   // around as follows:
178   // Input:
179   //   r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
180   // Split the 6 registers in two sets of 3 registers: the first set as the even
181   // 8b bytes, the second the odd ones:
182   //   r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
183   // Repeat the same permutations twice more:
184   //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
185   //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
186   VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
187 
188   _mm_storeu_si128((__m128i*)(rgb +  0), *in0);
189   _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
190   _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
191   _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
192   _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
193   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
194 }
195 
// Converts a span of 32 YUV444 pixels (32 bytes read from each of 'y', 'u'
// and 'v') to RGBA with alpha forced to 255. 128 bytes are written to 'dst'.
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  // 8 pixels (32 output bytes) per iteration.
  for (n = 0; n < 32; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
  }
}
206 
// Converts a span of 32 YUV444 pixels (32 bytes read from each of 'y', 'u'
// and 'v') to BGRA with alpha forced to 255. 128 bytes are written to 'dst'.
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  // 8 pixels (32 output bytes) per iteration.
  for (n = 0; n < 32; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    // B and R are swapped to get the BGRA byte order.
    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
  }
}
217 
// Converts a span of 32 YUV444 pixels (32 bytes read from each of 'y', 'u'
// and 'v') to ARGB with alpha forced to 255. 128 bytes are written to 'dst'.
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  // 8 pixels (32 output bytes) per iteration.
  for (n = 0; n < 32; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    // Alpha goes first to get the ARGB byte order.
    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
  }
}
228 
// Converts a span of 32 YUV444 pixels (32 bytes read from each of 'y', 'u'
// and 'v') to 16b RGBA4444 with alpha built from 255. 64 bytes are written to
// 'dst'.
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
                             const uint8_t* v, uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  // 8 pixels (16 output bytes) per iteration.
  for (n = 0; n < 32; n += 8, dst += 16) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
  }
}
239 
// Converts a span of 32 YUV444 pixels (32 bytes read from each of 'y', 'u'
// and 'v') to 16b RGB565. 64 bytes are written to 'dst'.
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                           uint8_t* dst) {
  int n;
  // 8 pixels (16 output bytes) per iteration.
  for (n = 0; n < 32; n += 8, dst += 16) {
    __m128i R, G, B;
    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    PackAndStore565_SSE2(&R, &G, &B, dst);
  }
}
249 
// Converts a span of 32 YUV444 pixels (32 bytes read from each of 'y', 'u'
// and 'v') to packed 24b RGB. 96 bytes are written to 'dst'.
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

  // Four groups of 8 pixels.
  YUV444ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
  YUV444ToRGB_SSE2(y +  8, u +  8, v +  8, &R1, &G1, &B1);
  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);

  // Cast to 8b and store as RRRRGGGGBBBB.
  rgb0 = _mm_packus_epi16(R0, R1);
  rgb1 = _mm_packus_epi16(R2, R3);
  rgb2 = _mm_packus_epi16(G0, G1);
  rgb3 = _mm_packus_epi16(G2, G3);
  rgb4 = _mm_packus_epi16(B0, B1);
  rgb5 = _mm_packus_epi16(B2, B3);

  // Pack as RGBRGBRGBRGB.
  PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
271 
// Converts a span of 32 YUV444 pixels (32 bytes read from each of 'y', 'u'
// and 'v') to packed 24b BGR. 96 bytes are written to 'dst'.
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

  // Four groups of 8 pixels.
  YUV444ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
  YUV444ToRGB_SSE2(y +  8, u +  8, v +  8, &R1, &G1, &B1);
  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);

  // Cast to 8b and store as BBBBGGGGRRRR.
  bgr0 = _mm_packus_epi16(B0, B1);
  bgr1 = _mm_packus_epi16(B2, B3);
  bgr2 = _mm_packus_epi16(G0, G1);
  bgr3 = _mm_packus_epi16(G2, G3);
  bgr4 = _mm_packus_epi16(R0, R1);
  bgr5 = _mm_packus_epi16(R2, R3);

  // Pack as BGRBGRBGRBGR.
  PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
293 
294 //-----------------------------------------------------------------------------
295 // Arbitrary-length row conversion functions
296 
// Converts a row of 'len' pixels to RGBA (alpha = 255), writing 4 * len bytes
// to 'dst'. One u/v sample is consumed for every two y samples.
// Groups of 8 pixels go through the SSE2 path; the remaining pixels are
// converted one at a time with VP8YuvToRgba().
static void YuvToRgbaRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
    y += 8;
    u += 4;
    v += 4;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToRgba(y[0], u[0], v[0], dst);
    dst += 4;
    y += 1;
    // 'n' is even when this loop starts, so u/v advance after each odd pixel.
    u += (n & 1);
    v += (n & 1);
  }
}
318 
// Converts a row of 'len' pixels to BGRA (alpha = 255), writing 4 * len bytes
// to 'dst'. One u/v sample is consumed for every two y samples.
// Groups of 8 pixels go through the SSE2 path; the remaining pixels are
// converted one at a time with VP8YuvToBgra().
static void YuvToBgraRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
    y += 8;
    u += 4;
    v += 4;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToBgra(y[0], u[0], v[0], dst);
    dst += 4;
    y += 1;
    // 'n' is even when this loop starts, so u/v advance after each odd pixel.
    u += (n & 1);
    v += (n & 1);
  }
}
340 
// Converts a row of 'len' pixels to ARGB (alpha = 255), writing 4 * len bytes
// to 'dst'. One u/v sample is consumed for every two y samples.
// Groups of 8 pixels go through the SSE2 path; the remaining pixels are
// converted one at a time with VP8YuvToArgb().
static void YuvToArgbRow_SSE2(const uint8_t* y,
                              const uint8_t* u, const uint8_t* v,
                              uint8_t* dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    __m128i R, G, B;
    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
    y += 8;
    u += 4;
    v += 4;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToArgb(y[0], u[0], v[0], dst);
    dst += 4;
    y += 1;
    // 'n' is even when this loop starts, so u/v advance after each odd pixel.
    u += (n & 1);
    v += (n & 1);
  }
}
362 
// Converts a row of 'len' pixels to packed 24b RGB, writing 3 * len bytes to
// 'dst'. One u/v sample is consumed for every two y samples.
// Groups of 32 pixels go through the SSE2 path; the remaining pixels are
// converted one at a time with VP8YuvToRgb().
static void YuvToRgbRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

    // Four groups of 8 pixels, each using 4 chroma samples.
    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);

    // Cast to 8b and store as RRRRGGGGBBBB.
    rgb0 = _mm_packus_epi16(R0, R1);
    rgb1 = _mm_packus_epi16(R2, R3);
    rgb2 = _mm_packus_epi16(G0, G1);
    rgb3 = _mm_packus_epi16(G2, G3);
    rgb4 = _mm_packus_epi16(B0, B1);
    rgb5 = _mm_packus_epi16(B2, B3);

    // Pack as RGBRGBRGBRGB.
    PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);

    y += 32;
    u += 16;
    v += 16;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToRgb(y[0], u[0], v[0], dst);
    dst += 3;
    y += 1;
    // 'n' is even when this loop starts, so u/v advance after each odd pixel.
    u += (n & 1);
    v += (n & 1);
  }
}
399 
// Converts a row of 'len' pixels to packed 24b BGR, writing 3 * len bytes to
// 'dst'. One u/v sample is consumed for every two y samples.
// Groups of 32 pixels go through the SSE2 path; the remaining pixels are
// converted one at a time with VP8YuvToBgr().
static void YuvToBgrRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

    // Four groups of 8 pixels, each using 4 chroma samples.
    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);

    // Cast to 8b and store as BBBBGGGGRRRR.
    bgr0 = _mm_packus_epi16(B0, B1);
    bgr1 = _mm_packus_epi16(B2, B3);
    bgr2 = _mm_packus_epi16(G0, G1);
    bgr3 = _mm_packus_epi16(G2, G3);
    bgr4 = _mm_packus_epi16(R0, R1);
    bgr5 = _mm_packus_epi16(R2, R3);

    // Pack as BGRBGRBGRBGR.
    PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);

    y += 32;
    u += 16;
    v += 16;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToBgr(y[0], u[0], v[0], dst);
    dst += 3;
    y += 1;
    // 'n' is even when this loop starts, so u/v advance after each odd pixel.
    u += (n & 1);
    v += (n & 1);
  }
}
436 
437 //------------------------------------------------------------------------------
438 // Entry point
439 
440 extern void WebPInitSamplersSSE2(void);
441 
WebPInitSamplersSSE2(void)442 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
443   WebPSamplers[MODE_RGB]  = YuvToRgbRow_SSE2;
444   WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
445   WebPSamplers[MODE_BGR]  = YuvToBgrRow_SSE2;
446   WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
447   WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
448 }
449 
450 //------------------------------------------------------------------------------
451 // RGB24/32 -> YUV converters
452 
// Load eight 16b-words from *src (unaligned access is allowed).
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words into *dst (unaligned access is allowed).
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
457 
458 // Function that inserts a value of the second half of the in buffer in between
459 // every two char of the first half.
RGB24PackedToPlanarHelper_SSE2(const __m128i * const in,__m128i * const out)460 static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
461     const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
462   out[0] = _mm_unpacklo_epi8(in[0], in[3]);
463   out[1] = _mm_unpackhi_epi8(in[0], in[3]);
464   out[2] = _mm_unpacklo_epi8(in[1], in[4]);
465   out[3] = _mm_unpackhi_epi8(in[1], in[4]);
466   out[4] = _mm_unpacklo_epi8(in[2], in[5]);
467   out[5] = _mm_unpackhi_epi8(in[2], in[5]);
468 }
469 
470 // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
471 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
472 // Similar to PlanarTo24bHelper(), but in reverse order.
RGB24PackedToPlanar_SSE2(const uint8_t * const rgb,__m128i * const out)473 static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
474     const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
475   __m128i tmp[6];
476   tmp[0] = _mm_loadu_si128((const __m128i*)(rgb +  0));
477   tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
478   tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32));
479   tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48));
480   tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
481   tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
482 
483   RGB24PackedToPlanarHelper_SSE2(tmp, out);
484   RGB24PackedToPlanarHelper_SSE2(out, tmp);
485   RGB24PackedToPlanarHelper_SSE2(tmp, out);
486   RGB24PackedToPlanarHelper_SSE2(out, tmp);
487   RGB24PackedToPlanarHelper_SSE2(tmp, out);
488 }
489 
490 // Convert 8 packed ARGB to r[], g[], b[]
RGB32PackedToPlanar_SSE2(const uint32_t * const argb,__m128i * const rgb)491 static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
492                                                  __m128i* const rgb /*in[6]*/) {
493   const __m128i zero = _mm_setzero_si128();
494   __m128i a0 = LOAD_16(argb + 0);
495   __m128i a1 = LOAD_16(argb + 4);
496   __m128i a2 = LOAD_16(argb + 8);
497   __m128i a3 = LOAD_16(argb + 12);
498   VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
499   rgb[0] = _mm_unpacklo_epi8(a1, zero);
500   rgb[1] = _mm_unpackhi_epi8(a1, zero);
501   rgb[2] = _mm_unpacklo_epi8(a2, zero);
502   rgb[3] = _mm_unpackhi_epi8(a2, zero);
503   rgb[4] = _mm_unpacklo_epi8(a3, zero);
504   rgb[5] = _mm_unpackhi_epi8(a3, zero);
505 }
506 
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// It's a macro and not a function because we need to use an immediate value
// (DESCALE_FIX) as the shift count of srai_epi32.
// RG_* / GB_* hold interleaved pairs of 16b values, MULT_* the matching pairs
// of 16b multipliers and ROUNDER four 32b values. The eight 32b results are
// packed with signed saturation into the eight 16b lanes of OUT.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
                  ROUNDER, DESCALE_FIX, OUT) do {               \
  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG);         \
  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG);         \
  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB);         \
  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB);         \
  const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo);            \
  const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi);            \
  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER);          \
  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER);          \
  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX);     \
  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX);     \
  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                        \
} while (0)

// Builds a register of alternating 16b constants: A in the even lanes
// (starting with lane 0) and B in the odd lanes.
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
ConvertRGBToY_SSE2(const __m128i * const R,const __m128i * const G,const __m128i * const B,__m128i * const Y)526 static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
527                                            const __m128i* const G,
528                                            const __m128i* const B,
529                                            __m128i* const Y) {
530   const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
531   const __m128i kGB_y = MK_CST_16(16384, 6420);
532   const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
533 
534   const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
535   const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
536   const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
537   const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
538   TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
539 }
540 
ConvertRGBToUV_SSE2(const __m128i * const R,const __m128i * const G,const __m128i * const B,__m128i * const U,__m128i * const V)541 static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
542                                             const __m128i* const G,
543                                             const __m128i* const B,
544                                             __m128i* const U,
545                                             __m128i* const V) {
546   const __m128i kRG_u = MK_CST_16(-9719, -19081);
547   const __m128i kGB_u = MK_CST_16(0, 28800);
548   const __m128i kRG_v = MK_CST_16(28800, 0);
549   const __m128i kGB_v = MK_CST_16(-24116, -4684);
550   const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
551 
552   const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
553   const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
554   const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
555   const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
556   TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
557             kHALF_UV, YUV_FIX + 2, *U);
558   TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
559             kHALF_UV, YUV_FIX + 2, *V);
560 }
561 
562 #undef MK_CST_16
563 #undef TRANSFORM
564 
ConvertRGB24ToY_SSE2(const uint8_t * rgb,uint8_t * y,int width)565 static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
566   const int max_width = width & ~31;
567   int i;
568   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
569     __m128i rgb_plane[6];
570     int j;
571 
572     RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
573 
574     for (j = 0; j < 2; ++j, i += 16) {
575       const __m128i zero = _mm_setzero_si128();
576       __m128i r, g, b, Y0, Y1;
577 
578       // Convert to 16-bit Y.
579       r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
580       g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
581       b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
582       ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
583 
584       // Convert to 16-bit Y.
585       r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
586       g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
587       b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
588       ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
589 
590       // Cast to 8-bit and store.
591       STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
592     }
593   }
594   for (; i < width; ++i, rgb += 3) {   // left-over
595     y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
596   }
597 }
598 
ConvertBGR24ToY_SSE2(const uint8_t * bgr,uint8_t * y,int width)599 static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
600   const int max_width = width & ~31;
601   int i;
602   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
603     __m128i bgr_plane[6];
604     int j;
605 
606     RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
607 
608     for (j = 0; j < 2; ++j, i += 16) {
609       const __m128i zero = _mm_setzero_si128();
610       __m128i r, g, b, Y0, Y1;
611 
612       // Convert to 16-bit Y.
613       b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
614       g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
615       r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
616       ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
617 
618       // Convert to 16-bit Y.
619       b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
620       g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
621       r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
622       ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
623 
624       // Cast to 8-bit and store.
625       STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
626     }
627   }
628   for (; i < width; ++i, bgr += 3) {  // left-over
629     y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
630   }
631 }
632 
ConvertARGBToY_SSE2(const uint32_t * argb,uint8_t * y,int width)633 static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
634   const int max_width = width & ~15;
635   int i;
636   for (i = 0; i < max_width; i += 16) {
637     __m128i Y0, Y1, rgb[6];
638     RGB32PackedToPlanar_SSE2(&argb[i], rgb);
639     ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
640     ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
641     STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
642   }
643   for (; i < width; ++i) {   // left-over
644     const uint32_t p = argb[i];
645     y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
646                      YUV_HALF);
647   }
648 }
649 
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
// The four sums coming from *A fill the lower half of *out and the four sums
// coming from *B fill the upper half (packed with signed saturation).
// 'out' may alias 'A' or 'B': both inputs are read before *out is written.
static void HorizontalAddPack_SSE2(const __m128i* const A,
                                   const __m128i* const B,
                                   __m128i* const out) {
  const __m128i k2 = _mm_set1_epi16(2);
  const __m128i C = _mm_madd_epi16(*A, k2);   // 2 * a[2i] + 2 * a[2i + 1]
  const __m128i D = _mm_madd_epi16(*B, k2);
  *out = _mm_packs_epi32(C, D);
}
660 
ConvertARGBToUV_SSE2(const uint32_t * argb,uint8_t * u,uint8_t * v,int src_width,int do_store)661 static void ConvertARGBToUV_SSE2(const uint32_t* argb,
662                                  uint8_t* u, uint8_t* v,
663                                  int src_width, int do_store) {
664   const int max_width = src_width & ~31;
665   int i;
666   for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
667     __m128i rgb[6], U0, V0, U1, V1;
668     RGB32PackedToPlanar_SSE2(&argb[i], rgb);
669     HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
670     HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
671     HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
672     ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
673 
674     RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
675     HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
676     HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
677     HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
678     ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
679 
680     U0 = _mm_packus_epi16(U0, U1);
681     V0 = _mm_packus_epi16(V0, V1);
682     if (!do_store) {
683       const __m128i prev_u = LOAD_16(u);
684       const __m128i prev_v = LOAD_16(v);
685       U0 = _mm_avg_epu8(U0, prev_u);
686       V0 = _mm_avg_epu8(V0, prev_v);
687     }
688     STORE_16(U0, u);
689     STORE_16(V0, v);
690   }
691   if (i < src_width) {  // left-over
692     WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
693   }
694 }
695 
696 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
RGBA32PackedToPlanar_16b_SSE2(const uint16_t * const rgbx,__m128i * const r,__m128i * const g,__m128i * const b)697 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
698     const uint16_t* const rgbx,
699     __m128i* const r, __m128i* const g, __m128i* const b) {
700   const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
701   const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
702   const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
703   const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
704   // column-wise transpose
705   const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
706   const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
707   const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
708   const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
709   const __m128i B0 = _mm_unpacklo_epi16(A0, A1);  // r0 r1 r2 r3 | g0 g1 ..
710   const __m128i B1 = _mm_unpackhi_epi16(A0, A1);  // b0 b1 b2 b3 | x x x x
711   const __m128i B2 = _mm_unpacklo_epi16(A2, A3);  // r4 r5 r6 r7 | g4 g5 ..
712   const __m128i B3 = _mm_unpackhi_epi16(A2, A3);  // b4 b5 b6 b7 | x x x x
713   *r = _mm_unpacklo_epi64(B0, B2);
714   *g = _mm_unpackhi_epi64(B0, B2);
715   *b = _mm_unpacklo_epi64(B1, B3);
716 }
717 
// Converts 'width' groups of four 16b values (r, g, b, x) to 'width' u and v
// bytes. Groups of 16 go through the SSE2 path; the remaining ones are handed
// over to WebPConvertRGBA32ToUV_C().
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
                                   uint8_t* u, uint8_t* v, int width) {
  const int max_width = width & ~15;
  const uint16_t* const last_rgb = rgb + 4 * max_width;
  while (rgb < last_rgb) {
    __m128i r, g, b, U0, V0, U1, V1;
    // Two batches of 8 groups (32 16b words each).
    RGBA32PackedToPlanar_16b_SSE2(rgb +  0, &r, &g, &b);
    ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
    RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
    ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
    STORE_16(_mm_packus_epi16(U0, U1), u);
    STORE_16(_mm_packus_epi16(V0, V1), v);
    u += 16;
    v += 16;
    rgb += 2 * 32;
  }
  if (max_width < width) {  // left-over
    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
  }
}
738 
739 //------------------------------------------------------------------------------
740 
741 extern void WebPInitConvertARGBToYUVSSE2(void);
742 
WebPInitConvertARGBToYUVSSE2(void)743 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
744   WebPConvertARGBToY = ConvertARGBToY_SSE2;
745   WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
746 
747   WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
748   WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
749 
750   WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
751 }
752 
753 #else  // !WEBP_USE_SSE2
754 
755 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
756 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
757 
758 #endif  // WEBP_USE_SSE2
759