xref: /aosp_15_r20/external/webp/src/dsp/alpha_processing_sse41.c (revision b2055c353e87c8814eb2b6b1b11112a1562253bd)
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel, SSE4.1 variant.
//
// Author: Skal ([email protected])
13*b2055c35SXin Li 
14*b2055c35SXin Li #include "src/dsp/dsp.h"
15*b2055c35SXin Li 
16*b2055c35SXin Li #if defined(WEBP_USE_SSE41)
17*b2055c35SXin Li 
18*b2055c35SXin Li #include <smmintrin.h>
19*b2055c35SXin Li 
20*b2055c35SXin Li //------------------------------------------------------------------------------
21*b2055c35SXin Li 
// Copies byte 0 of every 4-byte pixel of 'argb' into 'alpha' over a
// 'width' x 'height' area. After each row the source advances by
// 'argb_stride' bytes and the destination by 'alpha_stride' bytes.
// Returns 1 if every copied value equals 0xff, and 0 otherwise.
static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb,
                              int argb_stride, int width, int height,
                              uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
  // Shuffle controls. Each one gathers bytes 0, 4, 8 and 12 of its input
  // register into a single 32-bit lane of the output (lane 0 for the first
  // control, lane 3 for the last) and zeroes the twelve remaining bytes.
  const __m128i kGatherLane0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                            -1, -1, -1, -1, 12, 8, 4, 0);
  const __m128i kGatherLane1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                            12, 8, 4, 0, -1, -1, -1, -1);
  const __m128i kGatherLane2 = _mm_set_epi8(-1, -1, -1, -1, 12, 8, 4, 0,
                                            -1, -1, -1, -1, -1, -1, -1, -1);
  const __m128i kGatherLane3 = _mm_set_epi8(12, 8, 4, 0, -1, -1, -1, -1,
                                            -1, -1, -1, -1, -1, -1, -1, -1);
  const __m128i kAllOnes = _mm_set1_epi32(~0);
  // The vector loop only runs while a full group of 16 pixels fits before
  // this bound, so at least the last pixel of a row is always left to the
  // scalar loop. The 64-byte loads may therefore start on the last byte of a
  // pixel and still stay within the 3 bytes that follow 'src[4 * width - 4]'.
  const int simd_end = (width - 1) & ~15;
  // Running 'and' of every value stored by the vector loop (16 byte lanes).
  __m128i simd_and = kAllOnes;
  // Running 'and' of every value stored by the scalar loop.
  uint32_t scalar_and = 0xff;
  const uint8_t* src_row = argb;
  uint8_t* dst_row = alpha;
  int x, y;

  for (y = 0; y < height; ++y) {
    for (x = 0; x < simd_end; x += 16) {
      // 16 pixels == 64 source bytes == four unaligned 128-bit loads.
      const __m128i* const in = (const __m128i*)(src_row + 4 * x);
      const __m128i lane0 = _mm_shuffle_epi8(_mm_loadu_si128(in + 0),
                                             kGatherLane0);
      const __m128i lane1 = _mm_shuffle_epi8(_mm_loadu_si128(in + 1),
                                             kGatherLane1);
      const __m128i lane2 = _mm_shuffle_epi8(_mm_loadu_si128(in + 2),
                                             kGatherLane2);
      const __m128i lane3 = _mm_shuffle_epi8(_mm_loadu_si128(in + 3),
                                             kGatherLane3);
      // The four partial results occupy disjoint lanes, so 'or' merges them.
      const __m128i packed = _mm_or_si128(_mm_or_si128(lane0, lane1),
                                          _mm_or_si128(lane2, lane3));
      _mm_storeu_si128((__m128i*)(dst_row + x), packed);
      simd_and = _mm_and_si128(simd_and, packed);
    }
    // Remaining pixels of the row (always at least one when width > 0).
    for (; x < width; ++x) {
      const uint8_t value = src_row[4 * x];
      dst_row[x] = value;
      scalar_and &= value;
    }
    src_row += argb_stride;
    dst_row += alpha_stride;
  }

  // Fold both accumulators into one 16-bit mask. The byte-wise compare plus
  // movemask yields one bit per vector lane, set only if that lane stayed
  // 0xff. The scalar accumulator only carries bits [0..7], so bits [8..15]
  // are forced to 1 before the two are and-ed together.
  scalar_and |= 0xff00u;
  scalar_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(simd_and, kAllOnes));
  return (scalar_and == 0xffffu);
}
78*b2055c35SXin Li 
79*b2055c35SXin Li //------------------------------------------------------------------------------
80*b2055c35SXin Li // Entry point
81*b2055c35SXin Li 
82*b2055c35SXin Li extern void WebPInitAlphaProcessingSSE41(void);
83*b2055c35SXin Li 
WebPInitAlphaProcessingSSE41(void)84*b2055c35SXin Li WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
85*b2055c35SXin Li   WebPExtractAlpha = ExtractAlpha_SSE41;
86*b2055c35SXin Li }
87*b2055c35SXin Li 
88*b2055c35SXin Li #else  // !WEBP_USE_SSE41
89*b2055c35SXin Li 
90*b2055c35SXin Li WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE41)
91*b2055c35SXin Li 
92*b2055c35SXin Li #endif  // WEBP_USE_SSE41
93