xref: /aosp_15_r20/frameworks/rs/toolkit/x86.cpp (revision e1eccf28f96817838ad6867f7f39d2351ec11f56)
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <stdint.h>
18 #include <x86intrin.h>
19 
20 namespace android {
21 namespace renderscript {
22 
23 /* Zero-extend the packed 8-bit integers (in the LSBs) into packed 32-bit integers */
24 static inline __m128i cvtepu8_epi32(__m128i x) {
25 #if defined(__SSE4_1__)
26     return _mm_cvtepu8_epi32(x);
27 #elif defined(__SSSE3__)
28     const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
29     x = _mm_shuffle_epi8(x, M8to32);
30     return x;
31 #else
32 #   error "Require at least SSSE3"
33 #endif
34 }
35 
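/* Pack signed 32-bit integers into unsigned 16-bit with saturation; emulates the SSE4.1
 * _mm_packus_epi32 instruction when only SSSE3 is available. */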
36 static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
37 #if defined(__SSE4_1__)
38     return _mm_packus_epi32(lo, hi);
39 #elif defined(__SSSE3__)
40     const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
41     const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
42     const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
43     const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
44     lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
45     lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
46     hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
47     hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
48     return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
49                         _mm_shuffle_epi8(hi, M32to16H));
50 #else
51 #   error "Require at least SSSE3"
52 #endif
53 }
54 
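/* Multiply packed 32-bit integers, keeping the low 32 bits of each product; emulates the
 * SSE4.1 _mm_mullo_epi32 instruction when only SSSE3 is available. */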
55 static inline __m128i mullo_epi32(__m128i x, __m128i y) {
56 #if defined(__SSE4_1__)
57     return _mm_mullo_epi32(x, y);
58 #elif defined(__SSSE3__)
59     const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
60     __m128i even = _mm_mul_epu32(x, y);
61     __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
62                                 _mm_srli_si128(y, 4));
63     even = _mm_and_si128(even, Meven);
64     odd = _mm_and_si128(odd, Meven);
65     return _mm_or_si128(even, _mm_slli_si128(odd, 4));
66 #else
67 #   error "Require at least SSSE3"
68 #endif
69 }
70 
71 /* 'mask' must be packed 8-bit lanes of 0x00 or 0xff: selects y where the byte is 0xff, x where it is 0x00 */
72 static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
73 #if defined(__SSE4_1__)
74     return _mm_blendv_epi8(x, y, mask);
75 #elif defined(__SSSE3__)
76     return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
77 #else
78 #   error "Require at least SSSE3"
79 #endif
80 }
81 
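/* 3x3 convolution over 8-bit RGBA pixels. y0/y1/y2 are the three source rows and coef holds
 * the nine 16-bit kernel coefficients; the accumulated sums are shifted right by 8 before
 * packing. Each iteration writes two output pixels (8 bytes). */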
82 extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
83                                           const void *y1, const void *y2,
84                                           const short *coef, uint32_t count) {
85     __m128i x;
86     __m128i c0, c2, c4, c6, c8;
87     __m128i r0, r1, r2;
88     __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
89     __m128i o0, o1;
90     uint32_t i;
91 
92     x = _mm_loadl_epi64((const __m128i *)(coef+0));
93     c0 = _mm_shuffle_epi32(x, 0x00);
94     c2 = _mm_shuffle_epi32(x, 0x55);
95     x = _mm_loadl_epi64((const __m128i *)(coef+4));
96     c4 = _mm_shuffle_epi32(x, 0x00);
97     c6 = _mm_shuffle_epi32(x, 0x55);
98     x = _mm_loadl_epi64((const __m128i *)(coef+8));
99     c8 = _mm_shuffle_epi32(x, 0x00);
100 
101     for (i = 0; i < count; ++i) {
102 
103         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
104         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
105         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
106         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
107         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
108         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
109         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
110         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
111         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
112         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
113         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
114         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
115 
116         o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
117         o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
118 
119         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
120         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
121 
122         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
123         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
124 
125         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
126         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
127 
128         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
129         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
130 
131         o0 = _mm_srai_epi32(o0, 8);
132         o1 = _mm_srai_epi32(o1, 8);
133 
134         o0 = packus_epi32(o0, o1);
135         o0 = _mm_packus_epi16(o0, o0);
136         _mm_storel_epi64((__m128i *)dst, o0);
137 
138         y0 = (const char *)y0 + 8;
139         y1 = (const char *)y1 + 8;
140         y2 = (const char *)y2 + 8;
141         dst = (char *)dst + 8;
142     }
143 }
144 
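/* Multiplies each RGBA pixel by a 4x4 matrix of 16-bit fixed-point coefficients; the sums are
 * shifted right by 8 and saturated. Four pixels are processed per iteration. */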
145 void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
146                                   const short *coef, uint32_t count) {
147     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
148                                       14, 10, 6, 2,
149                                       13,  9, 5, 1,
150                                       12,  8, 4, 0);
151 
152     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
153     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
154     __m128i c0, c1, c2, c3;
155     __m128i i4, o4;
156     __m128i xy, zw;
157     __m128i x2, y2, z2, w2;
158     uint32_t i;
159 
160     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
161     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
162     c0 = _mm_unpacklo_epi16(c0, c1);
163 
164     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
165     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
166     c2 = _mm_unpacklo_epi16(c2, c3);
167 
168     for (i = 0; i < count; ++i) {
169         i4 = _mm_load_si128((const __m128i *)src);
170         xy = _mm_shuffle_epi8(i4, Mxy);
171         zw = _mm_shuffle_epi8(i4, Mzw);
172 
173         x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
174         y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
175         z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
176         w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
177 
178         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
179         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
180         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
181         w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
182 
183         x2 = _mm_srai_epi32(x2, 8);
184         y2 = _mm_srai_epi32(y2, 8);
185         z2 = _mm_srai_epi32(z2, 8);
186         w2 = _mm_srai_epi32(w2, 8);
187 
188         x2 = packus_epi32(x2, y2);
189         z2 = packus_epi32(z2, w2);
190         o4 = _mm_packus_epi16(x2, z2);
191 
192         o4 = _mm_shuffle_epi8(o4, T4x4);
193         _mm_storeu_si128((__m128i *)dst, o4);
194 
195         src = (const char *)src + 16;
196         dst = (char *)dst + 16;
197     }
198 }
199 
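/* Same as the 4x4 variant, except that only R/G/B are transformed; the output alpha channel is
 * copied from the source pixel. */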
200 void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
201                                   const short *coef, uint32_t count) {
202     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
203                                       14, 10, 6, 2,
204                                       13,  9, 5, 1,
205                                       12,  8, 4, 0);
206 
207     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
208     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
209 
210     __m128i c0, c1, c2, c3;
211     __m128i i4, o4;
212     __m128i xy, zw;
213     __m128i x2, y2, z2, w2;
214     uint32_t i;
215 
216     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
217     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
218     c0 = _mm_unpacklo_epi16(c0, c1);
219 
220     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
221     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
222     c2 = _mm_unpacklo_epi16(c2, c3);
223 
224     for (i = 0; i < count; ++i) {
225         i4 = _mm_loadu_si128((const __m128i *)src);
226         xy = _mm_shuffle_epi8(i4, Mxy);
227         zw = _mm_shuffle_epi8(i4, Mzw);
228 
229         x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
230         y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
231         z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
232 
233         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
234         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
235         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
236 
237         x2 = _mm_srai_epi32(x2, 8);
238         y2 = _mm_srai_epi32(y2, 8);
239         z2 = _mm_srai_epi32(z2, 8);
240         w2 = _mm_srli_epi32(zw, 16);
241 
242         x2 = packus_epi32(x2, y2);
243         z2 = packus_epi32(z2, w2);
244         o4 = _mm_packus_epi16(x2, z2);
245 
246         o4 = _mm_shuffle_epi8(o4, T4x4);
247         _mm_storeu_si128((__m128i *)dst, o4);
248 
249         src = (const char *)src + 16;
250         dst = (char *)dst + 16;
251     }
252 }
253 
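/* Dot-product variant: a single weighted sum of the input channels is written to the R, G and B
 * output channels, and the output alpha is copied from the source pixel. */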
254 void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
255                                   const short *coef, uint32_t count) {
256     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
257                                       14, 10, 6, 2,
258                                       13,  9, 5, 1,
259                                       12,  8, 4, 0);
260     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
261     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
262     __m128i c0, c1, c2, c3;
263     __m128i i4, o4;
264     __m128i xy, zw;
265     __m128i x2, y2, z2, w2;
266     uint32_t i;
267 
268     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
269     c0 = _mm_shufflelo_epi16(c0, 0);
270     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
271     c1 = _mm_shufflelo_epi16(c1, 0);
272     c0 = _mm_unpacklo_epi16(c0, c1);
273 
274     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
275     c2 = _mm_shufflelo_epi16(c2, 0);
276     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
277     c3 = _mm_shufflelo_epi16(c3, 0);
278     c2 = _mm_unpacklo_epi16(c2, c3);
279 
280     for (i = 0; i < count; ++i) {
281         i4 = _mm_loadu_si128((const __m128i *)src);
282 
283         xy = _mm_shuffle_epi8(i4, Mxy);
284         zw = _mm_shuffle_epi8(i4, Mzw);
285 
286         x2 =  _mm_madd_epi16(xy, c0);
287         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
288 
289         x2 = _mm_srai_epi32(x2, 8);
290         y2 = x2;
291         z2 = x2;
292         w2 = _mm_srli_epi32(zw, 16);
293 
294         x2 = packus_epi32(x2, y2);
295         z2 = packus_epi32(z2, w2);
296         o4 = _mm_packus_epi16(x2, z2);
297 
298         o4 = _mm_shuffle_epi8(o4, T4x4);
299         _mm_storeu_si128((__m128i *)dst, o4);
300 
301         src = (const char *)src + 16;
302         dst = (char *)dst + 16;
303     }
304 }
305 
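/* Vertical blur pass: for each pair of pixels, accumulates rct rows of RGBA input (stride bytes
 * apart) weighted by the float coefficients in gptr, and writes the unclamped float sums
 * (4 floats per pixel) to dst. */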
306 void rsdIntrinsicBlurVFU4_K(void *dst,
307                           const void *pin, int stride, const void *gptr,
308                           int rct, int x1, int x2) {
309     const char *pi;
310     __m128i pi0, pi1;
311     __m128 pf0, pf1;
312     __m128 bp0, bp1;
313     __m128 x;
314     int r;
315 
316     for (; x1 < x2; x1 += 2) {
317         pi = (const char *)pin + (x1 << 2);
318         bp0 = _mm_setzero_ps();
319         bp1 = _mm_setzero_ps();
320 
321         for (r = 0; r < rct; ++r) {
322             x = _mm_load_ss((const float *)gptr + r);
323             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
324 
325             pi0 = _mm_cvtsi32_si128(*(const int *)pi);
326             pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
327 
328             pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
329             pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
330 
331             bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
332             bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
333 
334             pi += stride;
335         }
336 
337         _mm_storeu_ps((float *)dst, bp0);
338         _mm_storeu_ps((float *)dst + 4, bp1);
339         dst = (char *)dst + 32;
340     }
341 }
342 
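/* Horizontal blur pass over the float intermediate produced by the vertical pass: one 4-float
 * RGBA pixel in, one 8-bit RGBA pixel out per iteration; taps after the first are processed in
 * pairs. */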
343 void rsdIntrinsicBlurHFU4_K(void *dst,
344                           const void *pin, const void *gptr,
345                           int rct, int x1, int x2) {
346     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
347     const float *pi;
348     __m128 pf, x, y;
349     __m128i o;
350     int r;
351 
352     for (; x1 < x2; ++x1) {
353         /* rct is defined as 2*r+1 by the caller */
354         x = _mm_load_ss((const float *)gptr);
355         x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
356 
357         pi = (const float *)pin + (x1 << 2);
358         pf = _mm_mul_ps(x, _mm_load_ps(pi));
359 
360         for (r = 1; r < rct; r += 2) {
361             x = _mm_load_ss((const float *)gptr + r);
362             y = _mm_load_ss((const float *)gptr + r + 1);
363             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
364             y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
365 
366             pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
367             pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
368         }
369 
370         o = _mm_cvtps_epi32(pf);
371         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
372         dst = (char *)dst + 4;
373     }
374 }
375 
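/* Single-channel horizontal blur over a float intermediate: four 8-bit output pixels per
 * iteration, sliding the 4-float window across the taps with alignr. */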
376 void rsdIntrinsicBlurHFU1_K(void *dst,
377                           const void *pin, const void *gptr,
378                           int rct, int x1, int x2) {
379     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
380     const float *pi;
381     __m128 pf, g0, g1, g2, g3, gx, p0, p1;
382     __m128i o;
383     int r;
384 
385     for (; x1 < x2; x1+=4) {
386         g0 = _mm_load_ss((const float *)gptr);
387         g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
388 
389         pi = (const float *)pin + x1;
390         pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
391 
392         for (r = 1; r < rct; r += 4) {
393             gx = _mm_loadu_ps((const float *)gptr + r);
394             p0 = _mm_loadu_ps(pi + r);
395             p1 = _mm_loadu_ps(pi + r + 4);
396 
397             g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
398             pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
399             g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
400             pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
401             g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
402             pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
403             g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
404             pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
405         }
406 
407         o = _mm_cvtps_epi32(pf);
408         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
409         dst = (char *)dst + 4;
410     }
411 }
412 
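/* YUV to RGBA conversion with an interleaved chroma plane, four pixels per iteration; param
 * carries the integer conversion coefficients and the Y/UV biases noted below. The loop runs
 * 2*count iterations. */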
413 void rsdIntrinsicYuv_K(void *dst,
414                        const unsigned char *pY, const unsigned char *pUV,
415                        uint32_t count, const short *param) {
416     __m128i biasY, biasUV;
417     __m128i c0, c1, c2, c3, c4;
418 
419     biasY = _mm_set1_epi32(param[8]);   /*  16 */
420     biasUV = _mm_set1_epi32(param[16]); /* 128 */
421 
422     c0 = _mm_set1_epi32(param[0]);  /*  298 */
423     c1 = _mm_set1_epi32(param[1]);  /*  409 */
424     c2 = _mm_set1_epi32(param[2]);  /* -100 */
425     c3 = _mm_set1_epi32(param[3]);  /*  516 */
426     c4 = _mm_set1_epi32(param[4]);  /* -208 */
427 
428     __m128i Y, UV, U, V, R, G, B, A;
429 
430     A = _mm_set1_epi32(255);
431     uint32_t i;
432 
433     for (i = 0; i < (count << 1); ++i) {
434         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
435         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
436 
437         Y = _mm_sub_epi32(Y, biasY);
438         UV = _mm_sub_epi32(UV, biasUV);
439 
440         U = _mm_shuffle_epi32(UV, 0xf5);
441         V = _mm_shuffle_epi32(UV, 0xa0);
442 
443         Y = mullo_epi32(Y, c0);
444 
445         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
446         R = _mm_add_epi32(R, biasUV);
447         R = _mm_srai_epi32(R, 8);
448 
449         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
450         G = _mm_add_epi32(G, mullo_epi32(V, c4));
451         G = _mm_add_epi32(G, biasUV);
452         G = _mm_srai_epi32(G, 8);
453 
454         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
455         B = _mm_add_epi32(B, biasUV);
456         B = _mm_srai_epi32(B, 8);
457 
458         __m128i y1, y2, y3, y4;
459 
460         y1 = packus_epi32(R, G);
461         y2 = packus_epi32(B, A);
462         y3 = _mm_packus_epi16(y1, y2);
463         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
464                                           14, 10, 6, 2,
465                                           13,  9, 5, 1,
466                                           12,  8, 4, 0);
467         y4 = _mm_shuffle_epi8(y3, T4x4);
468         _mm_storeu_si128((__m128i *)dst, y4);
469         pY += 4;
470         pUV += 4;
471         dst = (__m128i *)dst + 1;
472     }
473 }
474 
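/* Identical to rsdIntrinsicYuv_K except that U and V are taken from the opposite byte positions
 * of the interleaved chroma plane. */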
475 void rsdIntrinsicYuvR_K(void *dst,
476                        const unsigned char *pY, const unsigned char *pUV,
477                        uint32_t count, const short *param) {
478     __m128i biasY, biasUV;
479     __m128i c0, c1, c2, c3, c4;
480 
481     biasY = _mm_set1_epi32(param[8]);   /*  16 */
482     biasUV = _mm_set1_epi32(param[16]); /* 128 */
483 
484     c0 = _mm_set1_epi32(param[0]);  /*  298 */
485     c1 = _mm_set1_epi32(param[1]);  /*  409 */
486     c2 = _mm_set1_epi32(param[2]);  /* -100 */
487     c3 = _mm_set1_epi32(param[3]);  /*  516 */
488     c4 = _mm_set1_epi32(param[4]);  /* -208 */
489 
490     __m128i Y, UV, U, V, R, G, B, A;
491 
492     A = _mm_set1_epi32(255);
493     uint32_t i;
494 
495     for (i = 0; i < (count << 1); ++i) {
496         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
497         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
498 
499         Y = _mm_sub_epi32(Y, biasY);
500         UV = _mm_sub_epi32(UV, biasUV);
501 
502         V = _mm_shuffle_epi32(UV, 0xf5);
503         U = _mm_shuffle_epi32(UV, 0xa0);
504 
505         Y = mullo_epi32(Y, c0);
506 
507         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
508         R = _mm_add_epi32(R, biasUV);
509         R = _mm_srai_epi32(R, 8);
510 
511         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
512         G = _mm_add_epi32(G, mullo_epi32(V, c4));
513         G = _mm_add_epi32(G, biasUV);
514         G = _mm_srai_epi32(G, 8);
515 
516         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
517         B = _mm_add_epi32(B, biasUV);
518         B = _mm_srai_epi32(B, 8);
519 
520         __m128i y1, y2, y3, y4;
521 
522         y1 = packus_epi32(R, G);
523         y2 = packus_epi32(B, A);
524         y3 = _mm_packus_epi16(y1, y2);
525         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
526                                           14, 10, 6, 2,
527                                           13,  9, 5, 1,
528                                           12,  8, 4, 0);
529         y4 = _mm_shuffle_epi8(y3, T4x4);
530         _mm_storeu_si128((__m128i *)dst, y4);
531         pY += 4;
532         pUV += 4;
533         dst = (__m128i *)dst + 1;
534     }
535 }
536 
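/* Planar variant of the YUV to RGBA conversion: Y, U and V are read from three separate planes. */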
537 void rsdIntrinsicYuv2_K(void *dst,
538                        const unsigned char *pY, const unsigned char *pU,
539                        const unsigned char *pV, uint32_t count, const short *param) {
540     __m128i biasY, biasUV;
541     __m128i c0, c1, c2, c3, c4;
542 
543     biasY = _mm_set1_epi32(param[8]);   /*  16 */
544     biasUV = _mm_set1_epi32(param[16]); /* 128 */
545 
546     c0 = _mm_set1_epi32(param[0]);  /*  298 */
547     c1 = _mm_set1_epi32(param[1]);  /*  409 */
548     c2 = _mm_set1_epi32(param[2]);  /* -100 */
549     c3 = _mm_set1_epi32(param[3]);  /*  516 */
550     c4 = _mm_set1_epi32(param[4]);  /* -208 */
551 
552     __m128i Y, U, V, R, G, B, A;
553 
554     A = _mm_set1_epi32(255);
555     uint32_t i;
556 
557     for (i = 0; i < (count << 1); ++i) {
558         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
559         U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
560         V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
561 
562         Y = _mm_sub_epi32(Y, biasY);
563         U = _mm_sub_epi32(U, biasUV);
564         V = _mm_sub_epi32(V, biasUV);
565 
566         Y = mullo_epi32(Y, c0);
567 
568         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
569         R = _mm_add_epi32(R, biasUV);
570         R = _mm_srai_epi32(R, 8);
571 
572         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
573         G = _mm_add_epi32(G, mullo_epi32(V, c4));
574         G = _mm_add_epi32(G, biasUV);
575         G = _mm_srai_epi32(G, 8);
576 
577         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
578         B = _mm_add_epi32(B, biasUV);
579         B = _mm_srai_epi32(B, 8);
580 
581         __m128i y1, y2, y3, y4;
582 
583         y1 = packus_epi32(R, G);
584         y2 = packus_epi32(B, A);
585         y3 = _mm_packus_epi16(y1, y2);
586         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
587                                           14, 10, 6, 2,
588                                           13,  9, 5, 1,
589                                           12,  8, 4, 0);
590         y4 = _mm_shuffle_epi8(y3, T4x4);
591         _mm_storeu_si128((__m128i *)dst, y4);
592         pY += 4;
593         pU += 4;
594         pV += 4;
595         dst = (__m128i *)dst + 1;
596     }
597 }
598 
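/* 5x5 convolution over 8-bit RGBA pixels. y0..y4 are the five source rows and coef holds the 25
 * 16-bit coefficients; the accumulated sums are shifted right by 8 before packing. Each iteration
 * writes four output pixels (16 bytes). */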
599 extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
600                                           const void *y1, const void *y2,
601                                           const void *y3, const void *y4,
602                                           const short *coef, uint32_t count) {
603     __m128i x;
604     __m128i c0, c2, c4, c6, c8, c10, c12;
605     __m128i c14, c16, c18, c20, c22, c24;
606     __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
607     __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
608     __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
609     __m128i p16, p17, p18, p19, p20, p21, p22, p23;
610     __m128i p24, p25, p26, p27, p28, p29, p30, p31;
611     __m128i p32, p33, p34, p35, p36, p37, p38, p39;
612     __m128i o0, o1, o2, o3;
613     uint32_t i;
614 
615     x = _mm_loadl_epi64((const __m128i *)(coef+0));
616     c0  = _mm_shuffle_epi32(x, 0x00);
617     c2  = _mm_shuffle_epi32(x, 0x55);
618 
619     x = _mm_loadl_epi64((const __m128i *)(coef+4));
620     c4  = _mm_shuffle_epi32(x, 0x00);
621     c6  = _mm_shuffle_epi32(x, 0x55);
622 
623     x = _mm_loadl_epi64((const __m128i *)(coef+8));
624     c8  = _mm_shuffle_epi32(x, 0x00);
625     c10  = _mm_shuffle_epi32(x, 0x55);
626 
627     x = _mm_loadl_epi64((const __m128i *)(coef+12));
628     c12  = _mm_shuffle_epi32(x, 0x00);
629     c14  = _mm_shuffle_epi32(x, 0x55);
630 
631     x = _mm_loadl_epi64((const __m128i *)(coef+16));
632     c16  = _mm_shuffle_epi32(x, 0x00);
633     c18  = _mm_shuffle_epi32(x, 0x55);
634 
635     x = _mm_loadl_epi64((const __m128i *)(coef+20));
636     c20  = _mm_shuffle_epi32(x, 0x00);
637     c22  = _mm_shuffle_epi32(x, 0x55);
638 
639     x = _mm_loadl_epi64((const __m128i *)(coef+24));
640     c24  = _mm_shuffle_epi32(x, 0x00);
641 
642     for (i = 0; i < count; ++i) {
643 
644         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
645         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
646         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
647         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
648         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
649         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
650         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
651         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
652 
653         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
654         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
655         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
656         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
657         p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
658         p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
659         p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
660         p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
661 
662         p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
663         p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
664         p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
665         p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
666         p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
667         p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
668         p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
669         p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
670 
671         p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
672         p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
673         p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
674         p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
675         p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
676         p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
677         p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
678         p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
679 
680         p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
681         p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
682         p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
683         p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
684         p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
685         p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
686         p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
687         p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
688 
689         o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);
690         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
691         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
692         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
693         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
694         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
695         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
696         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
697         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
698         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
699         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
700         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
701         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
702         o0 = _mm_srai_epi32(o0, 8);
703 
704         o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);
705         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
706         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
707         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
708         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
709         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
710         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
711         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
712         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
713         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
714         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
715         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
716         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
717         o1 = _mm_srai_epi32(o1, 8);
718 
719         o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);
720         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
721         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
722         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
723         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
724         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
725         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
726         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
727         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
728         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
729         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
730         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
731         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
732         o2 = _mm_srai_epi32(o2, 8);
733 
734         o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);
735         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
736         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
737         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
738         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
739         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
740         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
741         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
742         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
743         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
744         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
745         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
746         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
747         o3 = _mm_srai_epi32(o3, 8);
748 
749         o0 = packus_epi32(o0, o1);
750         o2 = packus_epi32(o2, o3);
751         o0 = _mm_packus_epi16(o0, o2);
752         _mm_storeu_si128((__m128i *)dst, o0);
753 
754         y0 = (const char *)y0 + 16;
755         y1 = (const char *)y1 + 16;
756         y2 = (const char *)y2 + 16;
757         y3 = (const char *)y3 + 16;
758         y4 = (const char *)y4 + 16;
759         dst = (char *)dst + 16;
760     }
761 }
762 
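/* Porter-Duff "source over": out = src + ((dst * (255 - src.alpha)) >> 8). All of the blend
 * kernels below process eight RGBA pixels (two 16-byte vectors) per iteration, so count8 counts
 * 8-pixel groups. */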
763 void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
764     __m128i all1s, ina, ins;
765     __m128i in0, in1, out0, out1;
766     __m128i t0, t1, t2, t3;
767     uint32_t i;
768 
769     all1s = _mm_set1_epi16(255);
770 
771     for (i = 0; i < count8; ++i) {
772         in0 = _mm_loadu_si128((const __m128i *)src);
773         in1 = _mm_loadu_si128((const __m128i *)src + 1);
774         out0 = _mm_loadu_si128((const __m128i *)dst);
775         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
776 
777         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
778         ina = _mm_shufflelo_epi16(ins, 0xFF);
779         ina = _mm_shufflehi_epi16(ina, 0xFF);
780         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
781         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
782         t0 = _mm_srli_epi16(t0, 8);
783         t0 = _mm_add_epi16(t0, ins);
784 
785         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
786         ina = _mm_shufflelo_epi16(ins, 0xFF);
787         ina = _mm_shufflehi_epi16(ina, 0xFF);
788         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
789         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
790         t1 = _mm_srli_epi16(t1, 8);
791         t1 = _mm_add_epi16(t1, ins);
792 
793         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
794         ina = _mm_shufflelo_epi16(ins, 0xFF);
795         ina = _mm_shufflehi_epi16(ina, 0xFF);
796         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
797         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
798         t2 = _mm_srli_epi16(t2, 8);
799         t2 = _mm_add_epi16(t2, ins);
800 
801         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
802         ina = _mm_shufflelo_epi16(ins, 0xFF);
803         ina = _mm_shufflehi_epi16(ina, 0xFF);
804         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
805         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
806         t3 = _mm_srli_epi16(t3, 8);
807         t3 = _mm_add_epi16(t3, ins);
808 
809         t0 = _mm_packus_epi16(t0, t1);
810         t2 = _mm_packus_epi16(t2, t3);
811         _mm_storeu_si128((__m128i *)dst, t0);
812         _mm_storeu_si128((__m128i *)dst + 1, t2);
813 
814         src = (const __m128i *)src + 2;
815         dst = (__m128i *)dst + 2;
816     }
817 }
818 
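/* Porter-Duff "destination over": out = dst + ((src * (255 - dst.alpha)) >> 8). */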
819 void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
820     __m128i all1s, outa, outs;
821     __m128i in0, in1, out0, out1;
822     __m128i t0, t1, t2, t3;
823     uint32_t i;
824 
825     all1s = _mm_set1_epi16(255);
826 
827     for (i = 0; i < count8; ++i) {
828         in0 = _mm_loadu_si128((const __m128i *)src);
829         in1 = _mm_loadu_si128((const __m128i *)src + 1);
830         out0 = _mm_loadu_si128((const __m128i *)dst);
831         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
832 
833 
834         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
835         outa = _mm_shufflelo_epi16(outs, 0xFF);
836         outa = _mm_shufflehi_epi16(outa, 0xFF);
837         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
838         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
839         t0 = _mm_srli_epi16(t0, 8);
840         t0 = _mm_add_epi16(t0, outs);
841 
842         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
843         outa = _mm_shufflelo_epi16(outs, 0xFF);
844         outa = _mm_shufflehi_epi16(outa, 0xFF);
845         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
846         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
847         t1 = _mm_srli_epi16(t1, 8);
848         t1 = _mm_add_epi16(t1, outs);
849 
850         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
851         outa = _mm_shufflelo_epi16(outs, 0xFF);
852         outa = _mm_shufflehi_epi16(outa, 0xFF);
853         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
854         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
855         t2 = _mm_srli_epi16(t2, 8);
856         t2 = _mm_add_epi16(t2, outs);
857 
858         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
859         outa = _mm_shufflelo_epi16(outs, 0xFF);
860         outa = _mm_shufflehi_epi16(outa, 0xFF);
861         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
862         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
863         t3 = _mm_srli_epi16(t3, 8);
864         t3 = _mm_add_epi16(t3, outs);
865 
866         t0 = _mm_packus_epi16(t0, t1);
867         t2 = _mm_packus_epi16(t2, t3);
868         _mm_storeu_si128((__m128i *)dst, t0);
869         _mm_storeu_si128((__m128i *)dst + 1, t2);
870 
871         src = (const __m128i *)src + 2;
872         dst = (__m128i *)dst + 2;
873     }
874 }
875 
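/* Porter-Duff "source in": out = (src * dst.alpha) >> 8. */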
876 void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
877     __m128i outa;
878     __m128i in0, in1, out0, out1;
879     __m128i t0, t1, t2, t3;
880     uint32_t i;
881 
882     for (i = 0; i < count8; ++i) {
883         in0 = _mm_loadu_si128((const __m128i *)src);
884         in1 = _mm_loadu_si128((const __m128i *)src + 1);
885         out0 = _mm_loadu_si128((const __m128i *)dst);
886         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
887 
888         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
889         outa = _mm_shufflelo_epi16(outa, 0xFF);
890         outa = _mm_shufflehi_epi16(outa, 0xFF);
891         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
892         t0 = _mm_mullo_epi16(t0, outa);
893         t0 = _mm_srli_epi16(t0, 8);
894 
895         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
896         outa = _mm_shufflelo_epi16(outa, 0xFF);
897         outa = _mm_shufflehi_epi16(outa, 0xFF);
898         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
899         t1 = _mm_mullo_epi16(t1, outa);
900         t1 = _mm_srli_epi16(t1, 8);
901 
902         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
903         outa = _mm_shufflelo_epi16(outa, 0xFF);
904         outa = _mm_shufflehi_epi16(outa, 0xFF);
905         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
906         t2 = _mm_mullo_epi16(t2, outa);
907         t2 = _mm_srli_epi16(t2, 8);
908 
909         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
910         outa = _mm_shufflelo_epi16(outa, 0xFF);
911         outa = _mm_shufflehi_epi16(outa, 0xFF);
912         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
913         t3 = _mm_mullo_epi16(t3, outa);
914         t3 = _mm_srli_epi16(t3, 8);
915 
916         t0 = _mm_packus_epi16(t0, t1);
917         t2 = _mm_packus_epi16(t2, t3);
918         _mm_storeu_si128((__m128i *)dst, t0);
919         _mm_storeu_si128((__m128i *)dst + 1, t2);
920 
921         src = (const __m128i *)src + 2;
922         dst = (__m128i *)dst + 2;
923     }
924 }
925 
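/* Porter-Duff "destination in": out = (dst * src.alpha) >> 8. */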
926 void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
927     __m128i ina;
928     __m128i in0, in1, out0, out1;
929     __m128i t0, t1, t2, t3;
930     uint32_t i;
931 
932     for (i = 0; i < count8; ++i) {
933         in0 = _mm_loadu_si128((const __m128i *)src);
934         in1 = _mm_loadu_si128((const __m128i *)src + 1);
935         out0 = _mm_loadu_si128((const __m128i *)dst);
936         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
937 
938         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
939         ina = _mm_shufflelo_epi16(ina, 0xFF);
940         ina = _mm_shufflehi_epi16(ina, 0xFF);
941         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
942         t0 = _mm_mullo_epi16(t0, ina);
943         t0 = _mm_srli_epi16(t0, 8);
944 
945         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
946         ina = _mm_shufflelo_epi16(ina, 0xFF);
947         ina = _mm_shufflehi_epi16(ina, 0xFF);
948         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
949         t1 = _mm_mullo_epi16(t1, ina);
950         t1 = _mm_srli_epi16(t1, 8);
951 
952         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
953         ina = _mm_shufflelo_epi16(ina, 0xFF);
954         ina = _mm_shufflehi_epi16(ina, 0xFF);
955         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
956         t2 = _mm_mullo_epi16(t2, ina);
957         t2 = _mm_srli_epi16(t2, 8);
958 
959         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
960         ina = _mm_shufflelo_epi16(ina, 0xFF);
961         ina = _mm_shufflehi_epi16(ina, 0xFF);
962         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
963         t3 = _mm_mullo_epi16(t3, ina);
964         t3 = _mm_srli_epi16(t3, 8);
965 
966         t0 = _mm_packus_epi16(t0, t1);
967         t2 = _mm_packus_epi16(t2, t3);
968         _mm_storeu_si128((__m128i *)dst, t0);
969         _mm_storeu_si128((__m128i *)dst + 1, t2);
970 
971         src = (const __m128i *)src + 2;
972         dst = (__m128i *)dst + 2;
973     }
974 }
975 
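/* Porter-Duff "source out": out = (src * (255 - dst.alpha)) >> 8. */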
976 void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
977     __m128i all1s, outa;
978     __m128i in0, in1, out0, out1;
979     __m128i t0, t1, t2, t3;
980     uint32_t i;
981 
982     all1s = _mm_set1_epi16(255);
983 
984     for (i = 0; i < count8; ++i) {
985         in0 = _mm_loadu_si128((const __m128i *)src);
986         in1 = _mm_loadu_si128((const __m128i *)src + 1);
987         out0 = _mm_loadu_si128((const __m128i *)dst);
988         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
989 
990         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
991         outa = _mm_shufflelo_epi16(outa, 0xFF);
992         outa = _mm_shufflehi_epi16(outa, 0xFF);
993         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
994         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
995         t0 = _mm_srli_epi16(t0, 8);
996 
997         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
998         outa = _mm_shufflelo_epi16(outa, 0xFF);
999         outa = _mm_shufflehi_epi16(outa, 0xFF);
1000         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1001         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
1002         t1 = _mm_srli_epi16(t1, 8);
1003 
1004         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1005         outa = _mm_shufflelo_epi16(outa, 0xFF);
1006         outa = _mm_shufflehi_epi16(outa, 0xFF);
1007         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1008         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
1009         t2 = _mm_srli_epi16(t2, 8);
1010 
1011         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1012         outa = _mm_shufflelo_epi16(outa, 0xFF);
1013         outa = _mm_shufflehi_epi16(outa, 0xFF);
1014         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1015         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
1016         t3 = _mm_srli_epi16(t3, 8);
1017 
1018         t0 = _mm_packus_epi16(t0, t1);
1019         t2 = _mm_packus_epi16(t2, t3);
1020         _mm_storeu_si128((__m128i *)dst, t0);
1021         _mm_storeu_si128((__m128i *)dst + 1, t2);
1022 
1023         src = (const __m128i *)src + 2;
1024         dst = (__m128i *)dst + 2;
1025     }
1026 }
1027 
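/* Porter-Duff "destination out": out = (dst * (255 - src.alpha)) >> 8. */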
1028 void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
1029     __m128i all1s, ina;
1030     __m128i in0, in1, out0, out1;
1031     __m128i t0, t1, t2, t3;
1032     uint32_t i;
1033 
1034     all1s = _mm_set1_epi16(255);
1035 
1036     for (i = 0; i < count8; ++i) {
1037         in0 = _mm_loadu_si128((const __m128i *)src);
1038         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1039         out0 = _mm_loadu_si128((const __m128i *)dst);
1040         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1041 
1042         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1043         ina = _mm_shufflelo_epi16(ina, 0xFF);
1044         ina = _mm_shufflehi_epi16(ina, 0xFF);
1045         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1046         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
1047         t0 = _mm_srli_epi16(t0, 8);
1048 
1049         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1050         ina = _mm_shufflelo_epi16(ina, 0xFF);
1051         ina = _mm_shufflehi_epi16(ina, 0xFF);
1052         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1053         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
1054         t1 = _mm_srli_epi16(t1, 8);
1055 
1056         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1057         ina = _mm_shufflelo_epi16(ina, 0xFF);
1058         ina = _mm_shufflehi_epi16(ina, 0xFF);
1059         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1060         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
1061         t2 = _mm_srli_epi16(t2, 8);
1062 
1063         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1064         ina = _mm_shufflelo_epi16(ina, 0xFF);
1065         ina = _mm_shufflehi_epi16(ina, 0xFF);
1066         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1067         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
1068         t3 = _mm_srli_epi16(t3, 8);
1069 
1070         t0 = _mm_packus_epi16(t0, t1);
1071         t2 = _mm_packus_epi16(t2, t3);
1072         _mm_storeu_si128((__m128i *)dst, t0);
1073         _mm_storeu_si128((__m128i *)dst + 1, t2);
1074 
1075         src = (const __m128i *)src + 2;
1076         dst = (__m128i *)dst + 2;
1077     }
1078 }
1079 
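/* Porter-Duff "source atop": out = (src * dst.alpha + dst * (255 - src.alpha)) >> 8, with the
 * destination alpha byte preserved via the M0001 blend mask. */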
1080 void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
1081     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1082     __m128i all1s, ina, outa, ins, outs;
1083     __m128i in0, in1, out0, out1;
1084     __m128i t0, t1, t2, t3;
1085     uint32_t i;
1086 
1087     all1s = _mm_set1_epi16(255);
1088 
1089     for (i = 0; i < count8; ++i) {
1090         in0 = _mm_loadu_si128((const __m128i *)src);
1091         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1092         out0 = _mm_loadu_si128((const __m128i *)dst);
1093         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1094 
1095         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1096         ina = _mm_shufflelo_epi16(ins, 0xFF);
1097         ina = _mm_shufflehi_epi16(ina, 0xFF);
1098         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1099         outa = _mm_shufflelo_epi16(outs, 0xFF);
1100         outa = _mm_shufflehi_epi16(outa, 0xFF);
1101         t0 = _mm_sub_epi16(all1s, ina);
1102         t0 = _mm_mullo_epi16(t0, outs);
1103         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
1104         t0 = _mm_srli_epi16(t0, 8);
1105 
1106         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1107         ina = _mm_shufflelo_epi16(ins, 0xFF);
1108         ina = _mm_shufflehi_epi16(ina, 0xFF);
1109         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1110         outa = _mm_shufflelo_epi16(outs, 0xFF);
1111         outa = _mm_shufflehi_epi16(outa, 0xFF);
1112         t1 = _mm_sub_epi16(all1s, ina);
1113         t1 = _mm_mullo_epi16(t1, outs);
1114         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
1115         t1 = _mm_srli_epi16(t1, 8);
1116 
1117         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1118         ina = _mm_shufflelo_epi16(ins, 0xFF);
1119         ina = _mm_shufflehi_epi16(ina, 0xFF);
1120         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1121         outa = _mm_shufflelo_epi16(outs, 0xFF);
1122         outa = _mm_shufflehi_epi16(outa, 0xFF);
1123         t2 = _mm_sub_epi16(all1s, ina);
1124         t2 = _mm_mullo_epi16(t2, outs);
1125         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
1126         t2 = _mm_srli_epi16(t2, 8);
1127 
1128         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1129         ina = _mm_shufflelo_epi16(ins, 0xFF);
1130         ina = _mm_shufflehi_epi16(ina, 0xFF);
1131         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1132         outa = _mm_shufflelo_epi16(outs, 0xFF);
1133         outa = _mm_shufflehi_epi16(outa, 0xFF);
1134         t3 = _mm_sub_epi16(all1s, ina);
1135         t3 = _mm_mullo_epi16(t3, outs);
1136         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
1137         t3 = _mm_srli_epi16(t3, 8);
1138 
1139         t0 = _mm_packus_epi16(t0, t1);
1140         t0 = blendv_epi8(t0, out0, M0001);
1141         t2 = _mm_packus_epi16(t2, t3);
1142         t2 = blendv_epi8(t2, out1, M0001);
1143         _mm_storeu_si128((__m128i *)dst, t0);
1144         _mm_storeu_si128((__m128i *)dst + 1, t2);
1145 
1146         src = (const __m128i *)src + 2;
1147         dst = (__m128i *)dst + 2;
1148     }
1149 }
1150 
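/* Porter-Duff "destination atop": out = (dst * src.alpha + src * (255 - dst.alpha)) >> 8, with
 * the source alpha byte preserved via the M0001 blend mask. */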
1151 void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
1152     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1153     __m128i all1s, ina, ins, outa, outs;
1154     __m128i in0, in1, out0, out1;
1155     __m128i t0, t1, t2, t3;
1156     uint32_t i;
1157 
1158     all1s = _mm_set1_epi16(255);
1159 
1160     for (i = 0; i < count8; ++i) {
1161         in0 = _mm_loadu_si128((const __m128i *)src);
1162         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1163         out0 = _mm_loadu_si128((const __m128i *)dst);
1164         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1165 
1166         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1167         ina = _mm_shufflelo_epi16(ins, 0xFF);
1168         ina = _mm_shufflehi_epi16(ina, 0xFF);
1169         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1170         outa = _mm_shufflelo_epi16(outs, 0xFF);
1171         outa = _mm_shufflehi_epi16(outa, 0xFF);
1172         t0 = _mm_sub_epi16(all1s, outa);
1173         t0 = _mm_mullo_epi16(t0, ins);
1174         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
1175         t0 = _mm_srli_epi16(t0, 8);
1176 
1177         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1178         ina = _mm_shufflelo_epi16(ins, 0xFF);
1179         ina = _mm_shufflehi_epi16(ina, 0xFF);
1180         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1181         outa = _mm_shufflelo_epi16(outs, 0xFF);
1182         outa = _mm_shufflehi_epi16(outa, 0xFF);
1183         t1 = _mm_sub_epi16(all1s, outa);
1184         t1 = _mm_mullo_epi16(t1, ins);
1185         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
1186         t1 = _mm_srli_epi16(t1, 8);
1187 
1188         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1189         ina = _mm_shufflelo_epi16(ins, 0xFF);
1190         ina = _mm_shufflehi_epi16(ina, 0xFF);
1191         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1192         outa = _mm_shufflelo_epi16(outs, 0xFF);
1193         outa = _mm_shufflehi_epi16(outa, 0xFF);
1194         t2 = _mm_sub_epi16(all1s, outa);
1195         t2 = _mm_mullo_epi16(t2, ins);
1196         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
1197         t2 = _mm_srli_epi16(t2, 8);
1198 
1199         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1200         ina = _mm_shufflelo_epi16(ins, 0xFF);
1201         ina = _mm_shufflehi_epi16(ina, 0xFF);
1202         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1203         outa = _mm_shufflelo_epi16(outs, 0xFF);
1204         outa = _mm_shufflehi_epi16(outa, 0xFF);
1205         t3 = _mm_sub_epi16(all1s, outa);
1206         t3 = _mm_mullo_epi16(t3, ins);
1207         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
1208         t3 = _mm_srli_epi16(t3, 8);
1209 
1210         t0 = _mm_packus_epi16(t0, t1);
1211         t0 = blendv_epi8(t0, in0, M0001);
1212         t2 = _mm_packus_epi16(t2, t3);
1213         t2 = blendv_epi8(t2, in1, M0001);
1214         _mm_storeu_si128((__m128i *)dst, t0);
1215         _mm_storeu_si128((__m128i *)dst + 1, t2);
1216 
1217         src = (const __m128i *)src + 2;
1218         dst = (__m128i *)dst + 2;
1219     }
1220 }
1221 
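/* Bitwise XOR of the source and destination pixel values (not the Porter-Duff XOR operator). */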
1222 void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
1223     __m128i in0, in1, out0, out1;
1224     uint32_t i;
1225 
1226     for (i = 0; i < count8; ++i) {
1227         in0 = _mm_loadu_si128((const __m128i *)src);
1228         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1229         out0 = _mm_loadu_si128((const __m128i *)dst);
1230         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1231 
1232         out0 = _mm_xor_si128(out0, in0);
1233         out1 = _mm_xor_si128(out1, in1);
1234 
1235         _mm_storeu_si128((__m128i *)dst, out0);
1236         _mm_storeu_si128((__m128i *)dst + 1, out1);
1237 
1238         src = (const __m128i *)src + 2;
1239         dst = (__m128i *)dst + 2;
1240     }
1241 }
1242 
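/* Per-channel multiply: out = (src * dst) >> 8. */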
1243 void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
1244     __m128i in0, in1, out0, out1;
1245     __m128i t0, t1, t2, t3;
1246     uint32_t i;
1247 
1248     for (i = 0; i < count8; ++i) {
1249         in0 = _mm_loadu_si128((const __m128i *)src);
1250         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1251         out0 = _mm_loadu_si128((const __m128i *)dst);
1252         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1253 
1254         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1255         t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
1256         t0 = _mm_srli_epi16(t0, 8);
1257 
1258         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1259         t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
1260         t1 = _mm_srli_epi16(t1, 8);
1261 
1262         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1263         t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
1264         t2 = _mm_srli_epi16(t2, 8);
1265 
1266         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1267         t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
1268         t3 = _mm_srli_epi16(t3, 8);
1269 
1270         t0 = _mm_packus_epi16(t0, t1);
1271         t2 = _mm_packus_epi16(t2, t3);
1272         _mm_storeu_si128((__m128i *)dst, t0);
1273         _mm_storeu_si128((__m128i *)dst + 1, t2);
1274 
1275         src = (const __m128i *)src + 2;
1276         dst = (__m128i *)dst + 2;
1277     }
1278 }
1279 
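/* Saturating per-channel add: out = min(dst + src, 255). */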
1280 void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
1281     __m128i in0, in1, out0, out1;
1282     uint32_t i;
1283 
1284     for (i = 0; i < count8; ++i) {
1285         in0 = _mm_loadu_si128((const __m128i *)src);
1286         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1287         out0 = _mm_loadu_si128((const __m128i *)dst);
1288         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1289 
1290         out0 = _mm_adds_epu8(out0, in0);
1291         out1 = _mm_adds_epu8(out1, in1);
1292 
1293         _mm_storeu_si128((__m128i *)dst, out0);
1294         _mm_storeu_si128((__m128i *)dst + 1, out1);
1295 
1296         src = (const __m128i *)src + 2;
1297         dst = (__m128i *)dst + 2;
1298     }
1299 }
1300 
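/* Saturating per-channel subtract: out = max(dst - src, 0). */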
1301 void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
1302     __m128i in0, in1, out0, out1;
1303     uint32_t i;
1304 
1305     for (i = 0; i < count8; ++i) {
1306         in0 = _mm_loadu_si128((const __m128i *)src);
1307         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1308         out0 = _mm_loadu_si128((const __m128i *)dst);
1309         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1310 
1311         out0 = _mm_subs_epu8(out0, in0);
1312         out1 = _mm_subs_epu8(out1, in1);
1313 
1314         _mm_storeu_si128((__m128i *)dst, out0);
1315         _mm_storeu_si128((__m128i *)dst + 1, out1);
1316 
1317         src = (const __m128i *)src + 2;
1318         dst = (__m128i *)dst + 2;
1319     }
1320 }
1321 
1322 }  // namespace renderscript
1323 }  // namespace android
1324