/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*e1eccf28SAndroid Build Coastguard Worker
17*e1eccf28SAndroid Build Coastguard Worker #include <stdint.h>
18*e1eccf28SAndroid Build Coastguard Worker #include <x86intrin.h>
19*e1eccf28SAndroid Build Coastguard Worker
20*e1eccf28SAndroid Build Coastguard Worker namespace android {
21*e1eccf28SAndroid Build Coastguard Worker namespace renderscript {
22*e1eccf28SAndroid Build Coastguard Worker
23*e1eccf28SAndroid Build Coastguard Worker /* Unsigned extend packed 8-bit integer (in LBS) into packed 32-bit integer */
cvtepu8_epi32(__m128i x)24*e1eccf28SAndroid Build Coastguard Worker static inline __m128i cvtepu8_epi32(__m128i x) {
25*e1eccf28SAndroid Build Coastguard Worker #if defined(__SSE4_1__)
26*e1eccf28SAndroid Build Coastguard Worker return _mm_cvtepu8_epi32(x);
27*e1eccf28SAndroid Build Coastguard Worker #elif defined(__SSSE3__)
28*e1eccf28SAndroid Build Coastguard Worker const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
29*e1eccf28SAndroid Build Coastguard Worker x = _mm_shuffle_epi8(x, M8to32);
30*e1eccf28SAndroid Build Coastguard Worker return x;
31*e1eccf28SAndroid Build Coastguard Worker #else
32*e1eccf28SAndroid Build Coastguard Worker # error "Require at least SSSE3"
33*e1eccf28SAndroid Build Coastguard Worker #endif
34*e1eccf28SAndroid Build Coastguard Worker }
35*e1eccf28SAndroid Build Coastguard Worker
packus_epi32(__m128i lo,__m128i hi)36*e1eccf28SAndroid Build Coastguard Worker static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
37*e1eccf28SAndroid Build Coastguard Worker #if defined(__SSE4_1__)
38*e1eccf28SAndroid Build Coastguard Worker return _mm_packus_epi32(lo, hi);
39*e1eccf28SAndroid Build Coastguard Worker #elif defined(__SSSE3__)
40*e1eccf28SAndroid Build Coastguard Worker const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
41*e1eccf28SAndroid Build Coastguard Worker const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
42*e1eccf28SAndroid Build Coastguard Worker const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
43*e1eccf28SAndroid Build Coastguard Worker const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
44*e1eccf28SAndroid Build Coastguard Worker lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
45*e1eccf28SAndroid Build Coastguard Worker lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
46*e1eccf28SAndroid Build Coastguard Worker hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
47*e1eccf28SAndroid Build Coastguard Worker hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
48*e1eccf28SAndroid Build Coastguard Worker return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
49*e1eccf28SAndroid Build Coastguard Worker _mm_shuffle_epi8(hi, M32to16H));
50*e1eccf28SAndroid Build Coastguard Worker #else
51*e1eccf28SAndroid Build Coastguard Worker # error "Require at least SSSE3"
52*e1eccf28SAndroid Build Coastguard Worker #endif
53*e1eccf28SAndroid Build Coastguard Worker }
54*e1eccf28SAndroid Build Coastguard Worker
mullo_epi32(__m128i x,__m128i y)55*e1eccf28SAndroid Build Coastguard Worker static inline __m128i mullo_epi32(__m128i x, __m128i y) {
56*e1eccf28SAndroid Build Coastguard Worker #if defined(__SSE4_1__)
57*e1eccf28SAndroid Build Coastguard Worker return _mm_mullo_epi32(x, y);
58*e1eccf28SAndroid Build Coastguard Worker #elif defined(__SSSE3__)
59*e1eccf28SAndroid Build Coastguard Worker const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
60*e1eccf28SAndroid Build Coastguard Worker __m128i even = _mm_mul_epu32(x, y);
61*e1eccf28SAndroid Build Coastguard Worker __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
62*e1eccf28SAndroid Build Coastguard Worker _mm_srli_si128(y, 4));
63*e1eccf28SAndroid Build Coastguard Worker even = _mm_and_si128(even, Meven);
64*e1eccf28SAndroid Build Coastguard Worker odd = _mm_and_si128(odd, Meven);
65*e1eccf28SAndroid Build Coastguard Worker return _mm_or_si128(even, _mm_slli_si128(odd, 4));
66*e1eccf28SAndroid Build Coastguard Worker #else
67*e1eccf28SAndroid Build Coastguard Worker # error "Require at least SSSE3"
68*e1eccf28SAndroid Build Coastguard Worker #endif
69*e1eccf28SAndroid Build Coastguard Worker }
70*e1eccf28SAndroid Build Coastguard Worker
71*e1eccf28SAndroid Build Coastguard Worker /* 'mask' must packed 8-bit of 0x00 or 0xff */
blendv_epi8(__m128i x,__m128i y,__m128i mask)72*e1eccf28SAndroid Build Coastguard Worker static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
73*e1eccf28SAndroid Build Coastguard Worker #if defined(__SSE4_1__)
74*e1eccf28SAndroid Build Coastguard Worker return _mm_blendv_epi8(x, y, mask);
75*e1eccf28SAndroid Build Coastguard Worker #elif defined(__SSSE3__)
76*e1eccf28SAndroid Build Coastguard Worker return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
77*e1eccf28SAndroid Build Coastguard Worker #else
78*e1eccf28SAndroid Build Coastguard Worker # error "Require at least SSSE3"
79*e1eccf28SAndroid Build Coastguard Worker #endif
80*e1eccf28SAndroid Build Coastguard Worker }
81*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicConvolve3x3_K(void * dst,const void * y0,const void * y1,const void * y2,const short * coef,uint32_t count)82*e1eccf28SAndroid Build Coastguard Worker extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
83*e1eccf28SAndroid Build Coastguard Worker const void *y1, const void *y2,
84*e1eccf28SAndroid Build Coastguard Worker const short *coef, uint32_t count) {
85*e1eccf28SAndroid Build Coastguard Worker __m128i x;
86*e1eccf28SAndroid Build Coastguard Worker __m128i c0, c2, c4, c6, c8;
87*e1eccf28SAndroid Build Coastguard Worker __m128i r0, r1, r2;
88*e1eccf28SAndroid Build Coastguard Worker __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
89*e1eccf28SAndroid Build Coastguard Worker __m128i o0, o1;
90*e1eccf28SAndroid Build Coastguard Worker uint32_t i;
91*e1eccf28SAndroid Build Coastguard Worker
92*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+0));
93*e1eccf28SAndroid Build Coastguard Worker c0 = _mm_shuffle_epi32(x, 0x00);
94*e1eccf28SAndroid Build Coastguard Worker c2 = _mm_shuffle_epi32(x, 0x55);
95*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+4));
96*e1eccf28SAndroid Build Coastguard Worker c4 = _mm_shuffle_epi32(x, 0x00);
97*e1eccf28SAndroid Build Coastguard Worker c6 = _mm_shuffle_epi32(x, 0x55);
98*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+8));
99*e1eccf28SAndroid Build Coastguard Worker c8 = _mm_shuffle_epi32(x, 0x00);
100*e1eccf28SAndroid Build Coastguard Worker
101*e1eccf28SAndroid Build Coastguard Worker for (i = 0; i < count; ++i) {
102*e1eccf28SAndroid Build Coastguard Worker
103*e1eccf28SAndroid Build Coastguard Worker p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
104*e1eccf28SAndroid Build Coastguard Worker p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
105*e1eccf28SAndroid Build Coastguard Worker p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
106*e1eccf28SAndroid Build Coastguard Worker p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
107*e1eccf28SAndroid Build Coastguard Worker p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
108*e1eccf28SAndroid Build Coastguard Worker p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
109*e1eccf28SAndroid Build Coastguard Worker p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
110*e1eccf28SAndroid Build Coastguard Worker p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
111*e1eccf28SAndroid Build Coastguard Worker p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
112*e1eccf28SAndroid Build Coastguard Worker p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
113*e1eccf28SAndroid Build Coastguard Worker p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
114*e1eccf28SAndroid Build Coastguard Worker p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
115*e1eccf28SAndroid Build Coastguard Worker
116*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
117*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
118*e1eccf28SAndroid Build Coastguard Worker
119*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
120*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
121*e1eccf28SAndroid Build Coastguard Worker
122*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
123*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
124*e1eccf28SAndroid Build Coastguard Worker
125*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
126*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
127*e1eccf28SAndroid Build Coastguard Worker
128*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
129*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
130*e1eccf28SAndroid Build Coastguard Worker
131*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_srai_epi32(o0, 8);
132*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_srai_epi32(o1, 8);
133*e1eccf28SAndroid Build Coastguard Worker
134*e1eccf28SAndroid Build Coastguard Worker o0 = packus_epi32(o0, o1);
135*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_packus_epi16(o0, o0);
136*e1eccf28SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)dst, o0);
137*e1eccf28SAndroid Build Coastguard Worker
138*e1eccf28SAndroid Build Coastguard Worker y0 = (const char *)y0 + 8;
139*e1eccf28SAndroid Build Coastguard Worker y1 = (const char *)y1 + 8;
140*e1eccf28SAndroid Build Coastguard Worker y2 = (const char *)y2 + 8;
141*e1eccf28SAndroid Build Coastguard Worker dst = (char *)dst + 8;
142*e1eccf28SAndroid Build Coastguard Worker }
143*e1eccf28SAndroid Build Coastguard Worker }
144*e1eccf28SAndroid Build Coastguard Worker
/*
 * Multiply pixels made of four 8-bit channels by a 4x4 matrix of signed
 * 16-bit coefficients, four pixels (16 bytes) per iteration.
 *
 * Naming a pixel's channel bytes (x, y, z, w) in memory order, output
 * channel k (k = 0..3) is
 *     (x*coef[k] + y*coef[4+k] + z*coef[8+k] + w*coef[12+k]) >> 8
 * clamped to 0..255.
 *
 *   dst    output; 16 bytes are stored per iteration (unaligned store)
 *   src    input; 16 bytes are loaded per iteration (unaligned load)
 *   coef   the 16 matrix coefficients
 *   count  number of iterations (groups of four pixels)
 */
void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    /* Byte shuffle that turns four 4-byte channel groups (xxxx yyyy zzzz
     * wwww) back into four interleaved pixels (xyzw xyzw xyzw xyzw). */
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);

    /* Byte shuffles that zero-extend channels 0/1 (Mxy) and channels 2/3
     * (Mzw) of every pixel into adjacent 16-bit lanes. */
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    /* c0 = (coef[0],coef[4], coef[1],coef[5], coef[2],coef[6], coef[3],coef[7]) */
    c0 = _mm_loadl_epi64((const __m128i *)(coef + 0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef + 4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    /* c2 = (coef[8],coef[12], coef[9],coef[13], coef[10],coef[14], coef[11],coef[15]) */
    c2 = _mm_loadl_epi64((const __m128i *)(coef + 8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef + 12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    /* Loop-invariant broadcasts of the coefficient pair for each output
     * channel: *_xy multiplies channels 0/1, *_zw multiplies channels 2/3. */
    const __m128i k0_xy = _mm_shuffle_epi32(c0, 0x00);
    const __m128i k1_xy = _mm_shuffle_epi32(c0, 0x55);
    const __m128i k2_xy = _mm_shuffle_epi32(c0, 0xaa);
    const __m128i k3_xy = _mm_shuffle_epi32(c0, 0xff);
    const __m128i k0_zw = _mm_shuffle_epi32(c2, 0x00);
    const __m128i k1_zw = _mm_shuffle_epi32(c2, 0x55);
    const __m128i k2_zw = _mm_shuffle_epi32(c2, 0xaa);
    const __m128i k3_zw = _mm_shuffle_epi32(c2, 0xff);

    for (i = 0; i < count; ++i) {
        /* Unaligned load: nothing here guarantees 16-byte alignment of src,
         * and the store below is unaligned as well. */
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        /* One 32-bit lane per pixel; x2..w2 hold output channels 0..3. */
        x2 = _mm_add_epi32(_mm_madd_epi16(xy, k0_xy), _mm_madd_epi16(zw, k0_zw));
        y2 = _mm_add_epi32(_mm_madd_epi16(xy, k1_xy), _mm_madd_epi16(zw, k1_zw));
        z2 = _mm_add_epi32(_mm_madd_epi16(xy, k2_xy), _mm_madd_epi16(zw, k2_zw));
        w2 = _mm_add_epi32(_mm_madd_epi16(xy, k3_xy), _mm_madd_epi16(zw, k3_zw));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srai_epi32(w2, 8);

        /* Narrow with SIGNED saturation first: _mm_packus_epi16 interprets
         * its input as signed 16-bit, so an unsigned 16-bit intermediate
         * >= 0x8000 would be read as negative and come out as 0 rather
         * than 255. */
        x2 = _mm_packs_epi32(x2, y2);
        z2 = _mm_packs_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
199*e1eccf28SAndroid Build Coastguard Worker
/*
 * Colour-matrix kernel that recomputes the first three channels of each
 * pixel and copies the fourth channel through unchanged.  Pixels are four
 * 8-bit channels; four pixels (16 bytes) are processed per iteration.
 *
 * Naming a pixel's channel bytes (x, y, z, w) in memory order, output
 * channel k (k = 0..2) is
 *     (x*coef[k] + y*coef[4+k] + z*coef[8+k] + w*coef[12+k]) >> 8
 * clamped to 0..255, and output channel 3 is w.  Note that coef[12..14]
 * take part in the sums, so they must be zero for a pure 3x3 transform
 * (presumably the caller guarantees this -- TODO confirm).
 *
 *   dst    output; 16 bytes are stored per iteration (unaligned store)
 *   src    input; 16 bytes are loaded per iteration (unaligned load)
 *   coef   coefficients laid out as for the 4x4 kernel; coef[0..15] are read
 *   count  number of iterations (groups of four pixels)
 */
void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    /* Byte shuffle that turns four 4-byte channel groups (xxxx yyyy zzzz
     * wwww) back into four interleaved pixels (xyzw xyzw xyzw xyzw). */
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);

    /* Byte shuffles that zero-extend channels 0/1 (Mxy) and channels 2/3
     * (Mzw) of every pixel into adjacent 16-bit lanes. */
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);

    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    /* c0 = (coef[0],coef[4], coef[1],coef[5], coef[2],coef[6], coef[3],coef[7]) */
    c0 = _mm_loadl_epi64((const __m128i *)(coef + 0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef + 4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    /* c2 = (coef[8],coef[12], coef[9],coef[13], coef[10],coef[14], coef[11],coef[15]) */
    c2 = _mm_loadl_epi64((const __m128i *)(coef + 8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef + 12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    /* Loop-invariant broadcasts of the coefficient pair for each computed
     * channel: *_xy multiplies channels 0/1, *_zw multiplies channels 2/3. */
    const __m128i k0_xy = _mm_shuffle_epi32(c0, 0x00);
    const __m128i k1_xy = _mm_shuffle_epi32(c0, 0x55);
    const __m128i k2_xy = _mm_shuffle_epi32(c0, 0xaa);
    const __m128i k0_zw = _mm_shuffle_epi32(c2, 0x00);
    const __m128i k1_zw = _mm_shuffle_epi32(c2, 0x55);
    const __m128i k2_zw = _mm_shuffle_epi32(c2, 0xaa);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        /* One 32-bit lane per pixel; x2..z2 hold output channels 0..2. */
        x2 = _mm_add_epi32(_mm_madd_epi16(xy, k0_xy), _mm_madd_epi16(zw, k0_zw));
        y2 = _mm_add_epi32(_mm_madd_epi16(xy, k1_xy), _mm_madd_epi16(zw, k1_zw));
        z2 = _mm_add_epi32(_mm_madd_epi16(xy, k2_xy), _mm_madd_epi16(zw, k2_zw));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        /* Each 32-bit lane of zw is (z | w << 16); the shift leaves w. */
        w2 = _mm_srli_epi32(zw, 16);

        /* Narrow with SIGNED saturation first: _mm_packus_epi16 interprets
         * its input as signed 16-bit, so an unsigned 16-bit intermediate
         * >= 0x8000 would be read as negative and come out as 0 rather
         * than 255. */
        x2 = _mm_packs_epi32(x2, y2);
        z2 = _mm_packs_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
253*e1eccf28SAndroid Build Coastguard Worker
/*
 * Colour-matrix kernel that writes one dot product to the first three
 * channels of each pixel and copies the fourth channel through unchanged.
 * Pixels are four 8-bit channels; four pixels (16 bytes) are processed per
 * iteration.
 *
 * Naming a pixel's channel bytes (x, y, z, w) in memory order, the value
 *     v = (x*coef[0] + y*coef[4] + z*coef[8] + w*coef[12]) >> 8
 * is clamped to 0..255 and the output pixel is (v, v, v, w).
 *
 *   dst    output; 16 bytes are stored per iteration (unaligned store)
 *   src    input; 16 bytes are loaded per iteration (unaligned load)
 *   coef   coefficients; only coef[0], coef[4], coef[8] and coef[12] are
 *          used, but coef[0..15] are read by the 64-bit loads
 *   count  number of iterations (groups of four pixels)
 */
void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    /* Byte shuffle that turns four 4-byte channel groups (xxxx yyyy zzzz
     * wwww) back into four interleaved pixels (xyzw xyzw xyzw xyzw). */
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);
    /* Byte shuffles that zero-extend channels 0/1 (Mxy) and channels 2/3
     * (Mzw) of every pixel into adjacent 16-bit lanes. */
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, z2, w2;
    uint32_t i;

    /* c0 = (coef[0], coef[4]) repeated in all four 32-bit lanes. */
    c0 = _mm_loadl_epi64((const __m128i *)(coef + 0));
    c0 = _mm_shufflelo_epi16(c0, 0);
    c1 = _mm_loadl_epi64((const __m128i *)(coef + 4));
    c1 = _mm_shufflelo_epi16(c1, 0);
    c0 = _mm_unpacklo_epi16(c0, c1);

    /* c2 = (coef[8], coef[12]) repeated in all four 32-bit lanes. */
    c2 = _mm_loadl_epi64((const __m128i *)(coef + 8));
    c2 = _mm_shufflelo_epi16(c2, 0);
    c3 = _mm_loadl_epi64((const __m128i *)(coef + 12));
    c3 = _mm_shufflelo_epi16(c3, 0);
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);

        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        /* One 32-bit lane per pixel holding the dot product. */
        x2 = _mm_madd_epi16(xy, c0);
        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));

        x2 = _mm_srai_epi32(x2, 8);
        /* Each 32-bit lane of zw is (z | w << 16); the shift leaves w. */
        w2 = _mm_srli_epi32(zw, 16);

        /* Narrow with SIGNED saturation first: _mm_packus_epi16 interprets
         * its input as signed 16-bit, so an unsigned 16-bit intermediate
         * >= 0x8000 would be read as negative and come out as 0 rather
         * than 255.  The dot product fills channels 0, 1 and 2. */
        z2 = _mm_packs_epi32(x2, w2);
        x2 = _mm_packs_epi32(x2, x2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
305*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicBlurVFU4_K(void * dst,const void * pin,int stride,const void * gptr,int rct,int x1,int x2)306*e1eccf28SAndroid Build Coastguard Worker void rsdIntrinsicBlurVFU4_K(void *dst,
307*e1eccf28SAndroid Build Coastguard Worker const void *pin, int stride, const void *gptr,
308*e1eccf28SAndroid Build Coastguard Worker int rct, int x1, int x2) {
309*e1eccf28SAndroid Build Coastguard Worker const char *pi;
310*e1eccf28SAndroid Build Coastguard Worker __m128i pi0, pi1;
311*e1eccf28SAndroid Build Coastguard Worker __m128 pf0, pf1;
312*e1eccf28SAndroid Build Coastguard Worker __m128 bp0, bp1;
313*e1eccf28SAndroid Build Coastguard Worker __m128 x;
314*e1eccf28SAndroid Build Coastguard Worker int r;
315*e1eccf28SAndroid Build Coastguard Worker
316*e1eccf28SAndroid Build Coastguard Worker for (; x1 < x2; x1 += 2) {
317*e1eccf28SAndroid Build Coastguard Worker pi = (const char *)pin + (x1 << 2);
318*e1eccf28SAndroid Build Coastguard Worker bp0 = _mm_setzero_ps();
319*e1eccf28SAndroid Build Coastguard Worker bp1 = _mm_setzero_ps();
320*e1eccf28SAndroid Build Coastguard Worker
321*e1eccf28SAndroid Build Coastguard Worker for (r = 0; r < rct; ++r) {
322*e1eccf28SAndroid Build Coastguard Worker x = _mm_load_ss((const float *)gptr + r);
323*e1eccf28SAndroid Build Coastguard Worker x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
324*e1eccf28SAndroid Build Coastguard Worker
325*e1eccf28SAndroid Build Coastguard Worker pi0 = _mm_cvtsi32_si128(*(const int *)pi);
326*e1eccf28SAndroid Build Coastguard Worker pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
327*e1eccf28SAndroid Build Coastguard Worker
328*e1eccf28SAndroid Build Coastguard Worker pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
329*e1eccf28SAndroid Build Coastguard Worker pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
330*e1eccf28SAndroid Build Coastguard Worker
331*e1eccf28SAndroid Build Coastguard Worker bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
332*e1eccf28SAndroid Build Coastguard Worker bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
333*e1eccf28SAndroid Build Coastguard Worker
334*e1eccf28SAndroid Build Coastguard Worker pi += stride;
335*e1eccf28SAndroid Build Coastguard Worker }
336*e1eccf28SAndroid Build Coastguard Worker
337*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_ps((float *)dst, bp0);
338*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_ps((float *)dst + 4, bp1);
339*e1eccf28SAndroid Build Coastguard Worker dst = (char *)dst + 32;
340*e1eccf28SAndroid Build Coastguard Worker }
341*e1eccf28SAndroid Build Coastguard Worker }
342*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicBlurHFU4_K(void * dst,const void * pin,const void * gptr,int rct,int x1,int x2)343*e1eccf28SAndroid Build Coastguard Worker void rsdIntrinsicBlurHFU4_K(void *dst,
344*e1eccf28SAndroid Build Coastguard Worker const void *pin, const void *gptr,
345*e1eccf28SAndroid Build Coastguard Worker int rct, int x1, int x2) {
346*e1eccf28SAndroid Build Coastguard Worker const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
347*e1eccf28SAndroid Build Coastguard Worker const float *pi;
348*e1eccf28SAndroid Build Coastguard Worker __m128 pf, x, y;
349*e1eccf28SAndroid Build Coastguard Worker __m128i o;
350*e1eccf28SAndroid Build Coastguard Worker int r;
351*e1eccf28SAndroid Build Coastguard Worker
352*e1eccf28SAndroid Build Coastguard Worker for (; x1 < x2; ++x1) {
353*e1eccf28SAndroid Build Coastguard Worker /* rct is define as 2*r+1 by the caller */
354*e1eccf28SAndroid Build Coastguard Worker x = _mm_load_ss((const float *)gptr);
355*e1eccf28SAndroid Build Coastguard Worker x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
356*e1eccf28SAndroid Build Coastguard Worker
357*e1eccf28SAndroid Build Coastguard Worker pi = (const float *)pin + (x1 << 2);
358*e1eccf28SAndroid Build Coastguard Worker pf = _mm_mul_ps(x, _mm_load_ps(pi));
359*e1eccf28SAndroid Build Coastguard Worker
360*e1eccf28SAndroid Build Coastguard Worker for (r = 1; r < rct; r += 2) {
361*e1eccf28SAndroid Build Coastguard Worker x = _mm_load_ss((const float *)gptr + r);
362*e1eccf28SAndroid Build Coastguard Worker y = _mm_load_ss((const float *)gptr + r + 1);
363*e1eccf28SAndroid Build Coastguard Worker x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
364*e1eccf28SAndroid Build Coastguard Worker y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
365*e1eccf28SAndroid Build Coastguard Worker
366*e1eccf28SAndroid Build Coastguard Worker pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
367*e1eccf28SAndroid Build Coastguard Worker pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
368*e1eccf28SAndroid Build Coastguard Worker }
369*e1eccf28SAndroid Build Coastguard Worker
370*e1eccf28SAndroid Build Coastguard Worker o = _mm_cvtps_epi32(pf);
371*e1eccf28SAndroid Build Coastguard Worker *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
372*e1eccf28SAndroid Build Coastguard Worker dst = (char *)dst + 4;
373*e1eccf28SAndroid Build Coastguard Worker }
374*e1eccf28SAndroid Build Coastguard Worker }
375*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicBlurHFU1_K(void * dst,const void * pin,const void * gptr,int rct,int x1,int x2)376*e1eccf28SAndroid Build Coastguard Worker void rsdIntrinsicBlurHFU1_K(void *dst,
377*e1eccf28SAndroid Build Coastguard Worker const void *pin, const void *gptr,
378*e1eccf28SAndroid Build Coastguard Worker int rct, int x1, int x2) {
379*e1eccf28SAndroid Build Coastguard Worker const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
380*e1eccf28SAndroid Build Coastguard Worker const float *pi;
381*e1eccf28SAndroid Build Coastguard Worker __m128 pf, g0, g1, g2, g3, gx, p0, p1;
382*e1eccf28SAndroid Build Coastguard Worker __m128i o;
383*e1eccf28SAndroid Build Coastguard Worker int r;
384*e1eccf28SAndroid Build Coastguard Worker
385*e1eccf28SAndroid Build Coastguard Worker for (; x1 < x2; x1+=4) {
386*e1eccf28SAndroid Build Coastguard Worker g0 = _mm_load_ss((const float *)gptr);
387*e1eccf28SAndroid Build Coastguard Worker g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
388*e1eccf28SAndroid Build Coastguard Worker
389*e1eccf28SAndroid Build Coastguard Worker pi = (const float *)pin + x1;
390*e1eccf28SAndroid Build Coastguard Worker pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
391*e1eccf28SAndroid Build Coastguard Worker
392*e1eccf28SAndroid Build Coastguard Worker for (r = 1; r < rct; r += 4) {
393*e1eccf28SAndroid Build Coastguard Worker gx = _mm_loadu_ps((const float *)gptr + r);
394*e1eccf28SAndroid Build Coastguard Worker p0 = _mm_loadu_ps(pi + r);
395*e1eccf28SAndroid Build Coastguard Worker p1 = _mm_loadu_ps(pi + r + 4);
396*e1eccf28SAndroid Build Coastguard Worker
397*e1eccf28SAndroid Build Coastguard Worker g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
398*e1eccf28SAndroid Build Coastguard Worker pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
399*e1eccf28SAndroid Build Coastguard Worker g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
400*e1eccf28SAndroid Build Coastguard Worker pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_alignr_epi8(p1, p0, 4)));
401*e1eccf28SAndroid Build Coastguard Worker g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
402*e1eccf28SAndroid Build Coastguard Worker pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_alignr_epi8(p1, p0, 8)));
403*e1eccf28SAndroid Build Coastguard Worker g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
404*e1eccf28SAndroid Build Coastguard Worker pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_alignr_epi8(p1, p0, 12)));
405*e1eccf28SAndroid Build Coastguard Worker }
406*e1eccf28SAndroid Build Coastguard Worker
407*e1eccf28SAndroid Build Coastguard Worker o = _mm_cvtps_epi32(pf);
408*e1eccf28SAndroid Build Coastguard Worker *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
409*e1eccf28SAndroid Build Coastguard Worker dst = (char *)dst + 4;
410*e1eccf28SAndroid Build Coastguard Worker }
411*e1eccf28SAndroid Build Coastguard Worker }
412*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicYuv_K(void * dst,const unsigned char * pY,const unsigned char * pUV,uint32_t count,const short * param)413*e1eccf28SAndroid Build Coastguard Worker void rsdIntrinsicYuv_K(void *dst,
414*e1eccf28SAndroid Build Coastguard Worker const unsigned char *pY, const unsigned char *pUV,
415*e1eccf28SAndroid Build Coastguard Worker uint32_t count, const short *param) {
416*e1eccf28SAndroid Build Coastguard Worker __m128i biasY, biasUV;
417*e1eccf28SAndroid Build Coastguard Worker __m128i c0, c1, c2, c3, c4;
418*e1eccf28SAndroid Build Coastguard Worker
419*e1eccf28SAndroid Build Coastguard Worker biasY = _mm_set1_epi32(param[8]); /* 16 */
420*e1eccf28SAndroid Build Coastguard Worker biasUV = _mm_set1_epi32(param[16]); /* 128 */
421*e1eccf28SAndroid Build Coastguard Worker
422*e1eccf28SAndroid Build Coastguard Worker c0 = _mm_set1_epi32(param[0]); /* 298 */
423*e1eccf28SAndroid Build Coastguard Worker c1 = _mm_set1_epi32(param[1]); /* 409 */
424*e1eccf28SAndroid Build Coastguard Worker c2 = _mm_set1_epi32(param[2]); /* -100 */
425*e1eccf28SAndroid Build Coastguard Worker c3 = _mm_set1_epi32(param[3]); /* 516 */
426*e1eccf28SAndroid Build Coastguard Worker c4 = _mm_set1_epi32(param[4]); /* -208 */
427*e1eccf28SAndroid Build Coastguard Worker
428*e1eccf28SAndroid Build Coastguard Worker __m128i Y, UV, U, V, R, G, B, A;
429*e1eccf28SAndroid Build Coastguard Worker
430*e1eccf28SAndroid Build Coastguard Worker A = _mm_set1_epi32(255);
431*e1eccf28SAndroid Build Coastguard Worker uint32_t i;
432*e1eccf28SAndroid Build Coastguard Worker
433*e1eccf28SAndroid Build Coastguard Worker for (i = 0; i < (count << 1); ++i) {
434*e1eccf28SAndroid Build Coastguard Worker Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
435*e1eccf28SAndroid Build Coastguard Worker UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
436*e1eccf28SAndroid Build Coastguard Worker
437*e1eccf28SAndroid Build Coastguard Worker Y = _mm_sub_epi32(Y, biasY);
438*e1eccf28SAndroid Build Coastguard Worker UV = _mm_sub_epi32(UV, biasUV);
439*e1eccf28SAndroid Build Coastguard Worker
440*e1eccf28SAndroid Build Coastguard Worker U = _mm_shuffle_epi32(UV, 0xf5);
441*e1eccf28SAndroid Build Coastguard Worker V = _mm_shuffle_epi32(UV, 0xa0);
442*e1eccf28SAndroid Build Coastguard Worker
443*e1eccf28SAndroid Build Coastguard Worker Y = mullo_epi32(Y, c0);
444*e1eccf28SAndroid Build Coastguard Worker
445*e1eccf28SAndroid Build Coastguard Worker R = _mm_add_epi32(Y, mullo_epi32(V, c1));
446*e1eccf28SAndroid Build Coastguard Worker R = _mm_add_epi32(R, biasUV);
447*e1eccf28SAndroid Build Coastguard Worker R = _mm_srai_epi32(R, 8);
448*e1eccf28SAndroid Build Coastguard Worker
449*e1eccf28SAndroid Build Coastguard Worker G = _mm_add_epi32(Y, mullo_epi32(U, c2));
450*e1eccf28SAndroid Build Coastguard Worker G = _mm_add_epi32(G, mullo_epi32(V, c4));
451*e1eccf28SAndroid Build Coastguard Worker G = _mm_add_epi32(G, biasUV);
452*e1eccf28SAndroid Build Coastguard Worker G = _mm_srai_epi32(G, 8);
453*e1eccf28SAndroid Build Coastguard Worker
454*e1eccf28SAndroid Build Coastguard Worker B = _mm_add_epi32(Y, mullo_epi32(U, c3));
455*e1eccf28SAndroid Build Coastguard Worker B = _mm_add_epi32(B, biasUV);
456*e1eccf28SAndroid Build Coastguard Worker B = _mm_srai_epi32(B, 8);
457*e1eccf28SAndroid Build Coastguard Worker
458*e1eccf28SAndroid Build Coastguard Worker __m128i y1, y2, y3, y4;
459*e1eccf28SAndroid Build Coastguard Worker
460*e1eccf28SAndroid Build Coastguard Worker y1 = packus_epi32(R, G);
461*e1eccf28SAndroid Build Coastguard Worker y2 = packus_epi32(B, A);
462*e1eccf28SAndroid Build Coastguard Worker y3 = _mm_packus_epi16(y1, y2);
463*e1eccf28SAndroid Build Coastguard Worker const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
464*e1eccf28SAndroid Build Coastguard Worker 14, 10, 6, 2,
465*e1eccf28SAndroid Build Coastguard Worker 13, 9, 5, 1,
466*e1eccf28SAndroid Build Coastguard Worker 12, 8, 4, 0);
467*e1eccf28SAndroid Build Coastguard Worker y4 = _mm_shuffle_epi8(y3, T4x4);
468*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)dst, y4);
469*e1eccf28SAndroid Build Coastguard Worker pY += 4;
470*e1eccf28SAndroid Build Coastguard Worker pUV += 4;
471*e1eccf28SAndroid Build Coastguard Worker dst = (__m128i *)dst + 1;
472*e1eccf28SAndroid Build Coastguard Worker }
473*e1eccf28SAndroid Build Coastguard Worker }
474*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicYuvR_K(void * dst,const unsigned char * pY,const unsigned char * pUV,uint32_t count,const short * param)475*e1eccf28SAndroid Build Coastguard Worker void rsdIntrinsicYuvR_K(void *dst,
476*e1eccf28SAndroid Build Coastguard Worker const unsigned char *pY, const unsigned char *pUV,
477*e1eccf28SAndroid Build Coastguard Worker uint32_t count, const short *param) {
478*e1eccf28SAndroid Build Coastguard Worker __m128i biasY, biasUV;
479*e1eccf28SAndroid Build Coastguard Worker __m128i c0, c1, c2, c3, c4;
480*e1eccf28SAndroid Build Coastguard Worker
481*e1eccf28SAndroid Build Coastguard Worker biasY = _mm_set1_epi32(param[8]); /* 16 */
482*e1eccf28SAndroid Build Coastguard Worker biasUV = _mm_set1_epi32(param[16]); /* 128 */
483*e1eccf28SAndroid Build Coastguard Worker
484*e1eccf28SAndroid Build Coastguard Worker c0 = _mm_set1_epi32(param[0]); /* 298 */
485*e1eccf28SAndroid Build Coastguard Worker c1 = _mm_set1_epi32(param[1]); /* 409 */
486*e1eccf28SAndroid Build Coastguard Worker c2 = _mm_set1_epi32(param[2]); /* -100 */
487*e1eccf28SAndroid Build Coastguard Worker c3 = _mm_set1_epi32(param[3]); /* 516 */
488*e1eccf28SAndroid Build Coastguard Worker c4 = _mm_set1_epi32(param[4]); /* -208 */
489*e1eccf28SAndroid Build Coastguard Worker
490*e1eccf28SAndroid Build Coastguard Worker __m128i Y, UV, U, V, R, G, B, A;
491*e1eccf28SAndroid Build Coastguard Worker
492*e1eccf28SAndroid Build Coastguard Worker A = _mm_set1_epi32(255);
493*e1eccf28SAndroid Build Coastguard Worker uint32_t i;
494*e1eccf28SAndroid Build Coastguard Worker
495*e1eccf28SAndroid Build Coastguard Worker for (i = 0; i < (count << 1); ++i) {
496*e1eccf28SAndroid Build Coastguard Worker Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
497*e1eccf28SAndroid Build Coastguard Worker UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
498*e1eccf28SAndroid Build Coastguard Worker
499*e1eccf28SAndroid Build Coastguard Worker Y = _mm_sub_epi32(Y, biasY);
500*e1eccf28SAndroid Build Coastguard Worker UV = _mm_sub_epi32(UV, biasUV);
501*e1eccf28SAndroid Build Coastguard Worker
502*e1eccf28SAndroid Build Coastguard Worker V = _mm_shuffle_epi32(UV, 0xf5);
503*e1eccf28SAndroid Build Coastguard Worker U = _mm_shuffle_epi32(UV, 0xa0);
504*e1eccf28SAndroid Build Coastguard Worker
505*e1eccf28SAndroid Build Coastguard Worker Y = mullo_epi32(Y, c0);
506*e1eccf28SAndroid Build Coastguard Worker
507*e1eccf28SAndroid Build Coastguard Worker R = _mm_add_epi32(Y, mullo_epi32(V, c1));
508*e1eccf28SAndroid Build Coastguard Worker R = _mm_add_epi32(R, biasUV);
509*e1eccf28SAndroid Build Coastguard Worker R = _mm_srai_epi32(R, 8);
510*e1eccf28SAndroid Build Coastguard Worker
511*e1eccf28SAndroid Build Coastguard Worker G = _mm_add_epi32(Y, mullo_epi32(U, c2));
512*e1eccf28SAndroid Build Coastguard Worker G = _mm_add_epi32(G, mullo_epi32(V, c4));
513*e1eccf28SAndroid Build Coastguard Worker G = _mm_add_epi32(G, biasUV);
514*e1eccf28SAndroid Build Coastguard Worker G = _mm_srai_epi32(G, 8);
515*e1eccf28SAndroid Build Coastguard Worker
516*e1eccf28SAndroid Build Coastguard Worker B = _mm_add_epi32(Y, mullo_epi32(U, c3));
517*e1eccf28SAndroid Build Coastguard Worker B = _mm_add_epi32(B, biasUV);
518*e1eccf28SAndroid Build Coastguard Worker B = _mm_srai_epi32(B, 8);
519*e1eccf28SAndroid Build Coastguard Worker
520*e1eccf28SAndroid Build Coastguard Worker __m128i y1, y2, y3, y4;
521*e1eccf28SAndroid Build Coastguard Worker
522*e1eccf28SAndroid Build Coastguard Worker y1 = packus_epi32(R, G);
523*e1eccf28SAndroid Build Coastguard Worker y2 = packus_epi32(B, A);
524*e1eccf28SAndroid Build Coastguard Worker y3 = _mm_packus_epi16(y1, y2);
525*e1eccf28SAndroid Build Coastguard Worker const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
526*e1eccf28SAndroid Build Coastguard Worker 14, 10, 6, 2,
527*e1eccf28SAndroid Build Coastguard Worker 13, 9, 5, 1,
528*e1eccf28SAndroid Build Coastguard Worker 12, 8, 4, 0);
529*e1eccf28SAndroid Build Coastguard Worker y4 = _mm_shuffle_epi8(y3, T4x4);
530*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)dst, y4);
531*e1eccf28SAndroid Build Coastguard Worker pY += 4;
532*e1eccf28SAndroid Build Coastguard Worker pUV += 4;
533*e1eccf28SAndroid Build Coastguard Worker dst = (__m128i *)dst + 1;
534*e1eccf28SAndroid Build Coastguard Worker }
535*e1eccf28SAndroid Build Coastguard Worker }
536*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicYuv2_K(void * dst,const unsigned char * pY,const unsigned char * pU,const unsigned char * pV,uint32_t count,const short * param)537*e1eccf28SAndroid Build Coastguard Worker void rsdIntrinsicYuv2_K(void *dst,
538*e1eccf28SAndroid Build Coastguard Worker const unsigned char *pY, const unsigned char *pU,
539*e1eccf28SAndroid Build Coastguard Worker const unsigned char *pV, uint32_t count, const short *param) {
540*e1eccf28SAndroid Build Coastguard Worker __m128i biasY, biasUV;
541*e1eccf28SAndroid Build Coastguard Worker __m128i c0, c1, c2, c3, c4;
542*e1eccf28SAndroid Build Coastguard Worker
543*e1eccf28SAndroid Build Coastguard Worker biasY = _mm_set1_epi32(param[8]); /* 16 */
544*e1eccf28SAndroid Build Coastguard Worker biasUV = _mm_set1_epi32(param[16]); /* 128 */
545*e1eccf28SAndroid Build Coastguard Worker
546*e1eccf28SAndroid Build Coastguard Worker c0 = _mm_set1_epi32(param[0]); /* 298 */
547*e1eccf28SAndroid Build Coastguard Worker c1 = _mm_set1_epi32(param[1]); /* 409 */
548*e1eccf28SAndroid Build Coastguard Worker c2 = _mm_set1_epi32(param[2]); /* -100 */
549*e1eccf28SAndroid Build Coastguard Worker c3 = _mm_set1_epi32(param[3]); /* 516 */
550*e1eccf28SAndroid Build Coastguard Worker c4 = _mm_set1_epi32(param[4]); /* -208 */
551*e1eccf28SAndroid Build Coastguard Worker
552*e1eccf28SAndroid Build Coastguard Worker __m128i Y, U, V, R, G, B, A;
553*e1eccf28SAndroid Build Coastguard Worker
554*e1eccf28SAndroid Build Coastguard Worker A = _mm_set1_epi32(255);
555*e1eccf28SAndroid Build Coastguard Worker uint32_t i;
556*e1eccf28SAndroid Build Coastguard Worker
557*e1eccf28SAndroid Build Coastguard Worker for (i = 0; i < (count << 1); ++i) {
558*e1eccf28SAndroid Build Coastguard Worker Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
559*e1eccf28SAndroid Build Coastguard Worker U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
560*e1eccf28SAndroid Build Coastguard Worker V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
561*e1eccf28SAndroid Build Coastguard Worker
562*e1eccf28SAndroid Build Coastguard Worker Y = _mm_sub_epi32(Y, biasY);
563*e1eccf28SAndroid Build Coastguard Worker U = _mm_sub_epi32(U, biasUV);
564*e1eccf28SAndroid Build Coastguard Worker V = _mm_sub_epi32(V, biasUV);
565*e1eccf28SAndroid Build Coastguard Worker
566*e1eccf28SAndroid Build Coastguard Worker Y = mullo_epi32(Y, c0);
567*e1eccf28SAndroid Build Coastguard Worker
568*e1eccf28SAndroid Build Coastguard Worker R = _mm_add_epi32(Y, mullo_epi32(V, c1));
569*e1eccf28SAndroid Build Coastguard Worker R = _mm_add_epi32(R, biasUV);
570*e1eccf28SAndroid Build Coastguard Worker R = _mm_srai_epi32(R, 8);
571*e1eccf28SAndroid Build Coastguard Worker
572*e1eccf28SAndroid Build Coastguard Worker G = _mm_add_epi32(Y, mullo_epi32(U, c2));
573*e1eccf28SAndroid Build Coastguard Worker G = _mm_add_epi32(G, mullo_epi32(V, c4));
574*e1eccf28SAndroid Build Coastguard Worker G = _mm_add_epi32(G, biasUV);
575*e1eccf28SAndroid Build Coastguard Worker G = _mm_srai_epi32(G, 8);
576*e1eccf28SAndroid Build Coastguard Worker
577*e1eccf28SAndroid Build Coastguard Worker B = _mm_add_epi32(Y, mullo_epi32(U, c3));
578*e1eccf28SAndroid Build Coastguard Worker B = _mm_add_epi32(B, biasUV);
579*e1eccf28SAndroid Build Coastguard Worker B = _mm_srai_epi32(B, 8);
580*e1eccf28SAndroid Build Coastguard Worker
581*e1eccf28SAndroid Build Coastguard Worker __m128i y1, y2, y3, y4;
582*e1eccf28SAndroid Build Coastguard Worker
583*e1eccf28SAndroid Build Coastguard Worker y1 = packus_epi32(R, G);
584*e1eccf28SAndroid Build Coastguard Worker y2 = packus_epi32(B, A);
585*e1eccf28SAndroid Build Coastguard Worker y3 = _mm_packus_epi16(y1, y2);
586*e1eccf28SAndroid Build Coastguard Worker const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
587*e1eccf28SAndroid Build Coastguard Worker 14, 10, 6, 2,
588*e1eccf28SAndroid Build Coastguard Worker 13, 9, 5, 1,
589*e1eccf28SAndroid Build Coastguard Worker 12, 8, 4, 0);
590*e1eccf28SAndroid Build Coastguard Worker y4 = _mm_shuffle_epi8(y3, T4x4);
591*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)dst, y4);
592*e1eccf28SAndroid Build Coastguard Worker pY += 4;
593*e1eccf28SAndroid Build Coastguard Worker pU += 4;
594*e1eccf28SAndroid Build Coastguard Worker pV += 4;
595*e1eccf28SAndroid Build Coastguard Worker dst = (__m128i *)dst + 1;
596*e1eccf28SAndroid Build Coastguard Worker }
597*e1eccf28SAndroid Build Coastguard Worker }
598*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicConvolve5x5_K(void * dst,const void * y0,const void * y1,const void * y2,const void * y3,const void * y4,const short * coef,uint32_t count)599*e1eccf28SAndroid Build Coastguard Worker extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
600*e1eccf28SAndroid Build Coastguard Worker const void *y1, const void *y2,
601*e1eccf28SAndroid Build Coastguard Worker const void *y3, const void *y4,
602*e1eccf28SAndroid Build Coastguard Worker const short *coef, uint32_t count) {
603*e1eccf28SAndroid Build Coastguard Worker __m128i x;
604*e1eccf28SAndroid Build Coastguard Worker __m128i c0, c2, c4, c6, c8, c10, c12;
605*e1eccf28SAndroid Build Coastguard Worker __m128i c14, c16, c18, c20, c22, c24;
606*e1eccf28SAndroid Build Coastguard Worker __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
607*e1eccf28SAndroid Build Coastguard Worker __m128i p0, p1, p2, p3, p4, p5, p6, p7;
608*e1eccf28SAndroid Build Coastguard Worker __m128i p8, p9, p10, p11, p12, p13, p14, p15;
609*e1eccf28SAndroid Build Coastguard Worker __m128i p16, p17, p18, p19, p20, p21, p22, p23;
610*e1eccf28SAndroid Build Coastguard Worker __m128i p24, p25, p26, p27, p28, p29, p30, p31;
611*e1eccf28SAndroid Build Coastguard Worker __m128i p32, p33, p34, p35, p36, p37, p38, p39;
612*e1eccf28SAndroid Build Coastguard Worker __m128i o0, o1, o2, o3;
613*e1eccf28SAndroid Build Coastguard Worker uint32_t i;
614*e1eccf28SAndroid Build Coastguard Worker
615*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+0));
616*e1eccf28SAndroid Build Coastguard Worker c0 = _mm_shuffle_epi32(x, 0x00);
617*e1eccf28SAndroid Build Coastguard Worker c2 = _mm_shuffle_epi32(x, 0x55);
618*e1eccf28SAndroid Build Coastguard Worker
619*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+4));
620*e1eccf28SAndroid Build Coastguard Worker c4 = _mm_shuffle_epi32(x, 0x00);
621*e1eccf28SAndroid Build Coastguard Worker c6 = _mm_shuffle_epi32(x, 0x55);
622*e1eccf28SAndroid Build Coastguard Worker
623*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+8));
624*e1eccf28SAndroid Build Coastguard Worker c8 = _mm_shuffle_epi32(x, 0x00);
625*e1eccf28SAndroid Build Coastguard Worker c10 = _mm_shuffle_epi32(x, 0x55);
626*e1eccf28SAndroid Build Coastguard Worker
627*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+12));
628*e1eccf28SAndroid Build Coastguard Worker c12 = _mm_shuffle_epi32(x, 0x00);
629*e1eccf28SAndroid Build Coastguard Worker c14 = _mm_shuffle_epi32(x, 0x55);
630*e1eccf28SAndroid Build Coastguard Worker
631*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+16));
632*e1eccf28SAndroid Build Coastguard Worker c16 = _mm_shuffle_epi32(x, 0x00);
633*e1eccf28SAndroid Build Coastguard Worker c18 = _mm_shuffle_epi32(x, 0x55);
634*e1eccf28SAndroid Build Coastguard Worker
635*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+20));
636*e1eccf28SAndroid Build Coastguard Worker c20 = _mm_shuffle_epi32(x, 0x00);
637*e1eccf28SAndroid Build Coastguard Worker c22 = _mm_shuffle_epi32(x, 0x55);
638*e1eccf28SAndroid Build Coastguard Worker
639*e1eccf28SAndroid Build Coastguard Worker x = _mm_loadl_epi64((const __m128i *)(coef+24));
640*e1eccf28SAndroid Build Coastguard Worker c24 = _mm_shuffle_epi32(x, 0x00);
641*e1eccf28SAndroid Build Coastguard Worker
642*e1eccf28SAndroid Build Coastguard Worker for (i = 0; i < count; ++i) {
643*e1eccf28SAndroid Build Coastguard Worker
644*e1eccf28SAndroid Build Coastguard Worker p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
645*e1eccf28SAndroid Build Coastguard Worker p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
646*e1eccf28SAndroid Build Coastguard Worker p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
647*e1eccf28SAndroid Build Coastguard Worker p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
648*e1eccf28SAndroid Build Coastguard Worker p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
649*e1eccf28SAndroid Build Coastguard Worker p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
650*e1eccf28SAndroid Build Coastguard Worker p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
651*e1eccf28SAndroid Build Coastguard Worker p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
652*e1eccf28SAndroid Build Coastguard Worker
653*e1eccf28SAndroid Build Coastguard Worker p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
654*e1eccf28SAndroid Build Coastguard Worker p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
655*e1eccf28SAndroid Build Coastguard Worker p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
656*e1eccf28SAndroid Build Coastguard Worker p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
657*e1eccf28SAndroid Build Coastguard Worker p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
658*e1eccf28SAndroid Build Coastguard Worker p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
659*e1eccf28SAndroid Build Coastguard Worker p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
660*e1eccf28SAndroid Build Coastguard Worker p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
661*e1eccf28SAndroid Build Coastguard Worker
662*e1eccf28SAndroid Build Coastguard Worker p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
663*e1eccf28SAndroid Build Coastguard Worker p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
664*e1eccf28SAndroid Build Coastguard Worker p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
665*e1eccf28SAndroid Build Coastguard Worker p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
666*e1eccf28SAndroid Build Coastguard Worker p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
667*e1eccf28SAndroid Build Coastguard Worker p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
668*e1eccf28SAndroid Build Coastguard Worker p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
669*e1eccf28SAndroid Build Coastguard Worker p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
670*e1eccf28SAndroid Build Coastguard Worker
671*e1eccf28SAndroid Build Coastguard Worker p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
672*e1eccf28SAndroid Build Coastguard Worker p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
673*e1eccf28SAndroid Build Coastguard Worker p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
674*e1eccf28SAndroid Build Coastguard Worker p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
675*e1eccf28SAndroid Build Coastguard Worker p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
676*e1eccf28SAndroid Build Coastguard Worker p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
677*e1eccf28SAndroid Build Coastguard Worker p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
678*e1eccf28SAndroid Build Coastguard Worker p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
679*e1eccf28SAndroid Build Coastguard Worker
680*e1eccf28SAndroid Build Coastguard Worker p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
681*e1eccf28SAndroid Build Coastguard Worker p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
682*e1eccf28SAndroid Build Coastguard Worker p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
683*e1eccf28SAndroid Build Coastguard Worker p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
684*e1eccf28SAndroid Build Coastguard Worker p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
685*e1eccf28SAndroid Build Coastguard Worker p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
686*e1eccf28SAndroid Build Coastguard Worker p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
687*e1eccf28SAndroid Build Coastguard Worker p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
688*e1eccf28SAndroid Build Coastguard Worker
689*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0);
690*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2));
691*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4));
692*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10), c6));
693*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8));
694*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
695*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
696*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
697*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
698*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
699*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
700*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
701*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
702*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_srai_epi32(o0, 8);
703*e1eccf28SAndroid Build Coastguard Worker
704*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0);
705*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c2));
706*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4));
707*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11), c6));
708*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13), c8));
709*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
710*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
711*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
712*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
713*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
714*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
715*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
716*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
717*e1eccf28SAndroid Build Coastguard Worker o1 = _mm_srai_epi32(o1, 8);
718*e1eccf28SAndroid Build Coastguard Worker
719*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3), c0);
720*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5), c2));
721*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4));
722*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6));
723*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8));
724*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
725*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
726*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
727*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
728*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
729*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
730*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
731*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
732*e1eccf28SAndroid Build Coastguard Worker o2 = _mm_srai_epi32(o2, 8);
733*e1eccf28SAndroid Build Coastguard Worker
734*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c0);
735*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2));
736*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4));
737*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6));
738*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8));
739*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
740*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
741*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
742*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
743*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
744*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
745*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
746*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
747*e1eccf28SAndroid Build Coastguard Worker o3 = _mm_srai_epi32(o3, 8);
748*e1eccf28SAndroid Build Coastguard Worker
749*e1eccf28SAndroid Build Coastguard Worker o0 = packus_epi32(o0, o1);
750*e1eccf28SAndroid Build Coastguard Worker o2 = packus_epi32(o2, o3);
751*e1eccf28SAndroid Build Coastguard Worker o0 = _mm_packus_epi16(o0, o2);
752*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)dst, o0);
753*e1eccf28SAndroid Build Coastguard Worker
754*e1eccf28SAndroid Build Coastguard Worker y0 = (const char *)y0 + 16;
755*e1eccf28SAndroid Build Coastguard Worker y1 = (const char *)y1 + 16;
756*e1eccf28SAndroid Build Coastguard Worker y2 = (const char *)y2 + 16;
757*e1eccf28SAndroid Build Coastguard Worker y3 = (const char *)y3 + 16;
758*e1eccf28SAndroid Build Coastguard Worker y4 = (const char *)y4 + 16;
759*e1eccf28SAndroid Build Coastguard Worker dst = (char *)dst + 16;
760*e1eccf28SAndroid Build Coastguard Worker }
761*e1eccf28SAndroid Build Coastguard Worker }
762*e1eccf28SAndroid Build Coastguard Worker
/*
 * Source-over blend of two pixels held as eight zero-extended 16-bit
 * channels: returns ((dstPx * (255 - srcAlpha)) >> 8) + srcPx per channel,
 * where srcAlpha is the fourth channel of the corresponding source pixel.
 * k255 must hold 255 in every 16-bit lane.
 */
static inline __m128i blendSrcOverPair_K(__m128i srcPx, __m128i dstPx,
                                         __m128i k255) {
    /* Broadcast lane 3 over lanes 0-3 and lane 7 over lanes 4-7. */
    __m128i alpha = _mm_shufflelo_epi16(srcPx, 0xFF);
    alpha = _mm_shufflehi_epi16(alpha, 0xFF);

    __m128i scaled = _mm_mullo_epi16(dstPx, _mm_sub_epi16(k255, alpha));
    scaled = _mm_srli_epi16(scaled, 8);
    return _mm_add_epi16(scaled, srcPx);
}

/*
 * In-place source-over blend of `src` onto `dst` for 4-byte pixels whose
 * fourth byte is alpha.
 *
 * dst    - destination pixels, read and overwritten (unaligned access).
 * src    - source pixels (unaligned access).
 * count8 - number of 8-pixel (32-byte) groups to process.
 *
 * Per channel: dst = saturate_u8(((dst * (255 - srcAlpha)) >> 8) + src).
 */
void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    const __m128i k255 = _mm_set1_epi16(255);
    const __m128i zero = _mm_setzero_si128();
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        const __m128i in0  = _mm_loadu_si128((const __m128i *)src);
        const __m128i in1  = _mm_loadu_si128((const __m128i *)src + 1);
        const __m128i out0 = _mm_loadu_si128((const __m128i *)dst);
        const __m128i out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        /* Widen bytes to 16 bits, two pixels per register, and blend. */
        const __m128i t0 = blendSrcOverPair_K(_mm_unpacklo_epi8(in0, zero),
                                              _mm_unpacklo_epi8(out0, zero), k255);
        const __m128i t1 = blendSrcOverPair_K(_mm_unpackhi_epi8(in0, zero),
                                              _mm_unpackhi_epi8(out0, zero), k255);
        const __m128i t2 = blendSrcOverPair_K(_mm_unpacklo_epi8(in1, zero),
                                              _mm_unpacklo_epi8(out1, zero), k255);
        const __m128i t3 = blendSrcOverPair_K(_mm_unpackhi_epi8(in1, zero),
                                              _mm_unpackhi_epi8(out1, zero), k255);

        /* Narrow back to bytes with unsigned saturation. */
        _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(t0, t1));
        _mm_storeu_si128((__m128i *)dst + 1, _mm_packus_epi16(t2, t3));

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
818*e1eccf28SAndroid Build Coastguard Worker
/*
 * One half-register step of the DstOver blend.
 *
 * srcHalf / dstHalf each hold two 4-byte pixels widened to eight 16-bit lanes.
 * Per lane the result is  dst + ((src * (255 - dstA)) >> 8),  where dstA is the
 * fourth channel of the destination pixel that lane belongs to.
 */
static inline __m128i blendDstOverHalf(__m128i srcHalf, __m128i dstHalf, __m128i c255) {
    /* 0xFF replicates lane 3 over lanes 0-3 (lo) and lane 7 over lanes 4-7 (hi). */
    const __m128i dstA = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dstHalf, 0xFF), 0xFF);
    const __m128i scaledSrc =
            _mm_srli_epi16(_mm_mullo_epi16(srcHalf, _mm_sub_epi16(c255, dstA)), 8);
    return _mm_add_epi16(scaledSrc, dstHalf);
}

/*
 * DstOver blend: dst = dst + ((src * (255 - dst.a)) >> 8), per byte channel,
 * with the sum clamped to 255 by the unsigned-saturating pack.
 *
 * dst    - destination pixels, read and overwritten in place.
 * src    - source pixels, read only.
 * count8 - number of loop iterations; each one consumes 32 bytes (eight
 *          4-byte pixels) from both buffers. Zero leaves dst untouched.
 *
 * Unaligned loads and stores are used throughout. The channel in byte 3 of
 * every pixel is the one used as alpha.
 */
void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i c255 = _mm_set1_epi16(255);
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n, srcVec += 2, dstVec += 2) {
        /* All four vectors are read before anything is written back. */
        const __m128i s0 = _mm_loadu_si128(srcVec);
        const __m128i s1 = _mm_loadu_si128(srcVec + 1);
        const __m128i d0 = _mm_loadu_si128(dstVec);
        const __m128i d1 = _mm_loadu_si128(dstVec + 1);

        /* Widen bytes to 16 bits (zero-extended) and blend two pixels at a time. */
        const __m128i r0 = blendDstOverHalf(_mm_unpacklo_epi8(s0, zero),
                                            _mm_unpacklo_epi8(d0, zero), c255);
        const __m128i r1 = blendDstOverHalf(_mm_unpackhi_epi8(s0, zero),
                                            _mm_unpackhi_epi8(d0, zero), c255);
        const __m128i r2 = blendDstOverHalf(_mm_unpacklo_epi8(s1, zero),
                                            _mm_unpacklo_epi8(d1, zero), c255);
        const __m128i r3 = blendDstOverHalf(_mm_unpackhi_epi8(s1, zero),
                                            _mm_unpackhi_epi8(d1, zero), c255);

        /* Narrow back to bytes with unsigned saturation and store. */
        _mm_storeu_si128(dstVec, _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dstVec + 1, _mm_packus_epi16(r2, r3));
    }
}
875*e1eccf28SAndroid Build Coastguard Worker
/*
 * One half-register step of the SrcIn blend.
 *
 * srcHalf / dstHalf each hold two 4-byte pixels widened to eight 16-bit lanes.
 * Per lane the result is  (src * dstA) >> 8,  where dstA is the fourth channel
 * of the destination pixel that lane belongs to.
 */
static inline __m128i blendSrcInHalf(__m128i srcHalf, __m128i dstHalf) {
    /* 0xFF replicates lane 3 over lanes 0-3 (lo) and lane 7 over lanes 4-7 (hi). */
    const __m128i dstA = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dstHalf, 0xFF), 0xFF);
    return _mm_srli_epi16(_mm_mullo_epi16(srcHalf, dstA), 8);
}

/*
 * SrcIn blend: dst = (src * dst.a) >> 8, per byte channel.
 *
 * dst    - destination pixels, read (only byte 3 of each pixel affects the
 *          result) and overwritten in place.
 * src    - source pixels, read only.
 * count8 - number of loop iterations; each one consumes 32 bytes (eight
 *          4-byte pixels) from both buffers. Zero leaves dst untouched.
 *
 * Unaligned loads and stores are used throughout.
 */
void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n, srcVec += 2, dstVec += 2) {
        /* All four vectors are read before anything is written back. */
        const __m128i s0 = _mm_loadu_si128(srcVec);
        const __m128i s1 = _mm_loadu_si128(srcVec + 1);
        const __m128i d0 = _mm_loadu_si128(dstVec);
        const __m128i d1 = _mm_loadu_si128(dstVec + 1);

        /* Widen bytes to 16 bits (zero-extended) and blend two pixels at a time. */
        const __m128i r0 = blendSrcInHalf(_mm_unpacklo_epi8(s0, zero),
                                          _mm_unpacklo_epi8(d0, zero));
        const __m128i r1 = blendSrcInHalf(_mm_unpackhi_epi8(s0, zero),
                                          _mm_unpackhi_epi8(d0, zero));
        const __m128i r2 = blendSrcInHalf(_mm_unpacklo_epi8(s1, zero),
                                          _mm_unpacklo_epi8(d1, zero));
        const __m128i r3 = blendSrcInHalf(_mm_unpackhi_epi8(s1, zero),
                                          _mm_unpackhi_epi8(d1, zero));

        /* Narrow back to bytes and store. */
        _mm_storeu_si128(dstVec, _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dstVec + 1, _mm_packus_epi16(r2, r3));
    }
}
925*e1eccf28SAndroid Build Coastguard Worker
/*
 * One half-register step of the DstIn blend.
 *
 * srcHalf / dstHalf each hold two 4-byte pixels widened to eight 16-bit lanes.
 * Per lane the result is  (dst * srcA) >> 8,  where srcA is the fourth channel
 * of the source pixel that lane belongs to.
 */
static inline __m128i blendDstInHalf(__m128i srcHalf, __m128i dstHalf) {
    /* 0xFF replicates lane 3 over lanes 0-3 (lo) and lane 7 over lanes 4-7 (hi). */
    const __m128i srcA = _mm_shufflehi_epi16(_mm_shufflelo_epi16(srcHalf, 0xFF), 0xFF);
    return _mm_srli_epi16(_mm_mullo_epi16(dstHalf, srcA), 8);
}

/*
 * DstIn blend: dst = (dst * src.a) >> 8, per byte channel.
 *
 * dst    - destination pixels, read and overwritten in place.
 * src    - source pixels, read only; only byte 3 of each pixel affects the
 *          result.
 * count8 - number of loop iterations; each one consumes 32 bytes (eight
 *          4-byte pixels) from both buffers. Zero leaves dst untouched.
 *
 * Unaligned loads and stores are used throughout.
 */
void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n, srcVec += 2, dstVec += 2) {
        /* All four vectors are read before anything is written back. */
        const __m128i s0 = _mm_loadu_si128(srcVec);
        const __m128i s1 = _mm_loadu_si128(srcVec + 1);
        const __m128i d0 = _mm_loadu_si128(dstVec);
        const __m128i d1 = _mm_loadu_si128(dstVec + 1);

        /* Widen bytes to 16 bits (zero-extended) and blend two pixels at a time. */
        const __m128i r0 = blendDstInHalf(_mm_unpacklo_epi8(s0, zero),
                                          _mm_unpacklo_epi8(d0, zero));
        const __m128i r1 = blendDstInHalf(_mm_unpackhi_epi8(s0, zero),
                                          _mm_unpackhi_epi8(d0, zero));
        const __m128i r2 = blendDstInHalf(_mm_unpacklo_epi8(s1, zero),
                                          _mm_unpacklo_epi8(d1, zero));
        const __m128i r3 = blendDstInHalf(_mm_unpackhi_epi8(s1, zero),
                                          _mm_unpackhi_epi8(d1, zero));

        /* Narrow back to bytes and store. */
        _mm_storeu_si128(dstVec, _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dstVec + 1, _mm_packus_epi16(r2, r3));
    }
}
975*e1eccf28SAndroid Build Coastguard Worker
/*
 * One half-register step of the SrcOut blend.
 *
 * srcHalf / dstHalf each hold two 4-byte pixels widened to eight 16-bit lanes.
 * Per lane the result is  (src * (255 - dstA)) >> 8,  where dstA is the fourth
 * channel of the destination pixel that lane belongs to.
 */
static inline __m128i blendSrcOutHalf(__m128i srcHalf, __m128i dstHalf, __m128i c255) {
    /* 0xFF replicates lane 3 over lanes 0-3 (lo) and lane 7 over lanes 4-7 (hi). */
    const __m128i dstA = _mm_shufflehi_epi16(_mm_shufflelo_epi16(dstHalf, 0xFF), 0xFF);
    return _mm_srli_epi16(_mm_mullo_epi16(srcHalf, _mm_sub_epi16(c255, dstA)), 8);
}

/*
 * SrcOut blend: dst = (src * (255 - dst.a)) >> 8, per byte channel.
 *
 * dst    - destination pixels, read (only byte 3 of each pixel affects the
 *          result) and overwritten in place.
 * src    - source pixels, read only.
 * count8 - number of loop iterations; each one consumes 32 bytes (eight
 *          4-byte pixels) from both buffers. Zero leaves dst untouched.
 *
 * Unaligned loads and stores are used throughout.
 */
void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i c255 = _mm_set1_epi16(255);
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n, srcVec += 2, dstVec += 2) {
        /* All four vectors are read before anything is written back. */
        const __m128i s0 = _mm_loadu_si128(srcVec);
        const __m128i s1 = _mm_loadu_si128(srcVec + 1);
        const __m128i d0 = _mm_loadu_si128(dstVec);
        const __m128i d1 = _mm_loadu_si128(dstVec + 1);

        /* Widen bytes to 16 bits (zero-extended) and blend two pixels at a time. */
        const __m128i r0 = blendSrcOutHalf(_mm_unpacklo_epi8(s0, zero),
                                           _mm_unpacklo_epi8(d0, zero), c255);
        const __m128i r1 = blendSrcOutHalf(_mm_unpackhi_epi8(s0, zero),
                                           _mm_unpackhi_epi8(d0, zero), c255);
        const __m128i r2 = blendSrcOutHalf(_mm_unpacklo_epi8(s1, zero),
                                           _mm_unpacklo_epi8(d1, zero), c255);
        const __m128i r3 = blendSrcOutHalf(_mm_unpackhi_epi8(s1, zero),
                                           _mm_unpackhi_epi8(d1, zero), c255);

        /* Narrow back to bytes and store. */
        _mm_storeu_si128(dstVec, _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dstVec + 1, _mm_packus_epi16(r2, r3));
    }
}
1027*e1eccf28SAndroid Build Coastguard Worker
/*
 * One half-register step of the DstOut blend.
 *
 * srcHalf / dstHalf each hold two 4-byte pixels widened to eight 16-bit lanes.
 * Per lane the result is  (dst * (255 - srcA)) >> 8,  where srcA is the fourth
 * channel of the source pixel that lane belongs to.
 */
static inline __m128i blendDstOutHalf(__m128i srcHalf, __m128i dstHalf, __m128i c255) {
    /* 0xFF replicates lane 3 over lanes 0-3 (lo) and lane 7 over lanes 4-7 (hi). */
    const __m128i srcA = _mm_shufflehi_epi16(_mm_shufflelo_epi16(srcHalf, 0xFF), 0xFF);
    return _mm_srli_epi16(_mm_mullo_epi16(dstHalf, _mm_sub_epi16(c255, srcA)), 8);
}

/*
 * DstOut blend: dst = (dst * (255 - src.a)) >> 8, per byte channel.
 *
 * dst    - destination pixels, read and overwritten in place.
 * src    - source pixels, read only; only byte 3 of each pixel affects the
 *          result.
 * count8 - number of loop iterations; each one consumes 32 bytes (eight
 *          4-byte pixels) from both buffers. Zero leaves dst untouched.
 *
 * Unaligned loads and stores are used throughout.
 */
void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i c255 = _mm_set1_epi16(255);
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;

    for (uint32_t n = 0; n < count8; ++n, srcVec += 2, dstVec += 2) {
        /* All four vectors are read before anything is written back. */
        const __m128i s0 = _mm_loadu_si128(srcVec);
        const __m128i s1 = _mm_loadu_si128(srcVec + 1);
        const __m128i d0 = _mm_loadu_si128(dstVec);
        const __m128i d1 = _mm_loadu_si128(dstVec + 1);

        /* Widen bytes to 16 bits (zero-extended) and blend two pixels at a time. */
        const __m128i r0 = blendDstOutHalf(_mm_unpacklo_epi8(s0, zero),
                                           _mm_unpacklo_epi8(d0, zero), c255);
        const __m128i r1 = blendDstOutHalf(_mm_unpackhi_epi8(s0, zero),
                                           _mm_unpackhi_epi8(d0, zero), c255);
        const __m128i r2 = blendDstOutHalf(_mm_unpacklo_epi8(s1, zero),
                                           _mm_unpacklo_epi8(d1, zero), c255);
        const __m128i r3 = blendDstOutHalf(_mm_unpackhi_epi8(s1, zero),
                                           _mm_unpackhi_epi8(d1, zero), c255);

        /* Narrow back to bytes and store. */
        _mm_storeu_si128(dstVec, _mm_packus_epi16(r0, r1));
        _mm_storeu_si128(dstVec + 1, _mm_packus_epi16(r2, r3));
    }
}
1079*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicBlendSrcAtop_K(void * dst,const void * src,uint32_t count8)1080*e1eccf28SAndroid Build Coastguard Worker void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
1081*e1eccf28SAndroid Build Coastguard Worker const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1082*e1eccf28SAndroid Build Coastguard Worker __m128i all1s, ina, outa, ins, outs;
1083*e1eccf28SAndroid Build Coastguard Worker __m128i in0, in1, out0, out1;
1084*e1eccf28SAndroid Build Coastguard Worker __m128i t0, t1, t2, t3;
1085*e1eccf28SAndroid Build Coastguard Worker uint32_t i;
1086*e1eccf28SAndroid Build Coastguard Worker
1087*e1eccf28SAndroid Build Coastguard Worker all1s = _mm_set1_epi16(255);
1088*e1eccf28SAndroid Build Coastguard Worker
1089*e1eccf28SAndroid Build Coastguard Worker for (i = 0; i < count8; ++i) {
1090*e1eccf28SAndroid Build Coastguard Worker in0 = _mm_loadu_si128((const __m128i *)src);
1091*e1eccf28SAndroid Build Coastguard Worker in1 = _mm_loadu_si128((const __m128i *)src + 1);
1092*e1eccf28SAndroid Build Coastguard Worker out0 = _mm_loadu_si128((const __m128i *)dst);
1093*e1eccf28SAndroid Build Coastguard Worker out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1094*e1eccf28SAndroid Build Coastguard Worker
1095*e1eccf28SAndroid Build Coastguard Worker ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1096*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflelo_epi16(ins, 0xFF);
1097*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflehi_epi16(ina, 0xFF);
1098*e1eccf28SAndroid Build Coastguard Worker outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1099*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflelo_epi16(outs, 0xFF);
1100*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflehi_epi16(outa, 0xFF);
1101*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_sub_epi16(all1s, ina);
1102*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_mullo_epi16(t0, outs);
1103*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
1104*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_srli_epi16(t0, 8);
1105*e1eccf28SAndroid Build Coastguard Worker
1106*e1eccf28SAndroid Build Coastguard Worker ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1107*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflelo_epi16(ins, 0xFF);
1108*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflehi_epi16(ina, 0xFF);
1109*e1eccf28SAndroid Build Coastguard Worker outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1110*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflelo_epi16(outs, 0xFF);
1111*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflehi_epi16(outa, 0xFF);
1112*e1eccf28SAndroid Build Coastguard Worker t1 = _mm_sub_epi16(all1s, ina);
1113*e1eccf28SAndroid Build Coastguard Worker t1 = _mm_mullo_epi16(t1, outs);
1114*e1eccf28SAndroid Build Coastguard Worker t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
1115*e1eccf28SAndroid Build Coastguard Worker t1 = _mm_srli_epi16(t1, 8);
1116*e1eccf28SAndroid Build Coastguard Worker
1117*e1eccf28SAndroid Build Coastguard Worker ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1118*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflelo_epi16(ins, 0xFF);
1119*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflehi_epi16(ina, 0xFF);
1120*e1eccf28SAndroid Build Coastguard Worker outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1121*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflelo_epi16(outs, 0xFF);
1122*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflehi_epi16(outa, 0xFF);
1123*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_sub_epi16(all1s, ina);
1124*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_mullo_epi16(t2, outs);
1125*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
1126*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_srli_epi16(t2, 8);
1127*e1eccf28SAndroid Build Coastguard Worker
1128*e1eccf28SAndroid Build Coastguard Worker ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1129*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflelo_epi16(ins, 0xFF);
1130*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflehi_epi16(ina, 0xFF);
1131*e1eccf28SAndroid Build Coastguard Worker outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1132*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflelo_epi16(outs, 0xFF);
1133*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflehi_epi16(outa, 0xFF);
1134*e1eccf28SAndroid Build Coastguard Worker t3 = _mm_sub_epi16(all1s, ina);
1135*e1eccf28SAndroid Build Coastguard Worker t3 = _mm_mullo_epi16(t3, outs);
1136*e1eccf28SAndroid Build Coastguard Worker t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
1137*e1eccf28SAndroid Build Coastguard Worker t3 = _mm_srli_epi16(t3, 8);
1138*e1eccf28SAndroid Build Coastguard Worker
1139*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_packus_epi16(t0, t1);
1140*e1eccf28SAndroid Build Coastguard Worker t0 = blendv_epi8(t0, out0, M0001);
1141*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_packus_epi16(t2, t3);
1142*e1eccf28SAndroid Build Coastguard Worker t2 = blendv_epi8(t2, out1, M0001);
1143*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)dst, t0);
1144*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)dst + 1, t2);
1145*e1eccf28SAndroid Build Coastguard Worker
1146*e1eccf28SAndroid Build Coastguard Worker src = (const __m128i *)src + 2;
1147*e1eccf28SAndroid Build Coastguard Worker dst = (__m128i *)dst + 2;
1148*e1eccf28SAndroid Build Coastguard Worker }
1149*e1eccf28SAndroid Build Coastguard Worker }
1150*e1eccf28SAndroid Build Coastguard Worker
rsdIntrinsicBlendDstAtop_K(void * dst,const void * src,uint32_t count8)1151*e1eccf28SAndroid Build Coastguard Worker void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
1152*e1eccf28SAndroid Build Coastguard Worker const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1153*e1eccf28SAndroid Build Coastguard Worker __m128i all1s, ina, ins, outa, outs;
1154*e1eccf28SAndroid Build Coastguard Worker __m128i in0, in1, out0, out1;
1155*e1eccf28SAndroid Build Coastguard Worker __m128i t0, t1, t2, t3;
1156*e1eccf28SAndroid Build Coastguard Worker uint32_t i;
1157*e1eccf28SAndroid Build Coastguard Worker
1158*e1eccf28SAndroid Build Coastguard Worker all1s = _mm_set1_epi16(255);
1159*e1eccf28SAndroid Build Coastguard Worker
1160*e1eccf28SAndroid Build Coastguard Worker for (i = 0; i < count8; ++i) {
1161*e1eccf28SAndroid Build Coastguard Worker in0 = _mm_loadu_si128((const __m128i *)src);
1162*e1eccf28SAndroid Build Coastguard Worker in1 = _mm_loadu_si128((const __m128i *)src + 1);
1163*e1eccf28SAndroid Build Coastguard Worker out0 = _mm_loadu_si128((const __m128i *)dst);
1164*e1eccf28SAndroid Build Coastguard Worker out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1165*e1eccf28SAndroid Build Coastguard Worker
1166*e1eccf28SAndroid Build Coastguard Worker ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1167*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflelo_epi16(ins, 0xFF);
1168*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflehi_epi16(ina, 0xFF);
1169*e1eccf28SAndroid Build Coastguard Worker outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1170*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflelo_epi16(outs, 0xFF);
1171*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflehi_epi16(outa, 0xFF);
1172*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_sub_epi16(all1s, outa);
1173*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_mullo_epi16(t0, ins);
1174*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
1175*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_srli_epi16(t0, 8);
1176*e1eccf28SAndroid Build Coastguard Worker
1177*e1eccf28SAndroid Build Coastguard Worker ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1178*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflelo_epi16(ins, 0xFF);
1179*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflehi_epi16(ina, 0xFF);
1180*e1eccf28SAndroid Build Coastguard Worker outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1181*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflelo_epi16(outs, 0xFF);
1182*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflehi_epi16(outa, 0xFF);
1183*e1eccf28SAndroid Build Coastguard Worker t1 = _mm_sub_epi16(all1s, outa);
1184*e1eccf28SAndroid Build Coastguard Worker t1 = _mm_mullo_epi16(t1, ins);
1185*e1eccf28SAndroid Build Coastguard Worker t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
1186*e1eccf28SAndroid Build Coastguard Worker t1 = _mm_srli_epi16(t1, 8);
1187*e1eccf28SAndroid Build Coastguard Worker
1188*e1eccf28SAndroid Build Coastguard Worker ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1189*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflelo_epi16(ins, 0xFF);
1190*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflehi_epi16(ina, 0xFF);
1191*e1eccf28SAndroid Build Coastguard Worker outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1192*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflelo_epi16(outs, 0xFF);
1193*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflehi_epi16(outa, 0xFF);
1194*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_sub_epi16(all1s, outa);
1195*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_mullo_epi16(t2, ins);
1196*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
1197*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_srli_epi16(t2, 8);
1198*e1eccf28SAndroid Build Coastguard Worker
1199*e1eccf28SAndroid Build Coastguard Worker ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1200*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflelo_epi16(ins, 0xFF);
1201*e1eccf28SAndroid Build Coastguard Worker ina = _mm_shufflehi_epi16(ina, 0xFF);
1202*e1eccf28SAndroid Build Coastguard Worker outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1203*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflelo_epi16(outs, 0xFF);
1204*e1eccf28SAndroid Build Coastguard Worker outa = _mm_shufflehi_epi16(outa, 0xFF);
1205*e1eccf28SAndroid Build Coastguard Worker t3 = _mm_sub_epi16(all1s, outa);
1206*e1eccf28SAndroid Build Coastguard Worker t3 = _mm_mullo_epi16(t3, ins);
1207*e1eccf28SAndroid Build Coastguard Worker t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
1208*e1eccf28SAndroid Build Coastguard Worker t3 = _mm_srli_epi16(t3, 8);
1209*e1eccf28SAndroid Build Coastguard Worker
1210*e1eccf28SAndroid Build Coastguard Worker t0 = _mm_packus_epi16(t0, t1);
1211*e1eccf28SAndroid Build Coastguard Worker t0 = blendv_epi8(t0, in0, M0001);
1212*e1eccf28SAndroid Build Coastguard Worker t2 = _mm_packus_epi16(t2, t3);
1213*e1eccf28SAndroid Build Coastguard Worker t2 = blendv_epi8(t2, in1, M0001);
1214*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)dst, t0);
1215*e1eccf28SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)dst + 1, t2);
1216*e1eccf28SAndroid Build Coastguard Worker
1217*e1eccf28SAndroid Build Coastguard Worker src = (const __m128i *)src + 2;
1218*e1eccf28SAndroid Build Coastguard Worker dst = (__m128i *)dst + 2;
1219*e1eccf28SAndroid Build Coastguard Worker }
1220*e1eccf28SAndroid Build Coastguard Worker }
1221*e1eccf28SAndroid Build Coastguard Worker
/*
 * XOR blend: every byte of dst is replaced by (dst ^ src).
 *
 * dst    - buffer that is read and then overwritten in place.
 * src    - buffer that is only read.
 * count8 - number of iterations; each one processes two unaligned
 *          128-bit vectors (32 bytes) from each buffer. Zero is a no-op.
 */
void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;
    uint32_t remaining;

    for (remaining = count8; remaining != 0; --remaining) {
        /* All four loads are issued before either store. */
        const __m128i srcLo = _mm_loadu_si128(srcVec);
        const __m128i srcHi = _mm_loadu_si128(srcVec + 1);
        const __m128i dstLo = _mm_loadu_si128(dstVec);
        const __m128i dstHi = _mm_loadu_si128(dstVec + 1);

        _mm_storeu_si128(dstVec, _mm_xor_si128(dstLo, srcLo));
        _mm_storeu_si128(dstVec + 1, _mm_xor_si128(dstHi, srcHi));

        srcVec += 2;
        dstVec += 2;
    }
}
1242*e1eccf28SAndroid Build Coastguard Worker
/*
 * Multiply blend: every byte of dst is replaced by (src * dst) >> 8.
 *
 * dst    - buffer that is read and then overwritten in place.
 * src    - buffer that is only read.
 * count8 - number of iterations; each one processes two unaligned
 *          128-bit vectors (32 bytes) from each buffer. Zero is a no-op.
 */
void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;
    uint32_t remaining;

    for (remaining = count8; remaining != 0; --remaining) {
        /* All four loads are issued before either store. */
        const __m128i srcA = _mm_loadu_si128(srcVec);
        const __m128i srcB = _mm_loadu_si128(srcVec + 1);
        const __m128i dstA = _mm_loadu_si128(dstVec);
        const __m128i dstB = _mm_loadu_si128(dstVec + 1);

        /*
         * Interleaving with zero widens each byte to a 16-bit lane. The
         * largest product, 255 * 255, still fits in 16 bits, so the low
         * half returned by _mm_mullo_epi16 is the full product.
         */
        const __m128i prodALo = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpacklo_epi8(srcA, zero),
                                _mm_unpacklo_epi8(dstA, zero)), 8);
        const __m128i prodAHi = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpackhi_epi8(srcA, zero),
                                _mm_unpackhi_epi8(dstA, zero)), 8);
        const __m128i prodBLo = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpacklo_epi8(srcB, zero),
                                _mm_unpacklo_epi8(dstB, zero)), 8);
        const __m128i prodBHi = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpackhi_epi8(srcB, zero),
                                _mm_unpackhi_epi8(dstB, zero)), 8);

        /* Narrow the 16-bit results back to bytes and write them out. */
        _mm_storeu_si128(dstVec, _mm_packus_epi16(prodALo, prodAHi));
        _mm_storeu_si128(dstVec + 1, _mm_packus_epi16(prodBLo, prodBHi));

        srcVec += 2;
        dstVec += 2;
    }
}
1279*e1eccf28SAndroid Build Coastguard Worker
/*
 * Additive blend: every byte of dst is replaced by the unsigned
 * saturating sum dst + src (results above 255 are clamped to 255).
 *
 * dst    - buffer that is read and then overwritten in place.
 * src    - buffer that is only read.
 * count8 - number of iterations; each one processes two unaligned
 *          128-bit vectors (32 bytes) from each buffer. Zero is a no-op.
 */
void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;
    uint32_t remaining;

    for (remaining = count8; remaining != 0; --remaining) {
        /* All four loads are issued before either store. */
        const __m128i srcLo = _mm_loadu_si128(srcVec);
        const __m128i srcHi = _mm_loadu_si128(srcVec + 1);
        const __m128i dstLo = _mm_loadu_si128(dstVec);
        const __m128i dstHi = _mm_loadu_si128(dstVec + 1);

        _mm_storeu_si128(dstVec, _mm_adds_epu8(dstLo, srcLo));
        _mm_storeu_si128(dstVec + 1, _mm_adds_epu8(dstHi, srcHi));

        srcVec += 2;
        dstVec += 2;
    }
}
1300*e1eccf28SAndroid Build Coastguard Worker
/*
 * Subtractive blend: every byte of dst is replaced by the unsigned
 * saturating difference dst - src (results below 0 are clamped to 0).
 *
 * dst    - buffer that is read and then overwritten in place.
 * src    - buffer that is only read.
 * count8 - number of iterations; each one processes two unaligned
 *          128-bit vectors (32 bytes) from each buffer. Zero is a no-op.
 */
void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;
    uint32_t remaining;

    for (remaining = count8; remaining != 0; --remaining) {
        /* All four loads are issued before either store. */
        const __m128i srcLo = _mm_loadu_si128(srcVec);
        const __m128i srcHi = _mm_loadu_si128(srcVec + 1);
        const __m128i dstLo = _mm_loadu_si128(dstVec);
        const __m128i dstHi = _mm_loadu_si128(dstVec + 1);

        /* Operand order matters: dst is the minuend, src the subtrahend. */
        _mm_storeu_si128(dstVec, _mm_subs_epu8(dstLo, srcLo));
        _mm_storeu_si128(dstVec + 1, _mm_subs_epu8(dstHi, srcHi));

        srcVec += 2;
        dstVec += 2;
    }
}
1321*e1eccf28SAndroid Build Coastguard Worker
} // namespace renderscript
} // namespace android
1324