xref: /aosp_15_r20/frameworks/rs/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp (revision e1eccf28f96817838ad6867f7f39d2351ec11f56)
1*e1eccf28SAndroid Build Coastguard Worker /*
2*e1eccf28SAndroid Build Coastguard Worker  * Copyright (C) 2012 The Android Open Source Project
3*e1eccf28SAndroid Build Coastguard Worker  *
4*e1eccf28SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*e1eccf28SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*e1eccf28SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*e1eccf28SAndroid Build Coastguard Worker  *
8*e1eccf28SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*e1eccf28SAndroid Build Coastguard Worker  *
10*e1eccf28SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*e1eccf28SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*e1eccf28SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*e1eccf28SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*e1eccf28SAndroid Build Coastguard Worker  * limitations under the License.
15*e1eccf28SAndroid Build Coastguard Worker  */
16*e1eccf28SAndroid Build Coastguard Worker 
17*e1eccf28SAndroid Build Coastguard Worker 
18*e1eccf28SAndroid Build Coastguard Worker #include "rsCpuIntrinsic.h"
19*e1eccf28SAndroid Build Coastguard Worker #include "rsCpuIntrinsicInlines.h"
20*e1eccf28SAndroid Build Coastguard Worker 
21*e1eccf28SAndroid Build Coastguard Worker namespace android {
22*e1eccf28SAndroid Build Coastguard Worker namespace renderscript {
23*e1eccf28SAndroid Build Coastguard Worker 
24*e1eccf28SAndroid Build Coastguard Worker 
25*e1eccf28SAndroid Build Coastguard Worker class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
26*e1eccf28SAndroid Build Coastguard Worker public:
27*e1eccf28SAndroid Build Coastguard Worker     void populateScript(Script *) override;
28*e1eccf28SAndroid Build Coastguard Worker     void invokeFreeChildren() override;
29*e1eccf28SAndroid Build Coastguard Worker 
30*e1eccf28SAndroid Build Coastguard Worker     void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
31*e1eccf28SAndroid Build Coastguard Worker     void setGlobalObj(uint32_t slot, ObjectBase *data) override;
32*e1eccf28SAndroid Build Coastguard Worker 
33*e1eccf28SAndroid Build Coastguard Worker     ~RsdCpuScriptIntrinsicConvolve5x5() override;
34*e1eccf28SAndroid Build Coastguard Worker     RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
35*e1eccf28SAndroid Build Coastguard Worker 
36*e1eccf28SAndroid Build Coastguard Worker protected:
37*e1eccf28SAndroid Build Coastguard Worker     float mFp[28];
38*e1eccf28SAndroid Build Coastguard Worker     int16_t mIp[28];
39*e1eccf28SAndroid Build Coastguard Worker     ObjectBaseRef<Allocation> alloc;
40*e1eccf28SAndroid Build Coastguard Worker 
41*e1eccf28SAndroid Build Coastguard Worker 
42*e1eccf28SAndroid Build Coastguard Worker     static void kernelU1(const RsExpandKernelDriverInfo *info,
43*e1eccf28SAndroid Build Coastguard Worker                          uint32_t xstart, uint32_t xend,
44*e1eccf28SAndroid Build Coastguard Worker                          uint32_t outstep);
45*e1eccf28SAndroid Build Coastguard Worker     static void kernelU2(const RsExpandKernelDriverInfo *info,
46*e1eccf28SAndroid Build Coastguard Worker                          uint32_t xstart, uint32_t xend,
47*e1eccf28SAndroid Build Coastguard Worker                          uint32_t outstep);
48*e1eccf28SAndroid Build Coastguard Worker     static void kernelU4(const RsExpandKernelDriverInfo *info,
49*e1eccf28SAndroid Build Coastguard Worker                          uint32_t xstart, uint32_t xend,
50*e1eccf28SAndroid Build Coastguard Worker                          uint32_t outstep);
51*e1eccf28SAndroid Build Coastguard Worker     static void kernelF1(const RsExpandKernelDriverInfo *info,
52*e1eccf28SAndroid Build Coastguard Worker                          uint32_t xstart, uint32_t xend,
53*e1eccf28SAndroid Build Coastguard Worker                          uint32_t outstep);
54*e1eccf28SAndroid Build Coastguard Worker     static void kernelF2(const RsExpandKernelDriverInfo *info,
55*e1eccf28SAndroid Build Coastguard Worker                          uint32_t xstart, uint32_t xend,
56*e1eccf28SAndroid Build Coastguard Worker                          uint32_t outstep);
57*e1eccf28SAndroid Build Coastguard Worker     static void kernelF4(const RsExpandKernelDriverInfo *info,
58*e1eccf28SAndroid Build Coastguard Worker                          uint32_t xstart, uint32_t xend,
59*e1eccf28SAndroid Build Coastguard Worker                          uint32_t outstep);
60*e1eccf28SAndroid Build Coastguard Worker 
61*e1eccf28SAndroid Build Coastguard Worker 
62*e1eccf28SAndroid Build Coastguard Worker };
63*e1eccf28SAndroid Build Coastguard Worker 
setGlobalObj(uint32_t slot,ObjectBase * data)64*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
65*e1eccf28SAndroid Build Coastguard Worker     rsAssert(slot == 1);
66*e1eccf28SAndroid Build Coastguard Worker     alloc.set(static_cast<Allocation *>(data));
67*e1eccf28SAndroid Build Coastguard Worker }
68*e1eccf28SAndroid Build Coastguard Worker 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)69*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
70*e1eccf28SAndroid Build Coastguard Worker                                                     const void *data, size_t dataLength) {
71*e1eccf28SAndroid Build Coastguard Worker     rsAssert(slot == 0);
72*e1eccf28SAndroid Build Coastguard Worker     memcpy (&mFp, data, dataLength);
73*e1eccf28SAndroid Build Coastguard Worker     for(int ct=0; ct < 25; ct++) {
74*e1eccf28SAndroid Build Coastguard Worker         if (mFp[ct] >= 0) {
75*e1eccf28SAndroid Build Coastguard Worker             mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
76*e1eccf28SAndroid Build Coastguard Worker         } else {
77*e1eccf28SAndroid Build Coastguard Worker             mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
78*e1eccf28SAndroid Build Coastguard Worker         }
79*e1eccf28SAndroid Build Coastguard Worker     }
80*e1eccf28SAndroid Build Coastguard Worker }
81*e1eccf28SAndroid Build Coastguard Worker 
82*e1eccf28SAndroid Build Coastguard Worker 
OneU4(const RsExpandKernelDriverInfo * info,uint32_t x,uchar4 * out,const uchar4 * py0,const uchar4 * py1,const uchar4 * py2,const uchar4 * py3,const uchar4 * py4,const float * coeff)83*e1eccf28SAndroid Build Coastguard Worker static void OneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
84*e1eccf28SAndroid Build Coastguard Worker                   const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
85*e1eccf28SAndroid Build Coastguard Worker                   const float* coeff) {
86*e1eccf28SAndroid Build Coastguard Worker 
87*e1eccf28SAndroid Build Coastguard Worker     uint32_t x0 = rsMax((int32_t)x-2, 0);
88*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = rsMax((int32_t)x-1, 0);
89*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = x;
90*e1eccf28SAndroid Build Coastguard Worker     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
91*e1eccf28SAndroid Build Coastguard Worker     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
92*e1eccf28SAndroid Build Coastguard Worker 
93*e1eccf28SAndroid Build Coastguard Worker     float4 px = convert_float4(py0[x0]) * coeff[0] +
94*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py0[x1]) * coeff[1] +
95*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py0[x2]) * coeff[2] +
96*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py0[x3]) * coeff[3] +
97*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py0[x4]) * coeff[4] +
98*e1eccf28SAndroid Build Coastguard Worker 
99*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py1[x0]) * coeff[5] +
100*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py1[x1]) * coeff[6] +
101*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py1[x2]) * coeff[7] +
102*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py1[x3]) * coeff[8] +
103*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py1[x4]) * coeff[9] +
104*e1eccf28SAndroid Build Coastguard Worker 
105*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py2[x0]) * coeff[10] +
106*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py2[x1]) * coeff[11] +
107*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py2[x2]) * coeff[12] +
108*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py2[x3]) * coeff[13] +
109*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py2[x4]) * coeff[14] +
110*e1eccf28SAndroid Build Coastguard Worker 
111*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py3[x0]) * coeff[15] +
112*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py3[x1]) * coeff[16] +
113*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py3[x2]) * coeff[17] +
114*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py3[x3]) * coeff[18] +
115*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py3[x4]) * coeff[19] +
116*e1eccf28SAndroid Build Coastguard Worker 
117*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py4[x0]) * coeff[20] +
118*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py4[x1]) * coeff[21] +
119*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py4[x2]) * coeff[22] +
120*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py4[x3]) * coeff[23] +
121*e1eccf28SAndroid Build Coastguard Worker                 convert_float4(py4[x4]) * coeff[24];
122*e1eccf28SAndroid Build Coastguard Worker     px = clamp(px + 0.5f, 0.f, 255.f);
123*e1eccf28SAndroid Build Coastguard Worker     *out = convert_uchar4(px);
124*e1eccf28SAndroid Build Coastguard Worker }
125*e1eccf28SAndroid Build Coastguard Worker 
OneU2(const RsExpandKernelDriverInfo * info,uint32_t x,uchar2 * out,const uchar2 * py0,const uchar2 * py1,const uchar2 * py2,const uchar2 * py3,const uchar2 * py4,const float * coeff)126*e1eccf28SAndroid Build Coastguard Worker static void OneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
127*e1eccf28SAndroid Build Coastguard Worker                   const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
128*e1eccf28SAndroid Build Coastguard Worker                   const float* coeff) {
129*e1eccf28SAndroid Build Coastguard Worker 
130*e1eccf28SAndroid Build Coastguard Worker     uint32_t x0 = rsMax((int32_t)x-2, 0);
131*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = rsMax((int32_t)x-1, 0);
132*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = x;
133*e1eccf28SAndroid Build Coastguard Worker     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
134*e1eccf28SAndroid Build Coastguard Worker     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
135*e1eccf28SAndroid Build Coastguard Worker 
136*e1eccf28SAndroid Build Coastguard Worker     float2 px = convert_float2(py0[x0]) * coeff[0] +
137*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py0[x1]) * coeff[1] +
138*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py0[x2]) * coeff[2] +
139*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py0[x3]) * coeff[3] +
140*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py0[x4]) * coeff[4] +
141*e1eccf28SAndroid Build Coastguard Worker 
142*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py1[x0]) * coeff[5] +
143*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py1[x1]) * coeff[6] +
144*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py1[x2]) * coeff[7] +
145*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py1[x3]) * coeff[8] +
146*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py1[x4]) * coeff[9] +
147*e1eccf28SAndroid Build Coastguard Worker 
148*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py2[x0]) * coeff[10] +
149*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py2[x1]) * coeff[11] +
150*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py2[x2]) * coeff[12] +
151*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py2[x3]) * coeff[13] +
152*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py2[x4]) * coeff[14] +
153*e1eccf28SAndroid Build Coastguard Worker 
154*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py3[x0]) * coeff[15] +
155*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py3[x1]) * coeff[16] +
156*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py3[x2]) * coeff[17] +
157*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py3[x3]) * coeff[18] +
158*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py3[x4]) * coeff[19] +
159*e1eccf28SAndroid Build Coastguard Worker 
160*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py4[x0]) * coeff[20] +
161*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py4[x1]) * coeff[21] +
162*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py4[x2]) * coeff[22] +
163*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py4[x3]) * coeff[23] +
164*e1eccf28SAndroid Build Coastguard Worker                 convert_float2(py4[x4]) * coeff[24];
165*e1eccf28SAndroid Build Coastguard Worker     px = clamp(px + 0.5f, 0.f, 255.f);
166*e1eccf28SAndroid Build Coastguard Worker     *out = convert_uchar2(px);
167*e1eccf28SAndroid Build Coastguard Worker }
168*e1eccf28SAndroid Build Coastguard Worker 
OneU1(const RsExpandKernelDriverInfo * info,uint32_t x,uchar * out,const uchar * py0,const uchar * py1,const uchar * py2,const uchar * py3,const uchar * py4,const float * coeff)169*e1eccf28SAndroid Build Coastguard Worker static void OneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
170*e1eccf28SAndroid Build Coastguard Worker                   const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
171*e1eccf28SAndroid Build Coastguard Worker                   const float* coeff) {
172*e1eccf28SAndroid Build Coastguard Worker 
173*e1eccf28SAndroid Build Coastguard Worker     uint32_t x0 = rsMax((int32_t)x-2, 0);
174*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = rsMax((int32_t)x-1, 0);
175*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = x;
176*e1eccf28SAndroid Build Coastguard Worker     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
177*e1eccf28SAndroid Build Coastguard Worker     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
178*e1eccf28SAndroid Build Coastguard Worker 
179*e1eccf28SAndroid Build Coastguard Worker     float px = (float)(py0[x0]) * coeff[0] +
180*e1eccf28SAndroid Build Coastguard Worker                (float)(py0[x1]) * coeff[1] +
181*e1eccf28SAndroid Build Coastguard Worker                (float)(py0[x2]) * coeff[2] +
182*e1eccf28SAndroid Build Coastguard Worker                (float)(py0[x3]) * coeff[3] +
183*e1eccf28SAndroid Build Coastguard Worker                (float)(py0[x4]) * coeff[4] +
184*e1eccf28SAndroid Build Coastguard Worker 
185*e1eccf28SAndroid Build Coastguard Worker                (float)(py1[x0]) * coeff[5] +
186*e1eccf28SAndroid Build Coastguard Worker                (float)(py1[x1]) * coeff[6] +
187*e1eccf28SAndroid Build Coastguard Worker                (float)(py1[x2]) * coeff[7] +
188*e1eccf28SAndroid Build Coastguard Worker                (float)(py1[x3]) * coeff[8] +
189*e1eccf28SAndroid Build Coastguard Worker                (float)(py1[x4]) * coeff[9] +
190*e1eccf28SAndroid Build Coastguard Worker 
191*e1eccf28SAndroid Build Coastguard Worker                (float)(py2[x0]) * coeff[10] +
192*e1eccf28SAndroid Build Coastguard Worker                (float)(py2[x1]) * coeff[11] +
193*e1eccf28SAndroid Build Coastguard Worker                (float)(py2[x2]) * coeff[12] +
194*e1eccf28SAndroid Build Coastguard Worker                (float)(py2[x3]) * coeff[13] +
195*e1eccf28SAndroid Build Coastguard Worker                (float)(py2[x4]) * coeff[14] +
196*e1eccf28SAndroid Build Coastguard Worker 
197*e1eccf28SAndroid Build Coastguard Worker                (float)(py3[x0]) * coeff[15] +
198*e1eccf28SAndroid Build Coastguard Worker                (float)(py3[x1]) * coeff[16] +
199*e1eccf28SAndroid Build Coastguard Worker                (float)(py3[x2]) * coeff[17] +
200*e1eccf28SAndroid Build Coastguard Worker                (float)(py3[x3]) * coeff[18] +
201*e1eccf28SAndroid Build Coastguard Worker                (float)(py3[x4]) * coeff[19] +
202*e1eccf28SAndroid Build Coastguard Worker 
203*e1eccf28SAndroid Build Coastguard Worker                (float)(py4[x0]) * coeff[20] +
204*e1eccf28SAndroid Build Coastguard Worker                (float)(py4[x1]) * coeff[21] +
205*e1eccf28SAndroid Build Coastguard Worker                (float)(py4[x2]) * coeff[22] +
206*e1eccf28SAndroid Build Coastguard Worker                (float)(py4[x3]) * coeff[23] +
207*e1eccf28SAndroid Build Coastguard Worker                (float)(py4[x4]) * coeff[24];
208*e1eccf28SAndroid Build Coastguard Worker     px = clamp(px + 0.5f, 0.f, 255.f);
209*e1eccf28SAndroid Build Coastguard Worker     *out = px;
210*e1eccf28SAndroid Build Coastguard Worker }
211*e1eccf28SAndroid Build Coastguard Worker 
OneF4(const RsExpandKernelDriverInfo * info,uint32_t x,float4 * out,const float4 * py0,const float4 * py1,const float4 * py2,const float4 * py3,const float4 * py4,const float * coeff)212*e1eccf28SAndroid Build Coastguard Worker static void OneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
213*e1eccf28SAndroid Build Coastguard Worker                   const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
214*e1eccf28SAndroid Build Coastguard Worker                   const float* coeff) {
215*e1eccf28SAndroid Build Coastguard Worker 
216*e1eccf28SAndroid Build Coastguard Worker     uint32_t x0 = rsMax((int32_t)x-2, 0);
217*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = rsMax((int32_t)x-1, 0);
218*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = x;
219*e1eccf28SAndroid Build Coastguard Worker     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
220*e1eccf28SAndroid Build Coastguard Worker     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
221*e1eccf28SAndroid Build Coastguard Worker 
222*e1eccf28SAndroid Build Coastguard Worker     float4 px = py0[x0] * coeff[0] +
223*e1eccf28SAndroid Build Coastguard Worker                 py0[x1] * coeff[1] +
224*e1eccf28SAndroid Build Coastguard Worker                 py0[x2] * coeff[2] +
225*e1eccf28SAndroid Build Coastguard Worker                 py0[x3] * coeff[3] +
226*e1eccf28SAndroid Build Coastguard Worker                 py0[x4] * coeff[4] +
227*e1eccf28SAndroid Build Coastguard Worker 
228*e1eccf28SAndroid Build Coastguard Worker                 py1[x0] * coeff[5] +
229*e1eccf28SAndroid Build Coastguard Worker                 py1[x1] * coeff[6] +
230*e1eccf28SAndroid Build Coastguard Worker                 py1[x2] * coeff[7] +
231*e1eccf28SAndroid Build Coastguard Worker                 py1[x3] * coeff[8] +
232*e1eccf28SAndroid Build Coastguard Worker                 py1[x4] * coeff[9] +
233*e1eccf28SAndroid Build Coastguard Worker 
234*e1eccf28SAndroid Build Coastguard Worker                 py2[x0] * coeff[10] +
235*e1eccf28SAndroid Build Coastguard Worker                 py2[x1] * coeff[11] +
236*e1eccf28SAndroid Build Coastguard Worker                 py2[x2] * coeff[12] +
237*e1eccf28SAndroid Build Coastguard Worker                 py2[x3] * coeff[13] +
238*e1eccf28SAndroid Build Coastguard Worker                 py2[x4] * coeff[14] +
239*e1eccf28SAndroid Build Coastguard Worker 
240*e1eccf28SAndroid Build Coastguard Worker                 py3[x0] * coeff[15] +
241*e1eccf28SAndroid Build Coastguard Worker                 py3[x1] * coeff[16] +
242*e1eccf28SAndroid Build Coastguard Worker                 py3[x2] * coeff[17] +
243*e1eccf28SAndroid Build Coastguard Worker                 py3[x3] * coeff[18] +
244*e1eccf28SAndroid Build Coastguard Worker                 py3[x4] * coeff[19] +
245*e1eccf28SAndroid Build Coastguard Worker 
246*e1eccf28SAndroid Build Coastguard Worker                 py4[x0] * coeff[20] +
247*e1eccf28SAndroid Build Coastguard Worker                 py4[x1] * coeff[21] +
248*e1eccf28SAndroid Build Coastguard Worker                 py4[x2] * coeff[22] +
249*e1eccf28SAndroid Build Coastguard Worker                 py4[x3] * coeff[23] +
250*e1eccf28SAndroid Build Coastguard Worker                 py4[x4] * coeff[24];
251*e1eccf28SAndroid Build Coastguard Worker     *out = px;
252*e1eccf28SAndroid Build Coastguard Worker }
253*e1eccf28SAndroid Build Coastguard Worker 
OneF2(const RsExpandKernelDriverInfo * info,uint32_t x,float2 * out,const float2 * py0,const float2 * py1,const float2 * py2,const float2 * py3,const float2 * py4,const float * coeff)254*e1eccf28SAndroid Build Coastguard Worker static void OneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
255*e1eccf28SAndroid Build Coastguard Worker                   const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
256*e1eccf28SAndroid Build Coastguard Worker                   const float* coeff) {
257*e1eccf28SAndroid Build Coastguard Worker 
258*e1eccf28SAndroid Build Coastguard Worker     uint32_t x0 = rsMax((int32_t)x-2, 0);
259*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = rsMax((int32_t)x-1, 0);
260*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = x;
261*e1eccf28SAndroid Build Coastguard Worker     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
262*e1eccf28SAndroid Build Coastguard Worker     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
263*e1eccf28SAndroid Build Coastguard Worker 
264*e1eccf28SAndroid Build Coastguard Worker     float2 px = py0[x0] * coeff[0] +
265*e1eccf28SAndroid Build Coastguard Worker                 py0[x1] * coeff[1] +
266*e1eccf28SAndroid Build Coastguard Worker                 py0[x2] * coeff[2] +
267*e1eccf28SAndroid Build Coastguard Worker                 py0[x3] * coeff[3] +
268*e1eccf28SAndroid Build Coastguard Worker                 py0[x4] * coeff[4] +
269*e1eccf28SAndroid Build Coastguard Worker 
270*e1eccf28SAndroid Build Coastguard Worker                 py1[x0] * coeff[5] +
271*e1eccf28SAndroid Build Coastguard Worker                 py1[x1] * coeff[6] +
272*e1eccf28SAndroid Build Coastguard Worker                 py1[x2] * coeff[7] +
273*e1eccf28SAndroid Build Coastguard Worker                 py1[x3] * coeff[8] +
274*e1eccf28SAndroid Build Coastguard Worker                 py1[x4] * coeff[9] +
275*e1eccf28SAndroid Build Coastguard Worker 
276*e1eccf28SAndroid Build Coastguard Worker                 py2[x0] * coeff[10] +
277*e1eccf28SAndroid Build Coastguard Worker                 py2[x1] * coeff[11] +
278*e1eccf28SAndroid Build Coastguard Worker                 py2[x2] * coeff[12] +
279*e1eccf28SAndroid Build Coastguard Worker                 py2[x3] * coeff[13] +
280*e1eccf28SAndroid Build Coastguard Worker                 py2[x4] * coeff[14] +
281*e1eccf28SAndroid Build Coastguard Worker 
282*e1eccf28SAndroid Build Coastguard Worker                 py3[x0] * coeff[15] +
283*e1eccf28SAndroid Build Coastguard Worker                 py3[x1] * coeff[16] +
284*e1eccf28SAndroid Build Coastguard Worker                 py3[x2] * coeff[17] +
285*e1eccf28SAndroid Build Coastguard Worker                 py3[x3] * coeff[18] +
286*e1eccf28SAndroid Build Coastguard Worker                 py3[x4] * coeff[19] +
287*e1eccf28SAndroid Build Coastguard Worker 
288*e1eccf28SAndroid Build Coastguard Worker                 py4[x0] * coeff[20] +
289*e1eccf28SAndroid Build Coastguard Worker                 py4[x1] * coeff[21] +
290*e1eccf28SAndroid Build Coastguard Worker                 py4[x2] * coeff[22] +
291*e1eccf28SAndroid Build Coastguard Worker                 py4[x3] * coeff[23] +
292*e1eccf28SAndroid Build Coastguard Worker                 py4[x4] * coeff[24];
293*e1eccf28SAndroid Build Coastguard Worker     *out = px;
294*e1eccf28SAndroid Build Coastguard Worker }
295*e1eccf28SAndroid Build Coastguard Worker 
OneF1(const RsExpandKernelDriverInfo * info,uint32_t x,float * out,const float * py0,const float * py1,const float * py2,const float * py3,const float * py4,const float * coeff)296*e1eccf28SAndroid Build Coastguard Worker static void OneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
297*e1eccf28SAndroid Build Coastguard Worker                   const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
298*e1eccf28SAndroid Build Coastguard Worker                   const float* coeff) {
299*e1eccf28SAndroid Build Coastguard Worker 
300*e1eccf28SAndroid Build Coastguard Worker     uint32_t x0 = rsMax((int32_t)x-2, 0);
301*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = rsMax((int32_t)x-1, 0);
302*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = x;
303*e1eccf28SAndroid Build Coastguard Worker     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
304*e1eccf28SAndroid Build Coastguard Worker     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
305*e1eccf28SAndroid Build Coastguard Worker 
306*e1eccf28SAndroid Build Coastguard Worker     float px = py0[x0] * coeff[0] +
307*e1eccf28SAndroid Build Coastguard Worker                py0[x1] * coeff[1] +
308*e1eccf28SAndroid Build Coastguard Worker                py0[x2] * coeff[2] +
309*e1eccf28SAndroid Build Coastguard Worker                py0[x3] * coeff[3] +
310*e1eccf28SAndroid Build Coastguard Worker                py0[x4] * coeff[4] +
311*e1eccf28SAndroid Build Coastguard Worker 
312*e1eccf28SAndroid Build Coastguard Worker                py1[x0] * coeff[5] +
313*e1eccf28SAndroid Build Coastguard Worker                py1[x1] * coeff[6] +
314*e1eccf28SAndroid Build Coastguard Worker                py1[x2] * coeff[7] +
315*e1eccf28SAndroid Build Coastguard Worker                py1[x3] * coeff[8] +
316*e1eccf28SAndroid Build Coastguard Worker                py1[x4] * coeff[9] +
317*e1eccf28SAndroid Build Coastguard Worker 
318*e1eccf28SAndroid Build Coastguard Worker                py2[x0] * coeff[10] +
319*e1eccf28SAndroid Build Coastguard Worker                py2[x1] * coeff[11] +
320*e1eccf28SAndroid Build Coastguard Worker                py2[x2] * coeff[12] +
321*e1eccf28SAndroid Build Coastguard Worker                py2[x3] * coeff[13] +
322*e1eccf28SAndroid Build Coastguard Worker                py2[x4] * coeff[14] +
323*e1eccf28SAndroid Build Coastguard Worker 
324*e1eccf28SAndroid Build Coastguard Worker                py3[x0] * coeff[15] +
325*e1eccf28SAndroid Build Coastguard Worker                py3[x1] * coeff[16] +
326*e1eccf28SAndroid Build Coastguard Worker                py3[x2] * coeff[17] +
327*e1eccf28SAndroid Build Coastguard Worker                py3[x3] * coeff[18] +
328*e1eccf28SAndroid Build Coastguard Worker                py3[x4] * coeff[19] +
329*e1eccf28SAndroid Build Coastguard Worker 
330*e1eccf28SAndroid Build Coastguard Worker                py4[x0] * coeff[20] +
331*e1eccf28SAndroid Build Coastguard Worker                py4[x1] * coeff[21] +
332*e1eccf28SAndroid Build Coastguard Worker                py4[x2] * coeff[22] +
333*e1eccf28SAndroid Build Coastguard Worker                py4[x3] * coeff[23] +
334*e1eccf28SAndroid Build Coastguard Worker                py4[x4] * coeff[24];
335*e1eccf28SAndroid Build Coastguard Worker     *out = px;
336*e1eccf28SAndroid Build Coastguard Worker }
337*e1eccf28SAndroid Build Coastguard Worker 
338*e1eccf28SAndroid Build Coastguard Worker 
339*e1eccf28SAndroid Build Coastguard Worker extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
340*e1eccf28SAndroid Build Coastguard Worker                                           const void *y2, const void *y3, const void *y4,
341*e1eccf28SAndroid Build Coastguard Worker                                           const int16_t *coef, uint32_t count);
342*e1eccf28SAndroid Build Coastguard Worker 
kernelU4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)343*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelDriverInfo *info,
344*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t xstart, uint32_t xend,
345*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t outstep) {
346*e1eccf28SAndroid Build Coastguard Worker     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
347*e1eccf28SAndroid Build Coastguard Worker     if (!cp->alloc.get()) {
348*e1eccf28SAndroid Build Coastguard Worker         ALOGE("Convolve5x5 executed without input, skipping");
349*e1eccf28SAndroid Build Coastguard Worker         return;
350*e1eccf28SAndroid Build Coastguard Worker     }
351*e1eccf28SAndroid Build Coastguard Worker     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
352*e1eccf28SAndroid Build Coastguard Worker     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
353*e1eccf28SAndroid Build Coastguard Worker 
354*e1eccf28SAndroid Build Coastguard Worker     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
355*e1eccf28SAndroid Build Coastguard Worker     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
356*e1eccf28SAndroid Build Coastguard Worker     uint32_t y2 = info->current.y;
357*e1eccf28SAndroid Build Coastguard Worker     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
358*e1eccf28SAndroid Build Coastguard Worker     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
359*e1eccf28SAndroid Build Coastguard Worker 
360*e1eccf28SAndroid Build Coastguard Worker     const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
361*e1eccf28SAndroid Build Coastguard Worker     const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
362*e1eccf28SAndroid Build Coastguard Worker     const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
363*e1eccf28SAndroid Build Coastguard Worker     const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
364*e1eccf28SAndroid Build Coastguard Worker     const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
365*e1eccf28SAndroid Build Coastguard Worker 
366*e1eccf28SAndroid Build Coastguard Worker     uchar4 *out = (uchar4 *)info->outPtr[0];
367*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = xstart;
368*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = xend;
369*e1eccf28SAndroid Build Coastguard Worker 
370*e1eccf28SAndroid Build Coastguard Worker     while((x1 < x2) && (x1 < 2)) {
371*e1eccf28SAndroid Build Coastguard Worker         OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
372*e1eccf28SAndroid Build Coastguard Worker         out++;
373*e1eccf28SAndroid Build Coastguard Worker         x1++;
374*e1eccf28SAndroid Build Coastguard Worker     }
375*e1eccf28SAndroid Build Coastguard Worker #if defined(ARCH_X86_HAVE_SSSE3)
376*e1eccf28SAndroid Build Coastguard Worker     // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
377*e1eccf28SAndroid Build Coastguard Worker     // 3 for end boundary where x may hit the end boundary)
378*e1eccf28SAndroid Build Coastguard Worker     if (gArchUseSIMD &&((x1 + 6) < x2)) {
379*e1eccf28SAndroid Build Coastguard Worker         // subtract 3 for end boundary
380*e1eccf28SAndroid Build Coastguard Worker         uint32_t len = (x2 - x1 - 3) >> 2;
381*e1eccf28SAndroid Build Coastguard Worker         rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
382*e1eccf28SAndroid Build Coastguard Worker         out += len << 2;
383*e1eccf28SAndroid Build Coastguard Worker         x1 += len << 2;
384*e1eccf28SAndroid Build Coastguard Worker     }
385*e1eccf28SAndroid Build Coastguard Worker #endif
386*e1eccf28SAndroid Build Coastguard Worker 
387*e1eccf28SAndroid Build Coastguard Worker #if defined(ARCH_ARM_USE_INTRINSICS)
388*e1eccf28SAndroid Build Coastguard Worker     if(gArchUseSIMD && ((x1 + 3) < x2)) {
389*e1eccf28SAndroid Build Coastguard Worker         uint32_t len = (x2 - x1 - 3) >> 1;
390*e1eccf28SAndroid Build Coastguard Worker         rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
391*e1eccf28SAndroid Build Coastguard Worker         out += len << 1;
392*e1eccf28SAndroid Build Coastguard Worker         x1 += len << 1;
393*e1eccf28SAndroid Build Coastguard Worker     }
394*e1eccf28SAndroid Build Coastguard Worker #endif
395*e1eccf28SAndroid Build Coastguard Worker 
396*e1eccf28SAndroid Build Coastguard Worker     while(x1 < x2) {
397*e1eccf28SAndroid Build Coastguard Worker         OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
398*e1eccf28SAndroid Build Coastguard Worker         out++;
399*e1eccf28SAndroid Build Coastguard Worker         x1++;
400*e1eccf28SAndroid Build Coastguard Worker     }
401*e1eccf28SAndroid Build Coastguard Worker }
402*e1eccf28SAndroid Build Coastguard Worker 
kernelU2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)403*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelDriverInfo *info,
404*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t xstart, uint32_t xend,
405*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t outstep) {
406*e1eccf28SAndroid Build Coastguard Worker     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
407*e1eccf28SAndroid Build Coastguard Worker     if (!cp->alloc.get()) {
408*e1eccf28SAndroid Build Coastguard Worker         ALOGE("Convolve5x5 executed without input, skipping");
409*e1eccf28SAndroid Build Coastguard Worker         return;
410*e1eccf28SAndroid Build Coastguard Worker     }
411*e1eccf28SAndroid Build Coastguard Worker     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
412*e1eccf28SAndroid Build Coastguard Worker     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
413*e1eccf28SAndroid Build Coastguard Worker 
414*e1eccf28SAndroid Build Coastguard Worker     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
415*e1eccf28SAndroid Build Coastguard Worker     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
416*e1eccf28SAndroid Build Coastguard Worker     uint32_t y2 = info->current.y;
417*e1eccf28SAndroid Build Coastguard Worker     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
418*e1eccf28SAndroid Build Coastguard Worker     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
419*e1eccf28SAndroid Build Coastguard Worker 
420*e1eccf28SAndroid Build Coastguard Worker     const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
421*e1eccf28SAndroid Build Coastguard Worker     const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
422*e1eccf28SAndroid Build Coastguard Worker     const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
423*e1eccf28SAndroid Build Coastguard Worker     const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
424*e1eccf28SAndroid Build Coastguard Worker     const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
425*e1eccf28SAndroid Build Coastguard Worker 
426*e1eccf28SAndroid Build Coastguard Worker     uchar2 *out = (uchar2 *)info->outPtr[0];
427*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = xstart;
428*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = xend;
429*e1eccf28SAndroid Build Coastguard Worker 
430*e1eccf28SAndroid Build Coastguard Worker     while((x1 < x2) && (x1 < 2)) {
431*e1eccf28SAndroid Build Coastguard Worker         OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
432*e1eccf28SAndroid Build Coastguard Worker         out++;
433*e1eccf28SAndroid Build Coastguard Worker         x1++;
434*e1eccf28SAndroid Build Coastguard Worker     }
435*e1eccf28SAndroid Build Coastguard Worker 
436*e1eccf28SAndroid Build Coastguard Worker #if 0//defined(ARCH_ARM_HAVE_NEON)
437*e1eccf28SAndroid Build Coastguard Worker     if((x1 + 3) < x2) {
438*e1eccf28SAndroid Build Coastguard Worker         uint32_t len = (x2 - x1 - 3) >> 1;
439*e1eccf28SAndroid Build Coastguard Worker         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
440*e1eccf28SAndroid Build Coastguard Worker         out += len << 1;
441*e1eccf28SAndroid Build Coastguard Worker         x1 += len << 1;
442*e1eccf28SAndroid Build Coastguard Worker     }
443*e1eccf28SAndroid Build Coastguard Worker #endif
444*e1eccf28SAndroid Build Coastguard Worker 
445*e1eccf28SAndroid Build Coastguard Worker     while(x1 < x2) {
446*e1eccf28SAndroid Build Coastguard Worker         OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
447*e1eccf28SAndroid Build Coastguard Worker         out++;
448*e1eccf28SAndroid Build Coastguard Worker         x1++;
449*e1eccf28SAndroid Build Coastguard Worker     }
450*e1eccf28SAndroid Build Coastguard Worker }
451*e1eccf28SAndroid Build Coastguard Worker 
kernelU1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)452*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelDriverInfo *info,
453*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t xstart, uint32_t xend,
454*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t outstep) {
455*e1eccf28SAndroid Build Coastguard Worker     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
456*e1eccf28SAndroid Build Coastguard Worker     if (!cp->alloc.get()) {
457*e1eccf28SAndroid Build Coastguard Worker         ALOGE("Convolve5x5 executed without input, skipping");
458*e1eccf28SAndroid Build Coastguard Worker         return;
459*e1eccf28SAndroid Build Coastguard Worker     }
460*e1eccf28SAndroid Build Coastguard Worker     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
461*e1eccf28SAndroid Build Coastguard Worker     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
462*e1eccf28SAndroid Build Coastguard Worker 
463*e1eccf28SAndroid Build Coastguard Worker     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
464*e1eccf28SAndroid Build Coastguard Worker     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
465*e1eccf28SAndroid Build Coastguard Worker     uint32_t y2 = info->current.y;
466*e1eccf28SAndroid Build Coastguard Worker     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
467*e1eccf28SAndroid Build Coastguard Worker     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
468*e1eccf28SAndroid Build Coastguard Worker 
469*e1eccf28SAndroid Build Coastguard Worker     const uchar *py0 = (const uchar *)(pin + stride * y0);
470*e1eccf28SAndroid Build Coastguard Worker     const uchar *py1 = (const uchar *)(pin + stride * y1);
471*e1eccf28SAndroid Build Coastguard Worker     const uchar *py2 = (const uchar *)(pin + stride * y2);
472*e1eccf28SAndroid Build Coastguard Worker     const uchar *py3 = (const uchar *)(pin + stride * y3);
473*e1eccf28SAndroid Build Coastguard Worker     const uchar *py4 = (const uchar *)(pin + stride * y4);
474*e1eccf28SAndroid Build Coastguard Worker 
475*e1eccf28SAndroid Build Coastguard Worker     uchar *out = (uchar *)info->outPtr[0];
476*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = xstart;
477*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = xend;
478*e1eccf28SAndroid Build Coastguard Worker 
479*e1eccf28SAndroid Build Coastguard Worker     while((x1 < x2) && (x1 < 2)) {
480*e1eccf28SAndroid Build Coastguard Worker         OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
481*e1eccf28SAndroid Build Coastguard Worker         out++;
482*e1eccf28SAndroid Build Coastguard Worker         x1++;
483*e1eccf28SAndroid Build Coastguard Worker     }
484*e1eccf28SAndroid Build Coastguard Worker 
485*e1eccf28SAndroid Build Coastguard Worker #if 0//defined(ARCH_ARM_HAVE_NEON)
486*e1eccf28SAndroid Build Coastguard Worker     if((x1 + 3) < x2) {
487*e1eccf28SAndroid Build Coastguard Worker         uint32_t len = (x2 - x1 - 3) >> 1;
488*e1eccf28SAndroid Build Coastguard Worker         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
489*e1eccf28SAndroid Build Coastguard Worker         out += len << 1;
490*e1eccf28SAndroid Build Coastguard Worker         x1 += len << 1;
491*e1eccf28SAndroid Build Coastguard Worker     }
492*e1eccf28SAndroid Build Coastguard Worker #endif
493*e1eccf28SAndroid Build Coastguard Worker 
494*e1eccf28SAndroid Build Coastguard Worker     while(x1 < x2) {
495*e1eccf28SAndroid Build Coastguard Worker         OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
496*e1eccf28SAndroid Build Coastguard Worker         out++;
497*e1eccf28SAndroid Build Coastguard Worker         x1++;
498*e1eccf28SAndroid Build Coastguard Worker     }
499*e1eccf28SAndroid Build Coastguard Worker }
500*e1eccf28SAndroid Build Coastguard Worker 
kernelF4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)501*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelDriverInfo *info,
502*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t xstart, uint32_t xend,
503*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t outstep) {
504*e1eccf28SAndroid Build Coastguard Worker     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
505*e1eccf28SAndroid Build Coastguard Worker     if (!cp->alloc.get()) {
506*e1eccf28SAndroid Build Coastguard Worker         ALOGE("Convolve5x5 executed without input, skipping");
507*e1eccf28SAndroid Build Coastguard Worker         return;
508*e1eccf28SAndroid Build Coastguard Worker     }
509*e1eccf28SAndroid Build Coastguard Worker     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
510*e1eccf28SAndroid Build Coastguard Worker     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
511*e1eccf28SAndroid Build Coastguard Worker 
512*e1eccf28SAndroid Build Coastguard Worker     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
513*e1eccf28SAndroid Build Coastguard Worker     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
514*e1eccf28SAndroid Build Coastguard Worker     uint32_t y2 = info->current.y;
515*e1eccf28SAndroid Build Coastguard Worker     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
516*e1eccf28SAndroid Build Coastguard Worker     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
517*e1eccf28SAndroid Build Coastguard Worker 
518*e1eccf28SAndroid Build Coastguard Worker     const float4 *py0 = (const float4 *)(pin + stride * y0);
519*e1eccf28SAndroid Build Coastguard Worker     const float4 *py1 = (const float4 *)(pin + stride * y1);
520*e1eccf28SAndroid Build Coastguard Worker     const float4 *py2 = (const float4 *)(pin + stride * y2);
521*e1eccf28SAndroid Build Coastguard Worker     const float4 *py3 = (const float4 *)(pin + stride * y3);
522*e1eccf28SAndroid Build Coastguard Worker     const float4 *py4 = (const float4 *)(pin + stride * y4);
523*e1eccf28SAndroid Build Coastguard Worker 
524*e1eccf28SAndroid Build Coastguard Worker     float4 *out = (float4 *)info->outPtr[0];
525*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = xstart;
526*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = xend;
527*e1eccf28SAndroid Build Coastguard Worker 
528*e1eccf28SAndroid Build Coastguard Worker     while((x1 < x2) && (x1 < 2)) {
529*e1eccf28SAndroid Build Coastguard Worker         OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
530*e1eccf28SAndroid Build Coastguard Worker         out++;
531*e1eccf28SAndroid Build Coastguard Worker         x1++;
532*e1eccf28SAndroid Build Coastguard Worker     }
533*e1eccf28SAndroid Build Coastguard Worker 
534*e1eccf28SAndroid Build Coastguard Worker #if 0//defined(ARCH_ARM_HAVE_NEON)
535*e1eccf28SAndroid Build Coastguard Worker     if((x1 + 3) < x2) {
536*e1eccf28SAndroid Build Coastguard Worker         uint32_t len = (x2 - x1 - 3) >> 1;
537*e1eccf28SAndroid Build Coastguard Worker         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
538*e1eccf28SAndroid Build Coastguard Worker         out += len << 1;
539*e1eccf28SAndroid Build Coastguard Worker         x1 += len << 1;
540*e1eccf28SAndroid Build Coastguard Worker     }
541*e1eccf28SAndroid Build Coastguard Worker #endif
542*e1eccf28SAndroid Build Coastguard Worker 
543*e1eccf28SAndroid Build Coastguard Worker     while(x1 < x2) {
544*e1eccf28SAndroid Build Coastguard Worker         OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
545*e1eccf28SAndroid Build Coastguard Worker         out++;
546*e1eccf28SAndroid Build Coastguard Worker         x1++;
547*e1eccf28SAndroid Build Coastguard Worker     }
548*e1eccf28SAndroid Build Coastguard Worker }
549*e1eccf28SAndroid Build Coastguard Worker 
kernelF2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)550*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelDriverInfo *info,
551*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t xstart, uint32_t xend,
552*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t outstep) {
553*e1eccf28SAndroid Build Coastguard Worker     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
554*e1eccf28SAndroid Build Coastguard Worker     if (!cp->alloc.get()) {
555*e1eccf28SAndroid Build Coastguard Worker         ALOGE("Convolve5x5 executed without input, skipping");
556*e1eccf28SAndroid Build Coastguard Worker         return;
557*e1eccf28SAndroid Build Coastguard Worker     }
558*e1eccf28SAndroid Build Coastguard Worker     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
559*e1eccf28SAndroid Build Coastguard Worker     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
560*e1eccf28SAndroid Build Coastguard Worker 
561*e1eccf28SAndroid Build Coastguard Worker     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
562*e1eccf28SAndroid Build Coastguard Worker     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
563*e1eccf28SAndroid Build Coastguard Worker     uint32_t y2 = info->current.y;
564*e1eccf28SAndroid Build Coastguard Worker     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
565*e1eccf28SAndroid Build Coastguard Worker     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
566*e1eccf28SAndroid Build Coastguard Worker 
567*e1eccf28SAndroid Build Coastguard Worker     const float2 *py0 = (const float2 *)(pin + stride * y0);
568*e1eccf28SAndroid Build Coastguard Worker     const float2 *py1 = (const float2 *)(pin + stride * y1);
569*e1eccf28SAndroid Build Coastguard Worker     const float2 *py2 = (const float2 *)(pin + stride * y2);
570*e1eccf28SAndroid Build Coastguard Worker     const float2 *py3 = (const float2 *)(pin + stride * y3);
571*e1eccf28SAndroid Build Coastguard Worker     const float2 *py4 = (const float2 *)(pin + stride * y4);
572*e1eccf28SAndroid Build Coastguard Worker 
573*e1eccf28SAndroid Build Coastguard Worker     float2 *out = (float2 *)info->outPtr[0];
574*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = xstart;
575*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = xend;
576*e1eccf28SAndroid Build Coastguard Worker 
577*e1eccf28SAndroid Build Coastguard Worker     while((x1 < x2) && (x1 < 2)) {
578*e1eccf28SAndroid Build Coastguard Worker         OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
579*e1eccf28SAndroid Build Coastguard Worker         out++;
580*e1eccf28SAndroid Build Coastguard Worker         x1++;
581*e1eccf28SAndroid Build Coastguard Worker     }
582*e1eccf28SAndroid Build Coastguard Worker 
583*e1eccf28SAndroid Build Coastguard Worker #if 0//defined(ARCH_ARM_HAVE_NEON)
584*e1eccf28SAndroid Build Coastguard Worker     if((x1 + 3) < x2) {
585*e1eccf28SAndroid Build Coastguard Worker         uint32_t len = (x2 - x1 - 3) >> 1;
586*e1eccf28SAndroid Build Coastguard Worker         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
587*e1eccf28SAndroid Build Coastguard Worker         out += len << 1;
588*e1eccf28SAndroid Build Coastguard Worker         x1 += len << 1;
589*e1eccf28SAndroid Build Coastguard Worker     }
590*e1eccf28SAndroid Build Coastguard Worker #endif
591*e1eccf28SAndroid Build Coastguard Worker 
592*e1eccf28SAndroid Build Coastguard Worker     while(x1 < x2) {
593*e1eccf28SAndroid Build Coastguard Worker         OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
594*e1eccf28SAndroid Build Coastguard Worker         out++;
595*e1eccf28SAndroid Build Coastguard Worker         x1++;
596*e1eccf28SAndroid Build Coastguard Worker     }
597*e1eccf28SAndroid Build Coastguard Worker }
598*e1eccf28SAndroid Build Coastguard Worker 
kernelF1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)599*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelDriverInfo *info,
600*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t xstart, uint32_t xend,
601*e1eccf28SAndroid Build Coastguard Worker                                                 uint32_t outstep) {
602*e1eccf28SAndroid Build Coastguard Worker     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
603*e1eccf28SAndroid Build Coastguard Worker     if (!cp->alloc.get()) {
604*e1eccf28SAndroid Build Coastguard Worker         ALOGE("Convolve5x5 executed without input, skipping");
605*e1eccf28SAndroid Build Coastguard Worker         return;
606*e1eccf28SAndroid Build Coastguard Worker     }
607*e1eccf28SAndroid Build Coastguard Worker     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
608*e1eccf28SAndroid Build Coastguard Worker     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
609*e1eccf28SAndroid Build Coastguard Worker 
610*e1eccf28SAndroid Build Coastguard Worker     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
611*e1eccf28SAndroid Build Coastguard Worker     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
612*e1eccf28SAndroid Build Coastguard Worker     uint32_t y2 = info->current.y;
613*e1eccf28SAndroid Build Coastguard Worker     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
614*e1eccf28SAndroid Build Coastguard Worker     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
615*e1eccf28SAndroid Build Coastguard Worker 
616*e1eccf28SAndroid Build Coastguard Worker     const float *py0 = (const float *)(pin + stride * y0);
617*e1eccf28SAndroid Build Coastguard Worker     const float *py1 = (const float *)(pin + stride * y1);
618*e1eccf28SAndroid Build Coastguard Worker     const float *py2 = (const float *)(pin + stride * y2);
619*e1eccf28SAndroid Build Coastguard Worker     const float *py3 = (const float *)(pin + stride * y3);
620*e1eccf28SAndroid Build Coastguard Worker     const float *py4 = (const float *)(pin + stride * y4);
621*e1eccf28SAndroid Build Coastguard Worker 
622*e1eccf28SAndroid Build Coastguard Worker     float *out = (float *)info->outPtr[0];
623*e1eccf28SAndroid Build Coastguard Worker     uint32_t x1 = xstart;
624*e1eccf28SAndroid Build Coastguard Worker     uint32_t x2 = xend;
625*e1eccf28SAndroid Build Coastguard Worker 
626*e1eccf28SAndroid Build Coastguard Worker     while((x1 < x2) && (x1 < 2)) {
627*e1eccf28SAndroid Build Coastguard Worker         OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
628*e1eccf28SAndroid Build Coastguard Worker         out++;
629*e1eccf28SAndroid Build Coastguard Worker         x1++;
630*e1eccf28SAndroid Build Coastguard Worker     }
631*e1eccf28SAndroid Build Coastguard Worker 
632*e1eccf28SAndroid Build Coastguard Worker #if 0//defined(ARCH_ARM_HAVE_NEON)
633*e1eccf28SAndroid Build Coastguard Worker     if((x1 + 3) < x2) {
634*e1eccf28SAndroid Build Coastguard Worker         uint32_t len = (x2 - x1 - 3) >> 1;
635*e1eccf28SAndroid Build Coastguard Worker         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
636*e1eccf28SAndroid Build Coastguard Worker         out += len << 1;
637*e1eccf28SAndroid Build Coastguard Worker         x1 += len << 1;
638*e1eccf28SAndroid Build Coastguard Worker     }
639*e1eccf28SAndroid Build Coastguard Worker #endif
640*e1eccf28SAndroid Build Coastguard Worker 
641*e1eccf28SAndroid Build Coastguard Worker     while(x1 < x2) {
642*e1eccf28SAndroid Build Coastguard Worker         OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
643*e1eccf28SAndroid Build Coastguard Worker         out++;
644*e1eccf28SAndroid Build Coastguard Worker         x1++;
645*e1eccf28SAndroid Build Coastguard Worker     }
646*e1eccf28SAndroid Build Coastguard Worker }
647*e1eccf28SAndroid Build Coastguard Worker 
RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)648*e1eccf28SAndroid Build Coastguard Worker RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
649*e1eccf28SAndroid Build Coastguard Worker             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
650*e1eccf28SAndroid Build Coastguard Worker             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
651*e1eccf28SAndroid Build Coastguard Worker 
652*e1eccf28SAndroid Build Coastguard Worker     if (e->getType() == RS_TYPE_FLOAT_32) {
653*e1eccf28SAndroid Build Coastguard Worker         switch(e->getVectorSize()) {
654*e1eccf28SAndroid Build Coastguard Worker         case 1:
655*e1eccf28SAndroid Build Coastguard Worker             mRootPtr = &kernelF1;
656*e1eccf28SAndroid Build Coastguard Worker             break;
657*e1eccf28SAndroid Build Coastguard Worker         case 2:
658*e1eccf28SAndroid Build Coastguard Worker             mRootPtr = &kernelF2;
659*e1eccf28SAndroid Build Coastguard Worker             break;
660*e1eccf28SAndroid Build Coastguard Worker         case 3:
661*e1eccf28SAndroid Build Coastguard Worker         case 4:
662*e1eccf28SAndroid Build Coastguard Worker             mRootPtr = &kernelF4;
663*e1eccf28SAndroid Build Coastguard Worker             break;
664*e1eccf28SAndroid Build Coastguard Worker         }
665*e1eccf28SAndroid Build Coastguard Worker     } else {
666*e1eccf28SAndroid Build Coastguard Worker         switch(e->getVectorSize()) {
667*e1eccf28SAndroid Build Coastguard Worker         case 1:
668*e1eccf28SAndroid Build Coastguard Worker             mRootPtr = &kernelU1;
669*e1eccf28SAndroid Build Coastguard Worker             break;
670*e1eccf28SAndroid Build Coastguard Worker         case 2:
671*e1eccf28SAndroid Build Coastguard Worker             mRootPtr = &kernelU2;
672*e1eccf28SAndroid Build Coastguard Worker             break;
673*e1eccf28SAndroid Build Coastguard Worker         case 3:
674*e1eccf28SAndroid Build Coastguard Worker         case 4:
675*e1eccf28SAndroid Build Coastguard Worker             mRootPtr = &kernelU4;
676*e1eccf28SAndroid Build Coastguard Worker             break;
677*e1eccf28SAndroid Build Coastguard Worker         }
678*e1eccf28SAndroid Build Coastguard Worker     }
679*e1eccf28SAndroid Build Coastguard Worker     for(int ct=0; ct < 25; ct++) {
680*e1eccf28SAndroid Build Coastguard Worker         mFp[ct] = 1.f / 25.f;
681*e1eccf28SAndroid Build Coastguard Worker         mIp[ct] = (int16_t)(mFp[ct] * 256.f);
682*e1eccf28SAndroid Build Coastguard Worker     }
683*e1eccf28SAndroid Build Coastguard Worker }
684*e1eccf28SAndroid Build Coastguard Worker 
~RsdCpuScriptIntrinsicConvolve5x5()685*e1eccf28SAndroid Build Coastguard Worker RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
686*e1eccf28SAndroid Build Coastguard Worker }
687*e1eccf28SAndroid Build Coastguard Worker 
populateScript(Script * s)688*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
689*e1eccf28SAndroid Build Coastguard Worker     s->mHal.info.exportedVariableCount = 2;
690*e1eccf28SAndroid Build Coastguard Worker }
691*e1eccf28SAndroid Build Coastguard Worker 
invokeFreeChildren()692*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
693*e1eccf28SAndroid Build Coastguard Worker     alloc.clear();
694*e1eccf28SAndroid Build Coastguard Worker }
695*e1eccf28SAndroid Build Coastguard Worker 
rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)696*e1eccf28SAndroid Build Coastguard Worker RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
697*e1eccf28SAndroid Build Coastguard Worker                                             const Script *s, const Element *e) {
698*e1eccf28SAndroid Build Coastguard Worker 
699*e1eccf28SAndroid Build Coastguard Worker     return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
700*e1eccf28SAndroid Build Coastguard Worker }
701*e1eccf28SAndroid Build Coastguard Worker 
702*e1eccf28SAndroid Build Coastguard Worker } // namespace renderscript
703*e1eccf28SAndroid Build Coastguard Worker } // namespace android
704