1*e1eccf28SAndroid Build Coastguard Worker /*
2*e1eccf28SAndroid Build Coastguard Worker * Copyright (C) 2012 The Android Open Source Project
3*e1eccf28SAndroid Build Coastguard Worker *
4*e1eccf28SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*e1eccf28SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*e1eccf28SAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*e1eccf28SAndroid Build Coastguard Worker *
8*e1eccf28SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0
9*e1eccf28SAndroid Build Coastguard Worker *
10*e1eccf28SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*e1eccf28SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*e1eccf28SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*e1eccf28SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*e1eccf28SAndroid Build Coastguard Worker * limitations under the License.
15*e1eccf28SAndroid Build Coastguard Worker */
16*e1eccf28SAndroid Build Coastguard Worker
17*e1eccf28SAndroid Build Coastguard Worker #include <cstdint>
18*e1eccf28SAndroid Build Coastguard Worker
19*e1eccf28SAndroid Build Coastguard Worker #include "RenderScriptToolkit.h"
20*e1eccf28SAndroid Build Coastguard Worker #include "TaskProcessor.h"
21*e1eccf28SAndroid Build Coastguard Worker #include "Utils.h"
22*e1eccf28SAndroid Build Coastguard Worker
23*e1eccf28SAndroid Build Coastguard Worker namespace android {
24*e1eccf28SAndroid Build Coastguard Worker namespace renderscript {
25*e1eccf28SAndroid Build Coastguard Worker
26*e1eccf28SAndroid Build Coastguard Worker #define LOG_TAG "renderscript.toolkit.Convolve5x5"
27*e1eccf28SAndroid Build Coastguard Worker
/**
 * SIMD 5x5 convolution kernel with C linkage. It is only declared here; no definition is
 * visible in this file.
 *
 * Convolve5x5Task::kernelU4 calls it with uchar4 line pointers that already point two cells
 * to the left of the first cell to compute, and with the fixed-point coefficient table.
 *
 * @param dst Where the results are written.
 * @param y0 Input line two above the current line.
 * @param y1 Input line one above the current line.
 * @param y2 The current input line.
 * @param y3 Input line one below the current line.
 * @param y4 Input line two below the current line.
 * @param coef Coefficients as int16_t values (the float coefficients scaled by 256).
 * @param count Number of kernel iterations. The caller advances its output by 4 cells per
 *              iteration on the x86 SSSE3 path and by 2 cells per iteration on the ARM path.
 */
extern "C" void rsdIntrinsicConvolve5x5_K(void* dst,
                                          const void* y0,
                                          const void* y1,
                                          const void* y2,
                                          const void* y3,
                                          const void* y4,
                                          const int16_t* coef,
                                          uint32_t count);
31*e1eccf28SAndroid Build Coastguard Worker
32*e1eccf28SAndroid Build Coastguard Worker class Convolve5x5Task : public Task {
33*e1eccf28SAndroid Build Coastguard Worker const void* mIn;
34*e1eccf28SAndroid Build Coastguard Worker void* mOut;
35*e1eccf28SAndroid Build Coastguard Worker // Even though we have exactly 25 coefficients, store them in an array of size 28 so that
36*e1eccf28SAndroid Build Coastguard Worker // the SIMD instructions can load them in three chunks of 8 and 1 of chunk of 4.
37*e1eccf28SAndroid Build Coastguard Worker float mFp[28];
38*e1eccf28SAndroid Build Coastguard Worker int16_t mIp[28];
39*e1eccf28SAndroid Build Coastguard Worker
40*e1eccf28SAndroid Build Coastguard Worker void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
41*e1eccf28SAndroid Build Coastguard Worker const uchar* py2, const uchar* py3, const uchar* py4);
42*e1eccf28SAndroid Build Coastguard Worker void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
43*e1eccf28SAndroid Build Coastguard Worker size_t startX, size_t startY, size_t endX, size_t endY);
44*e1eccf28SAndroid Build Coastguard Worker
45*e1eccf28SAndroid Build Coastguard Worker // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
46*e1eccf28SAndroid Build Coastguard Worker virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
47*e1eccf28SAndroid Build Coastguard Worker size_t endY) override;
48*e1eccf28SAndroid Build Coastguard Worker
49*e1eccf28SAndroid Build Coastguard Worker public:
Convolve5x5Task(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)50*e1eccf28SAndroid Build Coastguard Worker Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
51*e1eccf28SAndroid Build Coastguard Worker const float* coefficients, const Restriction* restriction)
52*e1eccf28SAndroid Build Coastguard Worker : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
53*e1eccf28SAndroid Build Coastguard Worker for (int ct = 0; ct < 25; ct++) {
54*e1eccf28SAndroid Build Coastguard Worker mFp[ct] = coefficients[ct];
55*e1eccf28SAndroid Build Coastguard Worker if (mFp[ct] >= 0) {
56*e1eccf28SAndroid Build Coastguard Worker mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
57*e1eccf28SAndroid Build Coastguard Worker } else {
58*e1eccf28SAndroid Build Coastguard Worker mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
59*e1eccf28SAndroid Build Coastguard Worker }
60*e1eccf28SAndroid Build Coastguard Worker }
61*e1eccf28SAndroid Build Coastguard Worker }
62*e1eccf28SAndroid Build Coastguard Worker };
63*e1eccf28SAndroid Build Coastguard Worker
64*e1eccf28SAndroid Build Coastguard Worker template <typename InputOutputType, typename ComputationType>
ConvolveOneU(uint32_t x,InputOutputType * out,const InputOutputType * py0,const InputOutputType * py1,const InputOutputType * py2,const InputOutputType * py3,const InputOutputType * py4,const float * coeff,int32_t width)65*e1eccf28SAndroid Build Coastguard Worker static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
66*e1eccf28SAndroid Build Coastguard Worker const InputOutputType* py1, const InputOutputType* py2,
67*e1eccf28SAndroid Build Coastguard Worker const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
68*e1eccf28SAndroid Build Coastguard Worker int32_t width) {
69*e1eccf28SAndroid Build Coastguard Worker uint32_t x0 = std::max((int32_t)x - 2, 0);
70*e1eccf28SAndroid Build Coastguard Worker uint32_t x1 = std::max((int32_t)x - 1, 0);
71*e1eccf28SAndroid Build Coastguard Worker uint32_t x2 = x;
72*e1eccf28SAndroid Build Coastguard Worker uint32_t x3 = std::min((int32_t)x + 1, width - 1);
73*e1eccf28SAndroid Build Coastguard Worker uint32_t x4 = std::min((int32_t)x + 2, width - 1);
74*e1eccf28SAndroid Build Coastguard Worker
75*e1eccf28SAndroid Build Coastguard Worker ComputationType px = convert<ComputationType>(py0[x0]) * coeff[0] +
76*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py0[x1]) * coeff[1] +
77*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py0[x2]) * coeff[2] +
78*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py0[x3]) * coeff[3] +
79*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py0[x4]) * coeff[4] +
80*e1eccf28SAndroid Build Coastguard Worker
81*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py1[x0]) * coeff[5] +
82*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py1[x1]) * coeff[6] +
83*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py1[x2]) * coeff[7] +
84*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py1[x3]) * coeff[8] +
85*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py1[x4]) * coeff[9] +
86*e1eccf28SAndroid Build Coastguard Worker
87*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py2[x0]) * coeff[10] +
88*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py2[x1]) * coeff[11] +
89*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py2[x2]) * coeff[12] +
90*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py2[x3]) * coeff[13] +
91*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py2[x4]) * coeff[14] +
92*e1eccf28SAndroid Build Coastguard Worker
93*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py3[x0]) * coeff[15] +
94*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py3[x1]) * coeff[16] +
95*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py3[x2]) * coeff[17] +
96*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py3[x3]) * coeff[18] +
97*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py3[x4]) * coeff[19] +
98*e1eccf28SAndroid Build Coastguard Worker
99*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py4[x0]) * coeff[20] +
100*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py4[x1]) * coeff[21] +
101*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py4[x2]) * coeff[22] +
102*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py4[x3]) * coeff[23] +
103*e1eccf28SAndroid Build Coastguard Worker convert<ComputationType>(py4[x4]) * coeff[24];
104*e1eccf28SAndroid Build Coastguard Worker px = clamp(px + 0.5f, 0.f, 255.f);
105*e1eccf28SAndroid Build Coastguard Worker *out = convert<InputOutputType>(px);
106*e1eccf28SAndroid Build Coastguard Worker }
107*e1eccf28SAndroid Build Coastguard Worker
108*e1eccf28SAndroid Build Coastguard Worker #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Computes one output cell of the 5x5 convolution for float-based cells.
 *
 * The columns to the left and right of x are clamped to [0, width - 1]. The 25 weighted
 * taps are summed in row-major order and stored without rounding or clamping.
 *
 * @tparam InputOutputType Type of the cells read and written.
 * @param x Column of the cell to compute.
 * @param out Where to store the result.
 * @param py0 Start of the line two above.
 * @param py1 Start of the line one above.
 * @param py2 Start of the current line.
 * @param py3 Start of the line one below.
 * @param py4 Start of the line two below.
 * @param coeff The 25 coefficients, in row-major order.
 * @param width Number of cells in a line.
 */
template <typename InputOutputType>
static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
                         const InputOutputType* py1, const InputOutputType* py2,
                         const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
                         int32_t width) {
    const int32_t center = (int32_t)x;
    const int32_t lastColumn = width - 1;
    // Columns of the five taps of a row, clamped to the line.
    const uint32_t cols[5] = {
            (uint32_t)std::max(center - 2, 0),
            (uint32_t)std::max(center - 1, 0),
            x,
            (uint32_t)std::min(center + 1, lastColumn),
            (uint32_t)std::min(center + 2, lastColumn),
    };
    const InputOutputType* const rows[5] = {py0, py1, py2, py3, py4};

    // Weighted contribution of the tap at (row, col) of the 5x5 window.
    const auto tap = [&](int row, int col) -> InputOutputType {
        return rows[row][cols[col]] * coeff[row * 5 + col];
    };

    // Accumulate strictly left to right, row after row, so the floating point result does
    // not depend on how the sum is grouped.
    InputOutputType sum = tap(0, 0) + tap(0, 1) + tap(0, 2) + tap(0, 3) + tap(0, 4);
    for (int row = 1; row < 5; row++) {
        sum = sum + tap(row, 0) + tap(row, 1) + tap(row, 2) + tap(row, 3) + tap(row, 4);
    }
    *out = sum;
}
136*e1eccf28SAndroid Build Coastguard Worker #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
137*e1eccf28SAndroid Build Coastguard Worker
138*e1eccf28SAndroid Build Coastguard Worker /**
139*e1eccf28SAndroid Build Coastguard Worker * This function convolves one line.
140*e1eccf28SAndroid Build Coastguard Worker *
141*e1eccf28SAndroid Build Coastguard Worker * @param pout Where to place the next output.
142*e1eccf28SAndroid Build Coastguard Worker * @param xstart Index in the X direction of where to start.
143*e1eccf28SAndroid Build Coastguard Worker * @param xend End index
144*e1eccf28SAndroid Build Coastguard Worker * @param ppy0 Points to the start of the line two above.
145*e1eccf28SAndroid Build Coastguard Worker * @param ppy1 Points to the start of the line one above.
146*e1eccf28SAndroid Build Coastguard Worker * @param ppy2 Points to the start of the current line.
147*e1eccf28SAndroid Build Coastguard Worker * @param ppy3 Points to the start of the line one below.
148*e1eccf28SAndroid Build Coastguard Worker * @param ppy4 Points to the start of the line two below.
149*e1eccf28SAndroid Build Coastguard Worker */
kernelU4(uchar * pout,uint32_t x1,uint32_t x2,const uchar * ppy0,const uchar * ppy1,const uchar * ppy2,const uchar * ppy3,const uchar * ppy4)150*e1eccf28SAndroid Build Coastguard Worker void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0,
151*e1eccf28SAndroid Build Coastguard Worker const uchar* ppy1, const uchar* ppy2, const uchar* ppy3,
152*e1eccf28SAndroid Build Coastguard Worker const uchar* ppy4) {
153*e1eccf28SAndroid Build Coastguard Worker uchar4* out = (uchar4*)pout;
154*e1eccf28SAndroid Build Coastguard Worker const uchar4* py0 = (const uchar4*)ppy0;
155*e1eccf28SAndroid Build Coastguard Worker const uchar4* py1 = (const uchar4*)ppy1;
156*e1eccf28SAndroid Build Coastguard Worker const uchar4* py2 = (const uchar4*)ppy2;
157*e1eccf28SAndroid Build Coastguard Worker const uchar4* py3 = (const uchar4*)ppy3;
158*e1eccf28SAndroid Build Coastguard Worker const uchar4* py4 = (const uchar4*)ppy4;
159*e1eccf28SAndroid Build Coastguard Worker
160*e1eccf28SAndroid Build Coastguard Worker while ((x1 < x2) && (x1 < 2)) {
161*e1eccf28SAndroid Build Coastguard Worker ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
162*e1eccf28SAndroid Build Coastguard Worker out++;
163*e1eccf28SAndroid Build Coastguard Worker x1++;
164*e1eccf28SAndroid Build Coastguard Worker }
165*e1eccf28SAndroid Build Coastguard Worker #if defined(ARCH_X86_HAVE_SSSE3)
166*e1eccf28SAndroid Build Coastguard Worker // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
167*e1eccf28SAndroid Build Coastguard Worker // 3 for end boundary where x may hit the end boundary)
168*e1eccf28SAndroid Build Coastguard Worker if (mUsesSimd && ((x1 + 6) < x2)) {
169*e1eccf28SAndroid Build Coastguard Worker // subtract 3 for end boundary
170*e1eccf28SAndroid Build Coastguard Worker uint32_t len = (x2 - x1 - 3) >> 2;
171*e1eccf28SAndroid Build Coastguard Worker rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
172*e1eccf28SAndroid Build Coastguard Worker py4 + x1 - 2, mIp, len);
173*e1eccf28SAndroid Build Coastguard Worker out += len << 2;
174*e1eccf28SAndroid Build Coastguard Worker x1 += len << 2;
175*e1eccf28SAndroid Build Coastguard Worker }
176*e1eccf28SAndroid Build Coastguard Worker #endif
177*e1eccf28SAndroid Build Coastguard Worker
178*e1eccf28SAndroid Build Coastguard Worker #if defined(ARCH_ARM_USE_INTRINSICS)
179*e1eccf28SAndroid Build Coastguard Worker if (mUsesSimd && ((x1 + 3) < x2)) {
180*e1eccf28SAndroid Build Coastguard Worker uint32_t len = (x2 - x1 - 3) >> 1;
181*e1eccf28SAndroid Build Coastguard Worker rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
182*e1eccf28SAndroid Build Coastguard Worker py4 + x1 - 2, mIp, len);
183*e1eccf28SAndroid Build Coastguard Worker out += len << 1;
184*e1eccf28SAndroid Build Coastguard Worker x1 += len << 1;
185*e1eccf28SAndroid Build Coastguard Worker }
186*e1eccf28SAndroid Build Coastguard Worker #endif
187*e1eccf28SAndroid Build Coastguard Worker
188*e1eccf28SAndroid Build Coastguard Worker while (x1 < x2) {
189*e1eccf28SAndroid Build Coastguard Worker ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
190*e1eccf28SAndroid Build Coastguard Worker out++;
191*e1eccf28SAndroid Build Coastguard Worker x1++;
192*e1eccf28SAndroid Build Coastguard Worker }
193*e1eccf28SAndroid Build Coastguard Worker }
194*e1eccf28SAndroid Build Coastguard Worker
195*e1eccf28SAndroid Build Coastguard Worker #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
196*e1eccf28SAndroid Build Coastguard Worker // This will need more cleanup before it can be used.
kernelF4(const ConvolveInfo * info,float4 * out,uint32_t xstart,uint32_t xend,uint32_t currentY)197*e1eccf28SAndroid Build Coastguard Worker void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out,
198*e1eccf28SAndroid Build Coastguard Worker uint32_t xstart, uint32_t xend, uint32_t currentY) {
199*e1eccf28SAndroid Build Coastguard Worker const uchar* pin = (const uchar*)info->in;
200*e1eccf28SAndroid Build Coastguard Worker const size_t stride = info->stride;
201*e1eccf28SAndroid Build Coastguard Worker
202*e1eccf28SAndroid Build Coastguard Worker uint32_t y0 = std::max((int32_t)currentY - 2, 0);
203*e1eccf28SAndroid Build Coastguard Worker uint32_t y1 = std::max((int32_t)currentY - 1, 0);
204*e1eccf28SAndroid Build Coastguard Worker uint32_t y2 = currentY;
205*e1eccf28SAndroid Build Coastguard Worker uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
206*e1eccf28SAndroid Build Coastguard Worker uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
207*e1eccf28SAndroid Build Coastguard Worker
208*e1eccf28SAndroid Build Coastguard Worker const float4* py0 = (const float4*)(pin + stride * y0);
209*e1eccf28SAndroid Build Coastguard Worker const float4* py1 = (const float4*)(pin + stride * y1);
210*e1eccf28SAndroid Build Coastguard Worker const float4* py2 = (const float4*)(pin + stride * y2);
211*e1eccf28SAndroid Build Coastguard Worker const float4* py3 = (const float4*)(pin + stride * y3);
212*e1eccf28SAndroid Build Coastguard Worker const float4* py4 = (const float4*)(pin + stride * y4);
213*e1eccf28SAndroid Build Coastguard Worker
214*e1eccf28SAndroid Build Coastguard Worker for (uint32_t x = xstart; x < xend; x++, out++) {
215*e1eccf28SAndroid Build Coastguard Worker ConvolveOneF<float4>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
216*e1eccf28SAndroid Build Coastguard Worker }
217*e1eccf28SAndroid Build Coastguard Worker }
218*e1eccf28SAndroid Build Coastguard Worker
RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo * info,float2 * out,uint32_t xstart,uint32_t xend,uint32_t currentY)219*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out,
220*e1eccf28SAndroid Build Coastguard Worker uint32_t xstart, uint32_t xend, uint32_t currentY) {
221*e1eccf28SAndroid Build Coastguard Worker const uchar* pin = (const uchar*)info->in;
222*e1eccf28SAndroid Build Coastguard Worker const size_t stride = info->stride;
223*e1eccf28SAndroid Build Coastguard Worker
224*e1eccf28SAndroid Build Coastguard Worker uint32_t y0 = std::max((int32_t)currentY - 2, 0);
225*e1eccf28SAndroid Build Coastguard Worker uint32_t y1 = std::max((int32_t)currentY - 1, 0);
226*e1eccf28SAndroid Build Coastguard Worker uint32_t y2 = currentY;
227*e1eccf28SAndroid Build Coastguard Worker uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
228*e1eccf28SAndroid Build Coastguard Worker uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
229*e1eccf28SAndroid Build Coastguard Worker
230*e1eccf28SAndroid Build Coastguard Worker const float2* py0 = (const float2*)(pin + stride * y0);
231*e1eccf28SAndroid Build Coastguard Worker const float2* py1 = (const float2*)(pin + stride * y1);
232*e1eccf28SAndroid Build Coastguard Worker const float2* py2 = (const float2*)(pin + stride * y2);
233*e1eccf28SAndroid Build Coastguard Worker const float2* py3 = (const float2*)(pin + stride * y3);
234*e1eccf28SAndroid Build Coastguard Worker const float2* py4 = (const float2*)(pin + stride * y4);
235*e1eccf28SAndroid Build Coastguard Worker
236*e1eccf28SAndroid Build Coastguard Worker for (uint32_t x = xstart; x < xend; x++, out++) {
237*e1eccf28SAndroid Build Coastguard Worker ConvolveOneF<float2>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
238*e1eccf28SAndroid Build Coastguard Worker }
239*e1eccf28SAndroid Build Coastguard Worker }
240*e1eccf28SAndroid Build Coastguard Worker
RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo * info,float * out,uint32_t xstart,uint32_t xend,uint32_t currentY)241*e1eccf28SAndroid Build Coastguard Worker void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out,
242*e1eccf28SAndroid Build Coastguard Worker uint32_t xstart, uint32_t xend, uint32_t currentY) {
243*e1eccf28SAndroid Build Coastguard Worker const uchar* pin = (const uchar*)info->in;
244*e1eccf28SAndroid Build Coastguard Worker const size_t stride = info->stride;
245*e1eccf28SAndroid Build Coastguard Worker
246*e1eccf28SAndroid Build Coastguard Worker uint32_t y0 = std::max((int32_t)currentY - 2, 0);
247*e1eccf28SAndroid Build Coastguard Worker uint32_t y1 = std::max((int32_t)currentY - 1, 0);
248*e1eccf28SAndroid Build Coastguard Worker uint32_t y2 = currentY;
249*e1eccf28SAndroid Build Coastguard Worker uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
250*e1eccf28SAndroid Build Coastguard Worker uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
251*e1eccf28SAndroid Build Coastguard Worker
252*e1eccf28SAndroid Build Coastguard Worker const float* py0 = (const float*)(pin + stride * y0);
253*e1eccf28SAndroid Build Coastguard Worker const float* py1 = (const float*)(pin + stride * y1);
254*e1eccf28SAndroid Build Coastguard Worker const float* py2 = (const float*)(pin + stride * y2);
255*e1eccf28SAndroid Build Coastguard Worker const float* py3 = (const float*)(pin + stride * y3);
256*e1eccf28SAndroid Build Coastguard Worker const float* py4 = (const float*)(pin + stride * y4);
257*e1eccf28SAndroid Build Coastguard Worker
258*e1eccf28SAndroid Build Coastguard Worker for (uint32_t x = xstart; x < xend; x++, out++) {
259*e1eccf28SAndroid Build Coastguard Worker ConvolveOneF<float>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
260*e1eccf28SAndroid Build Coastguard Worker }
261*e1eccf28SAndroid Build Coastguard Worker }
262*e1eccf28SAndroid Build Coastguard Worker #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
263*e1eccf28SAndroid Build Coastguard Worker
264*e1eccf28SAndroid Build Coastguard Worker template <typename InputOutputType, typename ComputationType>
convolveU(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY,float * mFp)265*e1eccf28SAndroid Build Coastguard Worker static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
266*e1eccf28SAndroid Build Coastguard Worker size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) {
267*e1eccf28SAndroid Build Coastguard Worker const size_t stride = vectorSize * sizeX;
268*e1eccf28SAndroid Build Coastguard Worker for (size_t y = startY; y < endY; y++) {
269*e1eccf28SAndroid Build Coastguard Worker uint32_t y0 = std::max((int32_t)y - 2, 0);
270*e1eccf28SAndroid Build Coastguard Worker uint32_t y1 = std::max((int32_t)y - 1, 0);
271*e1eccf28SAndroid Build Coastguard Worker uint32_t y2 = y;
272*e1eccf28SAndroid Build Coastguard Worker uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
273*e1eccf28SAndroid Build Coastguard Worker uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
274*e1eccf28SAndroid Build Coastguard Worker
275*e1eccf28SAndroid Build Coastguard Worker size_t offset = (y * sizeX + startX) * vectorSize;
276*e1eccf28SAndroid Build Coastguard Worker InputOutputType* px = (InputOutputType*)(pout + offset);
277*e1eccf28SAndroid Build Coastguard Worker InputOutputType* py0 = (InputOutputType*)(pin + stride * y0);
278*e1eccf28SAndroid Build Coastguard Worker InputOutputType* py1 = (InputOutputType*)(pin + stride * y1);
279*e1eccf28SAndroid Build Coastguard Worker InputOutputType* py2 = (InputOutputType*)(pin + stride * y2);
280*e1eccf28SAndroid Build Coastguard Worker InputOutputType* py3 = (InputOutputType*)(pin + stride * y3);
281*e1eccf28SAndroid Build Coastguard Worker InputOutputType* py4 = (InputOutputType*)(pin + stride * y4);
282*e1eccf28SAndroid Build Coastguard Worker for (uint32_t x = startX; x < endX; x++, px++) {
283*e1eccf28SAndroid Build Coastguard Worker ConvolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, py3, py4, mFp,
284*e1eccf28SAndroid Build Coastguard Worker sizeX);
285*e1eccf28SAndroid Build Coastguard Worker }
286*e1eccf28SAndroid Build Coastguard Worker }
287*e1eccf28SAndroid Build Coastguard Worker }
288*e1eccf28SAndroid Build Coastguard Worker
convolveU4(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY)289*e1eccf28SAndroid Build Coastguard Worker void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
290*e1eccf28SAndroid Build Coastguard Worker size_t sizeY, size_t startX, size_t startY, size_t endX,
291*e1eccf28SAndroid Build Coastguard Worker size_t endY) {
292*e1eccf28SAndroid Build Coastguard Worker const size_t stride = paddedSize(vectorSize) * sizeX;
293*e1eccf28SAndroid Build Coastguard Worker for (size_t y = startY; y < endY; y++) {
294*e1eccf28SAndroid Build Coastguard Worker uint32_t y0 = std::max((int32_t)y - 2, 0);
295*e1eccf28SAndroid Build Coastguard Worker uint32_t y1 = std::max((int32_t)y - 1, 0);
296*e1eccf28SAndroid Build Coastguard Worker uint32_t y2 = y;
297*e1eccf28SAndroid Build Coastguard Worker uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
298*e1eccf28SAndroid Build Coastguard Worker uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
299*e1eccf28SAndroid Build Coastguard Worker
300*e1eccf28SAndroid Build Coastguard Worker size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
301*e1eccf28SAndroid Build Coastguard Worker uchar* px = pout + offset;
302*e1eccf28SAndroid Build Coastguard Worker const uchar* py0 = pin + stride * y0;
303*e1eccf28SAndroid Build Coastguard Worker const uchar* py1 = pin + stride * y1;
304*e1eccf28SAndroid Build Coastguard Worker const uchar* py2 = pin + stride * y2;
305*e1eccf28SAndroid Build Coastguard Worker const uchar* py3 = pin + stride * y3;
306*e1eccf28SAndroid Build Coastguard Worker const uchar* py4 = pin + stride * y4;
307*e1eccf28SAndroid Build Coastguard Worker kernelU4(px, startX, endX, py0, py1, py2, py3, py4);
308*e1eccf28SAndroid Build Coastguard Worker }
309*e1eccf28SAndroid Build Coastguard Worker }
310*e1eccf28SAndroid Build Coastguard Worker
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)311*e1eccf28SAndroid Build Coastguard Worker void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
312*e1eccf28SAndroid Build Coastguard Worker size_t endY) {
313*e1eccf28SAndroid Build Coastguard Worker // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
314*e1eccf28SAndroid Build Coastguard Worker // endX, endY);
315*e1eccf28SAndroid Build Coastguard Worker switch (mVectorSize) {
316*e1eccf28SAndroid Build Coastguard Worker case 1:
317*e1eccf28SAndroid Build Coastguard Worker convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
318*e1eccf28SAndroid Build Coastguard Worker startX, startY, endX, endY, mFp);
319*e1eccf28SAndroid Build Coastguard Worker break;
320*e1eccf28SAndroid Build Coastguard Worker case 2:
321*e1eccf28SAndroid Build Coastguard Worker convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
322*e1eccf28SAndroid Build Coastguard Worker startX, startY, endX, endY, mFp);
323*e1eccf28SAndroid Build Coastguard Worker break;
324*e1eccf28SAndroid Build Coastguard Worker case 3:
325*e1eccf28SAndroid Build Coastguard Worker case 4:
326*e1eccf28SAndroid Build Coastguard Worker convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
327*e1eccf28SAndroid Build Coastguard Worker endX, endY);
328*e1eccf28SAndroid Build Coastguard Worker break;
329*e1eccf28SAndroid Build Coastguard Worker }
330*e1eccf28SAndroid Build Coastguard Worker }
331*e1eccf28SAndroid Build Coastguard Worker
convolve5x5(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)332*e1eccf28SAndroid Build Coastguard Worker void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX,
333*e1eccf28SAndroid Build Coastguard Worker size_t sizeY, const float* coefficients,
334*e1eccf28SAndroid Build Coastguard Worker const Restriction* restriction) {
335*e1eccf28SAndroid Build Coastguard Worker #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
336*e1eccf28SAndroid Build Coastguard Worker if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
337*e1eccf28SAndroid Build Coastguard Worker return;
338*e1eccf28SAndroid Build Coastguard Worker }
339*e1eccf28SAndroid Build Coastguard Worker if (vectorSize < 1 || vectorSize > 4) {
340*e1eccf28SAndroid Build Coastguard Worker ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
341*e1eccf28SAndroid Build Coastguard Worker return;
342*e1eccf28SAndroid Build Coastguard Worker }
343*e1eccf28SAndroid Build Coastguard Worker #endif
344*e1eccf28SAndroid Build Coastguard Worker
345*e1eccf28SAndroid Build Coastguard Worker Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
346*e1eccf28SAndroid Build Coastguard Worker processor->doTask(&task);
347*e1eccf28SAndroid Build Coastguard Worker }
348*e1eccf28SAndroid Build Coastguard Worker
349*e1eccf28SAndroid Build Coastguard Worker } // namespace renderscript
350*e1eccf28SAndroid Build Coastguard Worker } // namespace android
351