1*32afb93cSXin Li /*
2*32afb93cSXin Li  * Copyright (C) 2012 The Android Open Source Project
3*32afb93cSXin Li  *
4*32afb93cSXin Li  * Licensed under the Apache License, Version 2.0 (the "License");
5*32afb93cSXin Li  * you may not use this file except in compliance with the License.
6*32afb93cSXin Li  * You may obtain a copy of the License at
7*32afb93cSXin Li  *
8*32afb93cSXin Li  *      http://www.apache.org/licenses/LICENSE-2.0
9*32afb93cSXin Li  *
10*32afb93cSXin Li  * Unless required by applicable law or agreed to in writing, software
11*32afb93cSXin Li  * distributed under the License is distributed on an "AS IS" BASIS,
12*32afb93cSXin Li  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*32afb93cSXin Li  * See the License for the specific language governing permissions and
14*32afb93cSXin Li  * limitations under the License.
15*32afb93cSXin Li  */
16*32afb93cSXin Li 
17*32afb93cSXin Li #include <cstdint>
18*32afb93cSXin Li 
19*32afb93cSXin Li #include "RenderScriptToolkit.h"
20*32afb93cSXin Li #include "TaskProcessor.h"
21*32afb93cSXin Li #include "Utils.h"
22*32afb93cSXin Li 
23*32afb93cSXin Li #define LOG_TAG "renderscript.toolkit.Convolve3x3"
24*32afb93cSXin Li 
25*32afb93cSXin Li namespace renderscript {
26*32afb93cSXin Li 
// SIMD 3x3 convolution routine with C linkage; its definition is not part of this file.
// Within this file it is only called from Convolve3x3Task::kernelU4, and only when
// ARCH_ARM_USE_INTRINSICS or ARCH_X86_HAVE_SSSE3 is defined and mUsesSimd is set. That call
// passes the destination, three row pointers positioned one pixel to the left of the first
// output pixel, the int16_t coefficient table (mIp) and a count.
// NOTE(review): the caller advances by 2 * count pixels after the call, so one count unit
// presumably covers two output pixels -- confirm against the kernel implementation.
27*32afb93cSXin Li extern "C" void rsdIntrinsicConvolve3x3_K(void* dst, const void* y0, const void* y1, const void* y2,
28*32afb93cSXin Li                                           const int16_t* coef, uint32_t count);
29*32afb93cSXin Li 
30*32afb93cSXin Li class Convolve3x3Task : public Task {
31*32afb93cSXin Li     const void* mIn;
32*32afb93cSXin Li     void* mOut;
33*32afb93cSXin Li     // Even though we have exactly 9 coefficients, store them in an array of size 16 so that
34*32afb93cSXin Li     // the SIMD instructions can load them in chunks multiple of 8.
35*32afb93cSXin Li     float mFp[16];
36*32afb93cSXin Li     int16_t mIp[16];
37*32afb93cSXin Li 
38*32afb93cSXin Li     void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
39*32afb93cSXin Li                   const uchar* py2);
40*32afb93cSXin Li     void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
41*32afb93cSXin Li                     size_t startX, size_t startY, size_t endX, size_t endY);
42*32afb93cSXin Li 
43*32afb93cSXin Li     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
44*32afb93cSXin Li     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
45*32afb93cSXin Li                      size_t endY) override;
46*32afb93cSXin Li 
47*32afb93cSXin Li    public:
Convolve3x3Task(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)48*32afb93cSXin Li     Convolve3x3Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
49*32afb93cSXin Li                     const float* coefficients, const Restriction* restriction)
50*32afb93cSXin Li         : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
51*32afb93cSXin Li         for (int ct = 0; ct < 9; ct++) {
52*32afb93cSXin Li             mFp[ct] = coefficients[ct];
53*32afb93cSXin Li             if (mFp[ct] >= 0) {
54*32afb93cSXin Li                 mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
55*32afb93cSXin Li             } else {
56*32afb93cSXin Li                 mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
57*32afb93cSXin Li             }
58*32afb93cSXin Li         }
59*32afb93cSXin Li     }
60*32afb93cSXin Li };
61*32afb93cSXin Li 
62*32afb93cSXin Li /**
63*32afb93cSXin Li  * Computes one convolution and stores the result in the output. This is used for uchar, uchar2,
64*32afb93cSXin Li  * uchar3, and uchar4 vectors.
65*32afb93cSXin Li  *
66*32afb93cSXin Li  * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. uchar4.
67*32afb93cSXin Li  * @tparam ComputationType Type we use for the intermediate computations.
68*32afb93cSXin Li  * @param x The index in the row of the value we'll convolve.
69*32afb93cSXin Li  * @param out The location in the output array where we store the value.
70*32afb93cSXin Li  * @param py0 The start of the top row.
71*32afb93cSXin Li  * @param py1 The start of the middle row.
72*32afb93cSXin Li  * @param py2 The start of the bottom row.
73*32afb93cSXin Li  * @param coeff Pointer to the float coefficients, in row major format.
74*32afb93cSXin Li  * @param sizeX The number of cells of one row.
75*32afb93cSXin Li  */
76*32afb93cSXin Li template <typename InputOutputType, typename ComputationType>
convolveOneU(uint32_t x,InputOutputType * out,const InputOutputType * py0,const InputOutputType * py1,const InputOutputType * py2,const float * coeff,int32_t sizeX)77*32afb93cSXin Li static void convolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
78*32afb93cSXin Li                          const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
79*32afb93cSXin Li                          int32_t sizeX) {
80*32afb93cSXin Li     uint32_t x1 = std::max((int32_t)x - 1, 0);
81*32afb93cSXin Li     uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1);
82*32afb93cSXin Li 
83*32afb93cSXin Li     ComputationType px = convert<ComputationType>(py0[x1]) * coeff[0] +
84*32afb93cSXin Li                          convert<ComputationType>(py0[x]) * coeff[1] +
85*32afb93cSXin Li                          convert<ComputationType>(py0[x2]) * coeff[2] +
86*32afb93cSXin Li                          convert<ComputationType>(py1[x1]) * coeff[3] +
87*32afb93cSXin Li                          convert<ComputationType>(py1[x]) * coeff[4] +
88*32afb93cSXin Li                          convert<ComputationType>(py1[x2]) * coeff[5] +
89*32afb93cSXin Li                          convert<ComputationType>(py2[x1]) * coeff[6] +
90*32afb93cSXin Li                          convert<ComputationType>(py2[x]) * coeff[7] +
91*32afb93cSXin Li                          convert<ComputationType>(py2[x2]) * coeff[8];
92*32afb93cSXin Li 
93*32afb93cSXin Li     px = clamp(px + 0.5f, 0.f, 255.f);
94*32afb93cSXin Li     *out = convert<InputOutputType>(px);
95*32afb93cSXin Li }
96*32afb93cSXin Li 
97*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Convolves a single cell of a row and writes the result to the output. Used for the float,
 * float2, float3, and float4 vector types.
 *
 * Columns to the left of the first cell and to the right of the last cell are replaced by the
 * nearest valid column. The weighted sum is stored as is.
 *
 * @tparam InputOutputType Element type of the input and output arrays, e.g. float4.
 * @param x Column index of the cell to convolve.
 * @param out Where the resulting value is stored.
 * @param py0 The start of the top row.
 * @param py1 The start of the middle row.
 * @param py2 The start of the bottom row.
 * @param coeff The nine float coefficients, in row major format.
 * @param sizeX The number of cells of one row.
 */
template <typename InputOutputType>
static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
                         const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
                         int32_t sizeX) {
    // Neighbouring columns, clamped so that they stay inside the row.
    const uint32_t left = std::max(static_cast<int32_t>(x) - 1, 0);
    const uint32_t right = std::min(static_cast<int32_t>(x) + 1, sizeX - 1);

    // Kept as one left-to-right sum so that the floating point evaluation order is fixed.
    *out = py0[left] * coeff[0] + py0[x] * coeff[1] + py0[right] * coeff[2] +
           py1[left] * coeff[3] + py1[x] * coeff[4] + py1[right] * coeff[5] +
           py2[left] * coeff[6] + py2[x] * coeff[7] + py2[right] * coeff[8];
}
121*32afb93cSXin Li #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
122*32afb93cSXin Li 
123*32afb93cSXin Li /**
124*32afb93cSXin Li  * This function convolves one line.
125*32afb93cSXin Li  *
126*32afb93cSXin Li  * @param pout Where to place the next output.
127*32afb93cSXin Li  * @param xstart Index in the X direction of where to start.
128*32afb93cSXin Li  * @param xend End index
129*32afb93cSXin Li  * @param ppy0 Points to the start of the previous line.
130*32afb93cSXin Li  * @param ppy1 Points to the start of the current line.
131*32afb93cSXin Li  * @param ppy2 Points to the start of the next line.
132*32afb93cSXin Li  */
kernelU4(uchar * pout,uint32_t xstart,uint32_t xend,const uchar * ppy0,const uchar * ppy1,const uchar * ppy2)133*32afb93cSXin Li void Convolve3x3Task::kernelU4(uchar* pout, uint32_t xstart, uint32_t xend, const uchar* ppy0,
134*32afb93cSXin Li                                const uchar* ppy1, const uchar* ppy2) {
135*32afb93cSXin Li     uchar4* out = (uchar4*)pout;
136*32afb93cSXin Li     const uchar4* py0 = (const uchar4*)ppy0;
137*32afb93cSXin Li     const uchar4* py1 = (const uchar4*)ppy1;
138*32afb93cSXin Li     const uchar4* py2 = (const uchar4*)ppy2;
139*32afb93cSXin Li 
140*32afb93cSXin Li     uint32_t x1 = xstart;
141*32afb93cSXin Li     uint32_t x2 = xend;
142*32afb93cSXin Li     if (x1 == 0) {
143*32afb93cSXin Li         convolveOneU<uchar4, float4>(0, out, py0, py1, py2, mFp, mSizeX);
144*32afb93cSXin Li         x1++;
145*32afb93cSXin Li         out++;
146*32afb93cSXin Li     }
147*32afb93cSXin Li 
148*32afb93cSXin Li     if (x2 > x1) {
149*32afb93cSXin Li #if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
150*32afb93cSXin Li         if (mUsesSimd) {
151*32afb93cSXin Li             int32_t len = (x2 - x1 - 1) >> 1;
152*32afb93cSXin Li             if (len > 0) {
153*32afb93cSXin Li                 rsdIntrinsicConvolve3x3_K(out, &py0[x1 - 1], &py1[x1 - 1], &py2[x1 - 1], mIp, len);
154*32afb93cSXin Li                 x1 += len << 1;
155*32afb93cSXin Li                 out += len << 1;
156*32afb93cSXin Li             }
157*32afb93cSXin Li         }
158*32afb93cSXin Li #endif
159*32afb93cSXin Li 
160*32afb93cSXin Li         while (x1 != x2) {
161*32afb93cSXin Li             convolveOneU<uchar4, float4>(x1, out, py0, py1, py2, mFp, mSizeX);
162*32afb93cSXin Li             out++;
163*32afb93cSXin Li             x1++;
164*32afb93cSXin Li         }
165*32afb93cSXin Li     }
166*32afb93cSXin Li }
167*32afb93cSXin Li 
168*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
169*32afb93cSXin Li template <typename T>
RsdCpuScriptIntrinsicConvolve3x3_kernelF(void * in,T * out,uint32_t xstart,uint32_t xend,uint32_t currentY,size_t sizeX,size_t sizeY,size_t vectorSize,float * fp)170*32afb93cSXin Li void RsdCpuScriptIntrinsicConvolve3x3_kernelF(void* in, T* out, uint32_t xstart, uint32_t xend,
171*32afb93cSXin Li                                               uint32_t currentY, size_t sizeX, size_t sizeY,
172*32afb93cSXin Li                                               size_t vectorSize, float* fp) {
173*32afb93cSXin Li     const uchar* pin = (const uchar*)in;
174*32afb93cSXin Li     const size_t stride = sizeX * vectorSize * 4;  // float takes 4 bytes
175*32afb93cSXin Li 
176*32afb93cSXin Li     uint32_t y1 = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1));
177*32afb93cSXin Li     uint32_t y2 = std::max((int32_t)currentY - 1, 0);
178*32afb93cSXin Li     const T* py0 = (const T*)(pin + stride * y2);
179*32afb93cSXin Li     const T* py1 = (const T*)(pin + stride * currentY);
180*32afb93cSXin Li     const T* py2 = (const T*)(pin + stride * y1);
181*32afb93cSXin Li 
182*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++, out++) {
183*32afb93cSXin Li         ConvolveOneF<T>(x, out, py0, py1, py2, fp, sizeX);
184*32afb93cSXin Li     }
185*32afb93cSXin Li }
186*32afb93cSXin Li #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
187*32afb93cSXin Li 
188*32afb93cSXin Li template <typename InputOutputType, typename ComputationType>
convolveU(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY,float * fp)189*32afb93cSXin Li static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
190*32afb93cSXin Li                       size_t startX, size_t startY, size_t endX, size_t endY, float* fp) {
191*32afb93cSXin Li     const size_t stride = vectorSize * sizeX;
192*32afb93cSXin Li     for (size_t y = startY; y < endY; y++) {
193*32afb93cSXin Li         uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
194*32afb93cSXin Li         uint32_t y2 = std::max((int32_t)y - 1, 0);
195*32afb93cSXin Li 
196*32afb93cSXin Li         size_t offset = (y * sizeX + startX) * vectorSize;
197*32afb93cSXin Li         InputOutputType* px = (InputOutputType*)(pout + offset);
198*32afb93cSXin Li         InputOutputType* py0 = (InputOutputType*)(pin + stride * y2);
199*32afb93cSXin Li         InputOutputType* py1 = (InputOutputType*)(pin + stride * y);
200*32afb93cSXin Li         InputOutputType* py2 = (InputOutputType*)(pin + stride * y1);
201*32afb93cSXin Li         for (uint32_t x = startX; x < endX; x++, px++) {
202*32afb93cSXin Li             convolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, fp, sizeX);
203*32afb93cSXin Li         }
204*32afb93cSXin Li     }
205*32afb93cSXin Li }
206*32afb93cSXin Li 
convolveU4(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY)207*32afb93cSXin Li void Convolve3x3Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
208*32afb93cSXin Li                                  size_t sizeY, size_t startX, size_t startY, size_t endX,
209*32afb93cSXin Li                                  size_t endY) {
210*32afb93cSXin Li     const size_t stride = paddedSize(vectorSize) * sizeX;
211*32afb93cSXin Li     for (size_t y = startY; y < endY; y++) {
212*32afb93cSXin Li         uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
213*32afb93cSXin Li         uint32_t y2 = std::max((int32_t)y - 1, 0);
214*32afb93cSXin Li 
215*32afb93cSXin Li         size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
216*32afb93cSXin Li         uchar* px = pout + offset;
217*32afb93cSXin Li         const uchar* py0 = pin + stride * y2;
218*32afb93cSXin Li         const uchar* py1 = pin + stride * y;
219*32afb93cSXin Li         const uchar* py2 = pin + stride * y1;
220*32afb93cSXin Li         kernelU4(px, startX, endX, py0, py1, py2);
221*32afb93cSXin Li     }
222*32afb93cSXin Li }
223*32afb93cSXin Li 
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)224*32afb93cSXin Li void Convolve3x3Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
225*32afb93cSXin Li                                   size_t endY) {
226*32afb93cSXin Li     // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
227*32afb93cSXin Li     // endX, endY);
228*32afb93cSXin Li     switch (mVectorSize) {
229*32afb93cSXin Li         case 1:
230*32afb93cSXin Li             convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
231*32afb93cSXin Li                                     startX, startY, endX, endY, mFp);
232*32afb93cSXin Li             break;
233*32afb93cSXin Li         case 2:
234*32afb93cSXin Li             convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
235*32afb93cSXin Li                                       startX, startY, endX, endY, mFp);
236*32afb93cSXin Li             break;
237*32afb93cSXin Li         case 3:
238*32afb93cSXin Li         case 4:
239*32afb93cSXin Li             convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
240*32afb93cSXin Li                        endX, endY);
241*32afb93cSXin Li             break;
242*32afb93cSXin Li     }
243*32afb93cSXin Li }
244*32afb93cSXin Li 
convolve3x3(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)245*32afb93cSXin Li void RenderScriptToolkit::convolve3x3(const void* in, void* out, size_t vectorSize, size_t sizeX,
246*32afb93cSXin Li                                       size_t sizeY, const float* coefficients,
247*32afb93cSXin Li                                       const Restriction* restriction) {
248*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
249*32afb93cSXin Li     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
250*32afb93cSXin Li         return;
251*32afb93cSXin Li     }
252*32afb93cSXin Li     if (vectorSize < 1 || vectorSize > 4) {
253*32afb93cSXin Li         ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
254*32afb93cSXin Li         return;
255*32afb93cSXin Li     }
256*32afb93cSXin Li #endif
257*32afb93cSXin Li 
258*32afb93cSXin Li     Convolve3x3Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
259*32afb93cSXin Li     processor->doTask(&task);
260*32afb93cSXin Li }
261*32afb93cSXin Li 
262*32afb93cSXin Li }  // namespace renderscript
263