1*32afb93cSXin Li /*
2*32afb93cSXin Li  * Copyright (C) 2012 The Android Open Source Project
3*32afb93cSXin Li  *
4*32afb93cSXin Li  * Licensed under the Apache License, Version 2.0 (the "License");
5*32afb93cSXin Li  * you may not use this file except in compliance with the License.
6*32afb93cSXin Li  * You may obtain a copy of the License at
7*32afb93cSXin Li  *
8*32afb93cSXin Li  *      http://www.apache.org/licenses/LICENSE-2.0
9*32afb93cSXin Li  *
10*32afb93cSXin Li  * Unless required by applicable law or agreed to in writing, software
11*32afb93cSXin Li  * distributed under the License is distributed on an "AS IS" BASIS,
12*32afb93cSXin Li  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*32afb93cSXin Li  * See the License for the specific language governing permissions and
14*32afb93cSXin Li  * limitations under the License.
15*32afb93cSXin Li  */
16*32afb93cSXin Li 
17*32afb93cSXin Li #include <cstdint>
18*32afb93cSXin Li 
19*32afb93cSXin Li #include "RenderScriptToolkit.h"
20*32afb93cSXin Li #include "TaskProcessor.h"
21*32afb93cSXin Li #include "Utils.h"
22*32afb93cSXin Li 
23*32afb93cSXin Li namespace renderscript {
24*32afb93cSXin Li 
25*32afb93cSXin Li #define LOG_TAG "renderscript.toolkit.Lut3d"
26*32afb93cSXin Li 
27*32afb93cSXin Li /**
28*32afb93cSXin Li  * Converts a RGBA buffer using a 3D cube.
29*32afb93cSXin Li  */
30*32afb93cSXin Li class Lut3dTask : public Task {
31*32afb93cSXin Li     // The input array we're transforming.
32*32afb93cSXin Li     const uchar4* mIn;
33*32afb93cSXin Li     // Where we'll store the transformed result.
34*32afb93cSXin Li     uchar4* mOut;
35*32afb93cSXin Li     // The size of each of the three cube dimensions. We don't make use of the last value.
36*32afb93cSXin Li     int4 mCubeDimension;
37*32afb93cSXin Li     // The translation cube, in row major format.
38*32afb93cSXin Li     const uchar* mCubeTable;
39*32afb93cSXin Li 
40*32afb93cSXin Li     /**
41*32afb93cSXin Li      * Converts a subset of a line of the 2D buffer.
42*32afb93cSXin Li      *
43*32afb93cSXin Li      * @param in The start of the data to transform.
44*32afb93cSXin Li      * @param out Where to store the result.
45*32afb93cSXin Li      * @param length The number of 4-byte vectors to transform.
46*32afb93cSXin Li      */
47*32afb93cSXin Li     void kernel(const uchar4* in, uchar4* out, uint32_t length);
48*32afb93cSXin Li 
49*32afb93cSXin Li     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
50*32afb93cSXin Li     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
51*32afb93cSXin Li                      size_t endY) override;
52*32afb93cSXin Li 
53*32afb93cSXin Li    public:
Lut3dTask(const uint8_t * input,uint8_t * output,size_t sizeX,size_t sizeY,const uint8_t * cube,int cubeSizeX,int cubeSizeY,int cubeSizeZ,const Restriction * restriction)54*32afb93cSXin Li     Lut3dTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
55*32afb93cSXin Li               const uint8_t* cube, int cubeSizeX, int cubeSizeY, int cubeSizeZ,
56*32afb93cSXin Li               const Restriction* restriction)
57*32afb93cSXin Li         : Task{sizeX, sizeY, 4, true, restriction},
58*32afb93cSXin Li           mIn{reinterpret_cast<const uchar4*>(input)},
59*32afb93cSXin Li           mOut{reinterpret_cast<uchar4*>(output)},
60*32afb93cSXin Li           mCubeDimension{cubeSizeX, cubeSizeY, cubeSizeZ, 0},
61*32afb93cSXin Li           mCubeTable{cube} {}
62*32afb93cSXin Li };
63*32afb93cSXin Li 
/**
 * Externally defined 3D LUT routine with C linkage. Lut3dTask::kernel() calls it when
 * ARCH_ARM_USE_INTRINSICS is defined and SIMD use is enabled.
 *
 * As called by the kernel: dst/in are the output and input pixel runs, count is the number
 * of pixels, lut is the start of the cube, pitchy/pitchz are the byte strides between
 * consecutive y rows and z slices of the cube, and dimx/dimy/dimz are the cube sizes minus one.
 */
extern "C" void rsdIntrinsic3DLUT_K(void* dst, const void* in, size_t count, const void* lut,
                                    int32_t pitchy, int32_t pitchz,
                                    int dimx, int dimy, int dimz);
66*32afb93cSXin Li 
kernel(const uchar4 * in,uchar4 * out,uint32_t length)67*32afb93cSXin Li void Lut3dTask::kernel(const uchar4* in, uchar4* out, uint32_t length) {
68*32afb93cSXin Li     uint32_t x1 = 0;
69*32afb93cSXin Li     uint32_t x2 = length;
70*32afb93cSXin Li 
71*32afb93cSXin Li     const uchar* bp = mCubeTable;
72*32afb93cSXin Li 
73*32afb93cSXin Li     int4 dims = mCubeDimension - 1;
74*32afb93cSXin Li 
75*32afb93cSXin Li     const float4 m = (float4)(1.f / 255.f) * convert<float4>(dims);
76*32afb93cSXin Li     const int4 coordMul = convert<int4>(m * (float4)0x8000);
77*32afb93cSXin Li     const size_t stride_y = mCubeDimension.x * 4;
78*32afb93cSXin Li     const size_t stride_z = stride_y * mCubeDimension.y;
79*32afb93cSXin Li 
80*32afb93cSXin Li     // ALOGE("strides %zu %zu", stride_y, stride_z);
81*32afb93cSXin Li 
82*32afb93cSXin Li #if defined(ARCH_ARM_USE_INTRINSICS)
83*32afb93cSXin Li     if (mUsesSimd) {
84*32afb93cSXin Li         int32_t len = x2 - x1;
85*32afb93cSXin Li         if (len > 0) {
86*32afb93cSXin Li             rsdIntrinsic3DLUT_K(out, in, len, bp, stride_y, stride_z, dims.x, dims.y, dims.z);
87*32afb93cSXin Li             x1 += len;
88*32afb93cSXin Li             out += len;
89*32afb93cSXin Li             in += len;
90*32afb93cSXin Li         }
91*32afb93cSXin Li     }
92*32afb93cSXin Li #endif
93*32afb93cSXin Li 
94*32afb93cSXin Li     while (x1 < x2) {
95*32afb93cSXin Li         int4 baseCoord = convert<int4>(*in) * coordMul;
96*32afb93cSXin Li         int4 coord1 = baseCoord >> (int4)15;
97*32afb93cSXin Li         // int4 coord2 = min(coord1 + 1, gDims - 1);
98*32afb93cSXin Li 
99*32afb93cSXin Li         int4 weight2 = baseCoord & 0x7fff;
100*32afb93cSXin Li         int4 weight1 = (int4)0x8000 - weight2;
101*32afb93cSXin Li 
102*32afb93cSXin Li         // ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
103*32afb93cSXin Li         const uchar* bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z);
104*32afb93cSXin Li         const uchar4* pt_00 = (const uchar4*)&bp2[0];
105*32afb93cSXin Li         const uchar4* pt_10 = (const uchar4*)&bp2[stride_y];
106*32afb93cSXin Li         const uchar4* pt_01 = (const uchar4*)&bp2[stride_z];
107*32afb93cSXin Li         const uchar4* pt_11 = (const uchar4*)&bp2[stride_y + stride_z];
108*32afb93cSXin Li 
109*32afb93cSXin Li         uint4 v000 = convert<uint4>(pt_00[0]);
110*32afb93cSXin Li         uint4 v100 = convert<uint4>(pt_00[1]);
111*32afb93cSXin Li         uint4 v010 = convert<uint4>(pt_10[0]);
112*32afb93cSXin Li         uint4 v110 = convert<uint4>(pt_10[1]);
113*32afb93cSXin Li         uint4 v001 = convert<uint4>(pt_01[0]);
114*32afb93cSXin Li         uint4 v101 = convert<uint4>(pt_01[1]);
115*32afb93cSXin Li         uint4 v011 = convert<uint4>(pt_11[0]);
116*32afb93cSXin Li         uint4 v111 = convert<uint4>(pt_11[1]);
117*32afb93cSXin Li 
118*32afb93cSXin Li         uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7;
119*32afb93cSXin Li         uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7;
120*32afb93cSXin Li         uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7;
121*32afb93cSXin Li         uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7;
122*32afb93cSXin Li 
123*32afb93cSXin Li         uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15;
124*32afb93cSXin Li         uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15;
125*32afb93cSXin Li 
126*32afb93cSXin Li         uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15;
127*32afb93cSXin Li         uint4 v2 = (v + 0x7f) >> (int4)8;
128*32afb93cSXin Li 
129*32afb93cSXin Li         uchar4 ret = convert<uchar4>(v2);
130*32afb93cSXin Li         ret.w = in->w;
131*32afb93cSXin Li 
132*32afb93cSXin Li #if 0
133*32afb93cSXin Li         if (!x1) {
134*32afb93cSXin Li             ALOGE("in          %08x %08x %08x %08x", in->r, in->g, in->b, in->a);
135*32afb93cSXin Li             ALOGE("baseCoord   %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z,
136*32afb93cSXin Li                   baseCoord.w);
137*32afb93cSXin Li             ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
138*32afb93cSXin Li             ALOGE("weight1     %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w);
139*32afb93cSXin Li             ALOGE("weight2     %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w);
140*32afb93cSXin Li 
141*32afb93cSXin Li             ALOGE("v000        %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w);
142*32afb93cSXin Li             ALOGE("v100        %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w);
143*32afb93cSXin Li             ALOGE("yz00        %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w);
144*32afb93cSXin Li             ALOGE("z0          %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w);
145*32afb93cSXin Li 
146*32afb93cSXin Li             ALOGE("v           %08x %08x %08x %08x", v.x, v.y, v.z, v.w);
147*32afb93cSXin Li             ALOGE("v2          %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w);
148*32afb93cSXin Li         }
149*32afb93cSXin Li #endif
150*32afb93cSXin Li         *out = ret;
151*32afb93cSXin Li 
152*32afb93cSXin Li         in++;
153*32afb93cSXin Li         out++;
154*32afb93cSXin Li         x1++;
155*32afb93cSXin Li     }
156*32afb93cSXin Li }
157*32afb93cSXin Li 
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)158*32afb93cSXin Li void Lut3dTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
159*32afb93cSXin Li                             size_t endY) {
160*32afb93cSXin Li     for (size_t y = startY; y < endY; y++) {
161*32afb93cSXin Li         size_t offset = mSizeX * y + startX;
162*32afb93cSXin Li         kernel(mIn + offset, mOut + offset, endX - startX);
163*32afb93cSXin Li     }
164*32afb93cSXin Li }
165*32afb93cSXin Li 
lut3d(const uint8_t * input,uint8_t * output,size_t sizeX,size_t sizeY,const uint8_t * cube,size_t cubeSizeX,size_t cubeSizeY,size_t cubeSizeZ,const Restriction * restriction)166*32afb93cSXin Li void RenderScriptToolkit::lut3d(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
167*32afb93cSXin Li                                 const uint8_t* cube, size_t cubeSizeX, size_t cubeSizeY,
168*32afb93cSXin Li                                 size_t cubeSizeZ, const Restriction* restriction) {
169*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
170*32afb93cSXin Li     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
171*32afb93cSXin Li         return;
172*32afb93cSXin Li     }
173*32afb93cSXin Li #endif
174*32afb93cSXin Li 
175*32afb93cSXin Li     Lut3dTask task(input, output, sizeX, sizeY, cube, cubeSizeX, cubeSizeY, cubeSizeZ, restriction);
176*32afb93cSXin Li     processor->doTask(&task);
177*32afb93cSXin Li }
178*32afb93cSXin Li 
179*32afb93cSXin Li }  // namespace renderscript
180