main/cpp/Convolve3x3.cpp

*32afb93cSXin Li/*
*32afb93cSXin Li * Copyright (C) 2012 The Android Open Source Project
*32afb93cSXin Li *
*32afb93cSXin Li * Licensed under the Apache License, Version 2.0 (the "License");
*32afb93cSXin Li * you may not use this file except in compliance with the License.
*32afb93cSXin Li * You may obtain a copy of the License at
*32afb93cSXin Li *
*32afb93cSXin Li *      http://www.apache.org/licenses/LICENSE-2.0
*32afb93cSXin Li *
*32afb93cSXin Li * Unless required by applicable law or agreed to in writing, software
*32afb93cSXin Li * distributed under the License is distributed on an "AS IS" BASIS,
*32afb93cSXin Li * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*32afb93cSXin Li * See the License for the specific language governing permissions and
*32afb93cSXin Li * limitations under the License.
*32afb93cSXin Li */
*32afb93cSXin Li
*32afb93cSXin Li#include <cstdint>
*32afb93cSXin Li
*32afb93cSXin Li#include "RenderScriptToolkit.h"
*32afb93cSXin Li#include "TaskProcessor.h"
*32afb93cSXin Li#include "Utils.h"
*32afb93cSXin Li
*32afb93cSXin Li#define LOG_TAG "renderscript.toolkit.Convolve3x3"
*32afb93cSXin Li
*32afb93cSXin Linamespace renderscript {
*32afb93cSXin Li
/**
 * SIMD implementation of the 3x3 convolution. It has C linkage and is defined outside of this
 * file. Within this file it is only called from Convolve3x3Task::kernelU4, inside the section
 * guarded by ARCH_ARM_USE_INTRINSICS / ARCH_X86_HAVE_SSSE3.
 *
 * @param dst Where the convolved cells are written.
 * @param y0 Position in the previous row; the caller passes the cell one to the left of the
 *           first cell to convolve.
 * @param y1 Same position in the current row.
 * @param y2 Same position in the next row.
 * @param coef The coefficients as 16-bit integers (the caller passes the float coefficients
 *             scaled by 256 and rounded).
 * @param count Amount of work to do. The caller advances its output by 2 * count cells after
 *              the call, so this presumably counts pairs of cells -- confirm against the
 *              kernel implementation.
 */
extern "C" void rsdIntrinsicConvolve3x3_K(void* dst, const void* y0, const void* y1, const void* y2,
                                          const int16_t* coef, uint32_t count);
*32afb93cSXin Li
*32afb93cSXin Liclass Convolve3x3Task : public Task {
*32afb93cSXin Li    const void* mIn;
*32afb93cSXin Li    void* mOut;
*32afb93cSXin Li    // Even though we have exactly 9 coefficients, store them in an array of size 16 so that
*32afb93cSXin Li    // the SIMD instructions can load them in chunks multiple of 8.
*32afb93cSXin Li    float mFp[16];
*32afb93cSXin Li    int16_t mIp[16];
*32afb93cSXin Li
*32afb93cSXin Li    void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
*32afb93cSXin Li                  const uchar* py2);
*32afb93cSXin Li    void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
*32afb93cSXin Li                    size_t startX, size_t startY, size_t endX, size_t endY);
*32afb93cSXin Li
*32afb93cSXin Li    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
*32afb93cSXin Li    void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
*32afb93cSXin Li                     size_t endY) override;
*32afb93cSXin Li
*32afb93cSXin Li   public:
*32afb93cSXin Li    Convolve3x3Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
*32afb93cSXin Li                    const float* coefficients, const Restriction* restriction)
*32afb93cSXin Li        : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
*32afb93cSXin Li        for (int ct = 0; ct < 9; ct++) {
*32afb93cSXin Li            mFp[ct] = coefficients[ct];
*32afb93cSXin Li            if (mFp[ct] >= 0) {
*32afb93cSXin Li                mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
*32afb93cSXin Li            } else {
*32afb93cSXin Li                mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
*32afb93cSXin Li            }
*32afb93cSXin Li        }
*32afb93cSXin Li    }
*32afb93cSXin Li};
*32afb93cSXin Li
*32afb93cSXin Li/**
*32afb93cSXin Li * Computes one convolution and stores the result in the output. This is used for uchar, uchar2,
*32afb93cSXin Li * uchar3, and uchar4 vectors.
*32afb93cSXin Li *
*32afb93cSXin Li * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. uchar4.
*32afb93cSXin Li * @tparam ComputationType Type we use for the intermediate computations.
*32afb93cSXin Li * @param x The index in the row of the value we'll convolve.
*32afb93cSXin Li * @param out The location in the output array where we store the value.
*32afb93cSXin Li * @param py0 The start of the top row.
*32afb93cSXin Li * @param py1 The start of the middle row.
*32afb93cSXin Li * @param py2 The start of the bottom row.
*32afb93cSXin Li * @param coeff Pointer to the float coefficients, in row major format.
*32afb93cSXin Li * @param sizeX The number of cells of one row.
*32afb93cSXin Li */
*32afb93cSXin Litemplate <typename InputOutputType, typename ComputationType>
*32afb93cSXin Listatic void convolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
*32afb93cSXin Li                         const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
*32afb93cSXin Li                         int32_t sizeX) {
*32afb93cSXin Li    uint32_t x1 = std::max((int32_t)x - 1, 0);
*32afb93cSXin Li    uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1);
*32afb93cSXin Li
*32afb93cSXin Li    ComputationType px = convert<ComputationType>(py0[x1]) * coeff[0] +
*32afb93cSXin Li                         convert<ComputationType>(py0[x]) * coeff[1] +
*32afb93cSXin Li                         convert<ComputationType>(py0[x2]) * coeff[2] +
*32afb93cSXin Li                         convert<ComputationType>(py1[x1]) * coeff[3] +
*32afb93cSXin Li                         convert<ComputationType>(py1[x]) * coeff[4] +
*32afb93cSXin Li                         convert<ComputationType>(py1[x2]) * coeff[5] +
*32afb93cSXin Li                         convert<ComputationType>(py2[x1]) * coeff[6] +
*32afb93cSXin Li                         convert<ComputationType>(py2[x]) * coeff[7] +
*32afb93cSXin Li                         convert<ComputationType>(py2[x2]) * coeff[8];
*32afb93cSXin Li
*32afb93cSXin Li    px = clamp(px + 0.5f, 0.f, 255.f);
*32afb93cSXin Li    *out = convert<InputOutputType>(px);
*32afb93cSXin Li}
*32afb93cSXin Li
*32afb93cSXin Li#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Convolves a single cell of float, float2, float3, or float4 type and writes the result.
 *
 * The horizontal neighbors are clamped to the row, so the first and the last cell of a row
 * reuse themselves in place of the missing neighbor. The result is stored as is, with no
 * rounding or clamping.
 *
 * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. float4.
 * @param x The index in the row of the value we'll convolve.
 * @param out The location in the output array where we store the value.
 * @param py0 The start of the top row.
 * @param py1 The start of the middle row.
 * @param py2 The start of the bottom row.
 * @param coeff Pointer to the float coefficients, in row major format.
 * @param sizeX The number of cells of one row.
 */
template <typename InputOutputType>
static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
                         const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
                         int32_t sizeX) {
    // Indices of the left and right neighbors, clamped to the row.
    const int32_t column = static_cast<int32_t>(x);
    const uint32_t left = std::max(column - 1, 0);
    const uint32_t right = std::min(column + 1, sizeX - 1);

    // The nine input cells.
    const InputOutputType topLeft = py0[left];
    const InputOutputType topCenter = py0[x];
    const InputOutputType topRight = py0[right];
    const InputOutputType middleLeft = py1[left];
    const InputOutputType middleCenter = py1[x];
    const InputOutputType middleRight = py1[right];
    const InputOutputType bottomLeft = py2[left];
    const InputOutputType bottomCenter = py2[x];
    const InputOutputType bottomRight = py2[right];

    // Keep the sum as one left-to-right chain so the floating point result does not change.
    *out = (topLeft * coeff[0]) + (topCenter * coeff[1]) + (topRight * coeff[2]) +
           (middleLeft * coeff[3]) + (middleCenter * coeff[4]) + (middleRight * coeff[5]) +
           (bottomLeft * coeff[6]) + (bottomCenter * coeff[7]) + (bottomRight * coeff[8]);
}
*32afb93cSXin Li#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
*32afb93cSXin Li
*32afb93cSXin Li/**
*32afb93cSXin Li * This function convolves one line.
*32afb93cSXin Li *
*32afb93cSXin Li * @param pout Where to place the next output.
*32afb93cSXin Li * @param xstart Index in the X direction of where to start.
*32afb93cSXin Li * @param xend End index
*32afb93cSXin Li * @param ppy0 Points to the start of the previous line.
*32afb93cSXin Li * @param ppy1 Points to the start of the current line.
*32afb93cSXin Li * @param ppy2 Points to the start of the next line.
*32afb93cSXin Li */
*32afb93cSXin Livoid Convolve3x3Task::kernelU4(uchar* pout, uint32_t xstart, uint32_t xend, const uchar* ppy0,
*32afb93cSXin Li                               const uchar* ppy1, const uchar* ppy2) {
*32afb93cSXin Li    uchar4* out = (uchar4*)pout;
*32afb93cSXin Li    const uchar4* py0 = (const uchar4*)ppy0;
*32afb93cSXin Li    const uchar4* py1 = (const uchar4*)ppy1;
*32afb93cSXin Li    const uchar4* py2 = (const uchar4*)ppy2;
*32afb93cSXin Li
*32afb93cSXin Li    uint32_t x1 = xstart;
*32afb93cSXin Li    uint32_t x2 = xend;
*32afb93cSXin Li    if (x1 == 0) {
*32afb93cSXin Li        convolveOneU<uchar4, float4>(0, out, py0, py1, py2, mFp, mSizeX);
*32afb93cSXin Li        x1++;
*32afb93cSXin Li        out++;
*32afb93cSXin Li    }
*32afb93cSXin Li
*32afb93cSXin Li    if (x2 > x1) {
*32afb93cSXin Li#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
*32afb93cSXin Li        if (mUsesSimd) {
*32afb93cSXin Li            int32_t len = (x2 - x1 - 1) >> 1;
*32afb93cSXin Li            if (len > 0) {
*32afb93cSXin Li                rsdIntrinsicConvolve3x3_K(out, &py0[x1 - 1], &py1[x1 - 1], &py2[x1 - 1], mIp, len);
*32afb93cSXin Li                x1 += len << 1;
*32afb93cSXin Li                out += len << 1;
*32afb93cSXin Li            }
*32afb93cSXin Li        }
*32afb93cSXin Li#endif
*32afb93cSXin Li
*32afb93cSXin Li        while (x1 != x2) {
*32afb93cSXin Li            convolveOneU<uchar4, float4>(x1, out, py0, py1, py2, mFp, mSizeX);
*32afb93cSXin Li            out++;
*32afb93cSXin Li            x1++;
*32afb93cSXin Li        }
*32afb93cSXin Li    }
*32afb93cSXin Li}
*32afb93cSXin Li
*32afb93cSXin Li#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
*32afb93cSXin Litemplate <typename T>
*32afb93cSXin Livoid RsdCpuScriptIntrinsicConvolve3x3_kernelF(void* in, T* out, uint32_t xstart, uint32_t xend,
*32afb93cSXin Li                                              uint32_t currentY, size_t sizeX, size_t sizeY,
*32afb93cSXin Li                                              size_t vectorSize, float* fp) {
*32afb93cSXin Li    const uchar* pin = (const uchar*)in;
*32afb93cSXin Li    const size_t stride = sizeX * vectorSize * 4;  // float takes 4 bytes
*32afb93cSXin Li
*32afb93cSXin Li    uint32_t y1 = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1));
*32afb93cSXin Li    uint32_t y2 = std::max((int32_t)currentY - 1, 0);
*32afb93cSXin Li    const T* py0 = (const T*)(pin + stride * y2);
*32afb93cSXin Li    const T* py1 = (const T*)(pin + stride * currentY);
*32afb93cSXin Li    const T* py2 = (const T*)(pin + stride * y1);
*32afb93cSXin Li
*32afb93cSXin Li    for (uint32_t x = xstart; x < xend; x++, out++) {
*32afb93cSXin Li        ConvolveOneF<T>(x, out, py0, py1, py2, fp, sizeX);
*32afb93cSXin Li    }
*32afb93cSXin Li}
*32afb93cSXin Li#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
*32afb93cSXin Li
*32afb93cSXin Litemplate <typename InputOutputType, typename ComputationType>
*32afb93cSXin Listatic void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
*32afb93cSXin Li                      size_t startX, size_t startY, size_t endX, size_t endY, float* fp) {
*32afb93cSXin Li    const size_t stride = vectorSize * sizeX;
*32afb93cSXin Li    for (size_t y = startY; y < endY; y++) {
*32afb93cSXin Li        uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
*32afb93cSXin Li        uint32_t y2 = std::max((int32_t)y - 1, 0);
*32afb93cSXin Li
*32afb93cSXin Li        size_t offset = (y * sizeX + startX) * vectorSize;
*32afb93cSXin Li        InputOutputType* px = (InputOutputType*)(pout + offset);
*32afb93cSXin Li        InputOutputType* py0 = (InputOutputType*)(pin + stride * y2);
*32afb93cSXin Li        InputOutputType* py1 = (InputOutputType*)(pin + stride * y);
*32afb93cSXin Li        InputOutputType* py2 = (InputOutputType*)(pin + stride * y1);
*32afb93cSXin Li        for (uint32_t x = startX; x < endX; x++, px++) {
*32afb93cSXin Li            convolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, fp, sizeX);
*32afb93cSXin Li        }
*32afb93cSXin Li    }
*32afb93cSXin Li}
*32afb93cSXin Li
*32afb93cSXin Livoid Convolve3x3Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
*32afb93cSXin Li                                 size_t sizeY, size_t startX, size_t startY, size_t endX,
*32afb93cSXin Li                                 size_t endY) {
*32afb93cSXin Li    const size_t stride = paddedSize(vectorSize) * sizeX;
*32afb93cSXin Li    for (size_t y = startY; y < endY; y++) {
*32afb93cSXin Li        uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
*32afb93cSXin Li        uint32_t y2 = std::max((int32_t)y - 1, 0);
*32afb93cSXin Li
*32afb93cSXin Li        size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
*32afb93cSXin Li        uchar* px = pout + offset;
*32afb93cSXin Li        const uchar* py0 = pin + stride * y2;
*32afb93cSXin Li        const uchar* py1 = pin + stride * y;
*32afb93cSXin Li        const uchar* py2 = pin + stride * y1;
*32afb93cSXin Li        kernelU4(px, startX, endX, py0, py1, py2);
*32afb93cSXin Li    }
*32afb93cSXin Li}
*32afb93cSXin Li
*32afb93cSXin Livoid Convolve3x3Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
*32afb93cSXin Li                                  size_t endY) {
*32afb93cSXin Li    // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
*32afb93cSXin Li    // endX, endY);
*32afb93cSXin Li    switch (mVectorSize) {
*32afb93cSXin Li        case 1:
*32afb93cSXin Li            convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
*32afb93cSXin Li                                    startX, startY, endX, endY, mFp);
*32afb93cSXin Li            break;
*32afb93cSXin Li        case 2:
*32afb93cSXin Li            convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
*32afb93cSXin Li                                      startX, startY, endX, endY, mFp);
*32afb93cSXin Li            break;
*32afb93cSXin Li        case 3:
*32afb93cSXin Li        case 4:
*32afb93cSXin Li            convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
*32afb93cSXin Li                       endX, endY);
*32afb93cSXin Li            break;
*32afb93cSXin Li    }
*32afb93cSXin Li}
*32afb93cSXin Li
*32afb93cSXin Livoid RenderScriptToolkit::convolve3x3(const void* in, void* out, size_t vectorSize, size_t sizeX,
*32afb93cSXin Li                                      size_t sizeY, const float* coefficients,
*32afb93cSXin Li                                      const Restriction* restriction) {
*32afb93cSXin Li#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
*32afb93cSXin Li    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
*32afb93cSXin Li        return;
*32afb93cSXin Li    }
*32afb93cSXin Li    if (vectorSize < 1 || vectorSize > 4) {
*32afb93cSXin Li        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
*32afb93cSXin Li        return;
*32afb93cSXin Li    }
*32afb93cSXin Li#endif
*32afb93cSXin Li
*32afb93cSXin Li    Convolve3x3Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
*32afb93cSXin Li    processor->doTask(&task);
*32afb93cSXin Li}
*32afb93cSXin Li
*32afb93cSXin Li}  // namespace renderscript