/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Implements a quantized version of the resize bilinear op.

#define EIGEN_USE_THREADS

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define USE_NEON
#define QUANTIZED_RESIZE_BILINEAR_USE_NEON
#include <arm_neon.h>
#endif

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/quantization_utils.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/image_resizer_state.h"

namespace tensorflow {

static constexpr bool USE_REFERENCE = false;

namespace {
// Compute the interpolation indices only once.
template <typename T_SCALE>
struct InterpolationCache {
  std::vector<int64_t> lower;  // Lower source index used in the interpolation
  std::vector<int64_t> upper;  // Upper source index used in the interpolation
  // 1-D linear interpolation scale (see:
  // https://en.wikipedia.org/wiki/Bilinear_interpolation)
  std::vector<float> lerp;
  std::vector<T_SCALE> ilerp;
};

template <typename T_SCALE, typename Scaler>
inline void ComputeInterpolationWeights(
    const int64_t out_size, const int64_t in_size, const float scale,
    const int resolution, InterpolationCache<T_SCALE>* interpolation) {
  const Scaler scaler;
  interpolation->lower.resize(out_size + 1);
  interpolation->upper.resize(out_size + 1);
  interpolation->lerp.resize(out_size + 1);
  interpolation->ilerp.resize(out_size + 1);

  interpolation->lower[out_size] = 0;
  interpolation->upper[out_size] = 0;
  for (int64_t i = out_size - 1; i >= 0; --i) {
    const float in = scaler(i, scale);
    const float in_f = std::floor(in);
    interpolation->lower[i] =
        std::max(static_cast<int64_t>(in_f), static_cast<int64_t>(0));
    interpolation->upper[i] =
        std::min(static_cast<int64_t>(std::ceil(in)), in_size - 1);
    interpolation->lower[i] =
        std::min(interpolation->lower[i], interpolation->upper[i]);
    interpolation->lerp[i] = in - in_f;
    interpolation->ilerp[i] =
        static_cast<T_SCALE>((in - in_f) * (1 << resolution));
  }
}
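
// A worked example, assuming the LegacyScaler (in = i * scale) with
// in_size = 2, out_size = 4, scale = in_size / out_size = 0.5, and
// resolution = 7 (so ilerp = lerp * 128):
//   i = 0: in = 0.0 -> lower = 0, upper = 0, lerp = 0.0, ilerp = 0
//   i = 1: in = 0.5 -> lower = 0, upper = 1, lerp = 0.5, ilerp = 64
//   i = 2: in = 1.0 -> lower = 1, upper = 1, lerp = 0.0, ilerp = 0
//   i = 3: in = 1.5 -> lower = 1, upper = 1 (clamped to in_size - 1),
//          lerp = 0.5, ilerp = 64
// The extra entry at [out_size] is zero-initialized, presumably so that
// vectorized readers can safely touch one element past the last output
// index.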

template <typename T_SCALE>
inline InterpolationCache<T_SCALE> BuildLerpCache(
    const int64_t out_size, const int64_t in_size, const float scale,
    const int index_step, const int resolution, const bool half_pixel_centers) {
  InterpolationCache<T_SCALE> cache;
  // Compute the cached interpolation weights on the x and y dimensions.
  if (half_pixel_centers) {
    ComputeInterpolationWeights<T_SCALE, HalfPixelScaler>(
        out_size, in_size, scale, resolution, &cache);
  } else {
    ComputeInterpolationWeights<T_SCALE, LegacyScaler>(out_size, in_size, scale,
                                                       resolution, &cache);
  }
  CHECK(index_step > 0);
  if (index_step > 1) {
    for (int i = 0; i < cache.lower.size(); ++i) {
      cache.lower[i] *= index_step;
      cache.upper[i] *= index_step;
    }
  }
  return cache;
}
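
// Note on index_step: callers building the x-dimension cache pass the
// channel count as index_step, so the cached lower/upper values become
// element offsets into an interleaved (NHWC) row rather than pixel
// indices. The per-channel loops below can then simply add c (or, in the
// NEON paths, a compile-time channel constant) to these offsets.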

/**
 * Computes the bilinear interpolation from the appropriate 4 float points
 * and the linear interpolation weights.
 */
template <typename T>
inline T ComputeLerpReference(const T in_top_left, const T in_top_right,
                              const T in_bottom_left, const T in_bottom_right,
                              const float x_lerp, const float y_lerp,
                              const float min, const float max) {
  const float top_left = QuantizedToFloat<T>(in_top_left, min, max);
  const float top_right = QuantizedToFloat<T>(in_top_right, min, max);
  const float bottom_left = QuantizedToFloat<T>(in_bottom_left, min, max);
  const float bottom_right = QuantizedToFloat<T>(in_bottom_right, min, max);
  const float top = top_left + (top_right - top_left) * x_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
  const float out = top + (bottom - top) * y_lerp;
  return FloatToQuantized<T>(out, min, max);
}

template <typename T, typename T_SCALE, typename T_CALC>
inline T_CALC MulOffset(T a, T b, T_SCALE c) {
  return (static_cast<T_CALC>(a) - static_cast<T_CALC>(b)) *
         static_cast<T_CALC>(c);
}

template <int RESOLUTION, typename T, typename T_SCALE, typename T_CALC>
inline T ComputeLerp(const T top_left, const T top_right, const T bottom_left,
                     const T bottom_right, const T_SCALE x_lerp,
                     const T_SCALE y_lerp) {
  constexpr T_CALC RESOLUTION_MULT = (1 << RESOLUTION);
  const T_CALC top = static_cast<T_CALC>(top_left) * RESOLUTION_MULT +
                     MulOffset<T, T_SCALE, T_CALC>(top_right, top_left, x_lerp);
  const T_CALC bottom =
      static_cast<T_CALC>(bottom_left) * RESOLUTION_MULT +
      MulOffset<T, T_SCALE, T_CALC>(bottom_right, bottom_left, x_lerp);
  const T_CALC out = top + (bottom - top) / RESOLUTION_MULT * y_lerp;
  return static_cast<T>(
      static_cast<int32>((out + RESOLUTION_MULT / 2) / RESOLUTION_MULT));
}
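
// A worked fixed-point example, assuming RESOLUTION = 7 (so values are
// scaled by RESOLUTION_MULT = 128), top_left = 10, top_right = 20,
// bottom_left = 30, bottom_right = 40, and x_lerp = y_lerp = 64 (i.e. 0.5):
//   top    = 10 * 128 + (20 - 10) * 64 = 1920          (15.0 scaled by 128)
//   bottom = 30 * 128 + (40 - 30) * 64 = 4480          (35.0 scaled by 128)
//   out    = 1920 + (4480 - 1920) / 128 * 64 = 3200    (25.0 scaled by 128)
//   result = (3200 + 64) / 128 = 25
// which matches ComputeLerpReference's float result of 25 for the same
// inputs. The + RESOLUTION_MULT / 2 term rounds to nearest instead of
// truncating toward zero.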

#ifdef QUANTIZED_RESIZE_BILINEAR_USE_NEON
inline uint8x8_t ToUint8x8(const quint8* v0, const quint8* v1, const quint8* v2,
                           const quint8* v3, const quint8* v4, const quint8* v5,
                           const quint8* v6, const quint8* v7) {
  static const uint8x8_t ZERO_8x8 = vmov_n_u8(0);
  uint8x8_t ret = vld1_lane_u8(reinterpret_cast<const uint8*>(v0), ZERO_8x8, 0);
  ret = vld1_lane_u8(reinterpret_cast<const uint8*>(v1), ret, 1);
  ret = vld1_lane_u8(reinterpret_cast<const uint8*>(v2), ret, 2);
  ret = vld1_lane_u8(reinterpret_cast<const uint8*>(v3), ret, 3);
  ret = vld1_lane_u8(reinterpret_cast<const uint8*>(v4), ret, 4);
  ret = vld1_lane_u8(reinterpret_cast<const uint8*>(v5), ret, 5);
  ret = vld1_lane_u8(reinterpret_cast<const uint8*>(v6), ret, 6);
  ret = vld1_lane_u8(reinterpret_cast<const uint8*>(v7), ret, 7);
  return ret;
}

inline int16x8_t ToInt16x8(const int16* v0, const int16* v1, const int16* v2,
                           const int16* v3, const int16* v4, const int16* v5,
                           const int16* v6, const int16* v7) {
  static const int16x8_t ZERO_16x8 = vmovq_n_s16(0);
  int16x8_t ret = vld1q_lane_s16(v0, ZERO_16x8, 0);
  ret = vld1q_lane_s16(v1, ret, 1);
  ret = vld1q_lane_s16(v2, ret, 2);
  ret = vld1q_lane_s16(v3, ret, 3);
  ret = vld1q_lane_s16(v4, ret, 4);
  ret = vld1q_lane_s16(v5, ret, 5);
  ret = vld1q_lane_s16(v6, ret, 6);
  ret = vld1q_lane_s16(v7, ret, 7);
  return ret;
}

inline int32x2_t ToInt32x2(const qint32* v0, const qint32* v1) {
  static const int32x2_t ZERO_32x2 = vmov_n_s32(0);
  const int32x2_t ret0 =
      vld1_lane_s32(reinterpret_cast<const int32*>(v0), ZERO_32x2, 0);
  const int32x2_t ret1 =
      vld1_lane_s32(reinterpret_cast<const int32*>(v1), ret0, 1);
  return ret1;
}

template <int RESOLUTION, bool X_LERP_SAME>
inline int32x2_t ComputeLerpx2(
    const qint32* top_left0, const qint32* top_right0,
    const qint32* bottom_left0, const qint32* bottom_right0,
    const qint32* top_left1, const qint32* top_right1,
    const qint32* bottom_left1, const qint32* bottom_right1,
    const int32* x_lerp, const int32x2_t y_lerpsx) {
  const int32x2_t x_lerpsx =
      X_LERP_SAME ? vld1_dup_s32(reinterpret_cast<const int32*>(x_lerp))
                  : vld1_s32(reinterpret_cast<const int32*>(x_lerp));

  const int32x2_t top_leftsx = ToInt32x2(top_left0, top_left1);
  const int32x2_t top_rightsx = ToInt32x2(top_right0, top_right1);
  const int32x2_t bottom_leftsx = ToInt32x2(bottom_left0, bottom_left1);
  const int32x2_t bottom_rightsx = ToInt32x2(bottom_right0, bottom_right1);

  const int32x2_t retval =
      ComputeLerp32x2<RESOLUTION>(top_leftsx, top_rightsx, bottom_leftsx,
                                  bottom_rightsx, x_lerpsx, y_lerpsx);
  return retval;
}

template <int RESOLUTION>
inline uint8x8_t ComputeLerpx8(
    const quint8* tl0, const quint8* tr0, const quint8* bl0, const quint8* br0,
    const int16* xlp0, const quint8* tl1, const quint8* tr1, const quint8* bl1,
    const quint8* br1, const int16* xlp1, const quint8* tl2, const quint8* tr2,
    const quint8* bl2, const quint8* br2, const int16* xlp2, const quint8* tl3,
    const quint8* tr3, const quint8* bl3, const quint8* br3, const int16* xlp3,
    const quint8* tl4, const quint8* tr4, const quint8* bl4, const quint8* br4,
    const int16* xlp4, const quint8* tl5, const quint8* tr5, const quint8* bl5,
    const quint8* br5, const int16* xlp5, const quint8* tl6, const quint8* tr6,
    const quint8* bl6, const quint8* br6, const int16* xlp6, const quint8* tl7,
    const quint8* tr7, const quint8* bl7, const quint8* br7, const int16* xlp7,
    const int16x8_t ys_lerpsx) {
  const uint8x8_t tl8x8 = ToUint8x8(tl0, tl1, tl2, tl3, tl4, tl5, tl6, tl7);
  const uint8x8_t tr8x8 = ToUint8x8(tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7);
  const uint8x8_t bl8x8 = ToUint8x8(bl0, bl1, bl2, bl3, bl4, bl5, bl6, bl7);
  const uint8x8_t br8x8 = ToUint8x8(br0, br1, br2, br3, br4, br5, br6, br7);
  const int16x8_t xs_lerpsx =
      ToInt16x8(xlp0, xlp1, xlp2, xlp3, xlp4, xlp5, xlp6, xlp7);
  return ComputeLerp8x8<RESOLUTION>(tl8x8, tr8x8, bl8x8, br8x8, xs_lerpsx,
                                    ys_lerpsx);
}
// Expand the address arithmetic at compile time to improve performance.
template <int RESOLUTION, int ID0, int CH0, int ID1, int CH1, int ID2, int CH2,
          int ID3, int CH3, int ID4, int CH4, int ID5, int CH5, int ID6,
          int CH6, int ID7, int CH7>
inline uint8x8_t ComputeLerpx8Tmpl(const quint8* const yl, const quint8* yu,
                                   const int64* xl, const int64* xu,
                                   const int16* xlp,
                                   const int16x8_t ys_lerpsx) {
  return ComputeLerpx8<RESOLUTION>(
      yl + xl[ID0] + CH0, yl + xu[ID0] + CH0, yu + xl[ID0] + CH0,
      yu + xu[ID0] + CH0, xlp + ID0, yl + xl[ID1] + CH1, yl + xu[ID1] + CH1,
      yu + xl[ID1] + CH1, yu + xu[ID1] + CH1, xlp + ID1, yl + xl[ID2] + CH2,
      yl + xu[ID2] + CH2, yu + xl[ID2] + CH2, yu + xu[ID2] + CH2, xlp + ID2,
      yl + xl[ID3] + CH3, yl + xu[ID3] + CH3, yu + xl[ID3] + CH3,
      yu + xu[ID3] + CH3, xlp + ID3, yl + xl[ID4] + CH4, yl + xu[ID4] + CH4,
      yu + xl[ID4] + CH4, yu + xu[ID4] + CH4, xlp + ID4, yl + xl[ID5] + CH5,
      yl + xu[ID5] + CH5, yu + xl[ID5] + CH5, yu + xu[ID5] + CH5, xlp + ID5,
      yl + xl[ID6] + CH6, yl + xu[ID6] + CH6, yu + xl[ID6] + CH6,
      yu + xu[ID6] + CH6, xlp + ID6, yl + xl[ID7] + CH7, yl + xu[ID7] + CH7,
      yu + xl[ID7] + CH7, yu + xu[ID7] + CH7, xlp + ID7, ys_lerpsx);
}
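
// Each (IDn, CHn) template pair selects the n-th output byte of the
// returned uint8x8_t: IDn indexes the cached x lower/upper/ilerp arrays
// (relative to the current x_start) and CHn is the channel offset within
// the interleaved pixel. This is how the callers below describe an
// arbitrary pixel/channel layout to a single 8-lane lerp.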

#endif

template <int RESOLUTION, typename T, typename T_SCALE, typename T_CALC>
inline void OutputLerpForChannels(const InterpolationCache<T_SCALE>& xs,
                                  const int64_t x, const T_SCALE ys_ilerp,
                                  const int channels, const float min,
                                  const float max, const T* ys_input_lower_ptr,
                                  const T* ys_input_upper_ptr,
                                  T* output_y_ptr) {
  const int64_t xs_lower = xs.lower[x];
  const int64_t xs_upper = xs.upper[x];
  const T_SCALE xs_ilerp = xs.ilerp[x];
  for (int c = 0; c < channels; ++c) {
    const T top_left = ys_input_lower_ptr[xs_lower + c];
    const T top_right = ys_input_lower_ptr[xs_upper + c];
    const T bottom_left = ys_input_upper_ptr[xs_lower + c];
    const T bottom_right = ys_input_upper_ptr[xs_upper + c];
    const T val = ComputeLerp<RESOLUTION, T, T_SCALE, T_CALC>(
        top_left, top_right, bottom_left, bottom_right, xs_ilerp, ys_ilerp);
    output_y_ptr[x * channels + c] = val;
  }
}

template <int RES>
inline void OutputLerp8x8x1(const InterpolationCache<int16>& xs,
                            const int64_t x_start, const int16_t ys_ilerp,
                            const float min, const float max,
                            const quint8* const ys_input_lower_ptr,
                            const quint8* const ys_input_upper_ptr,
                            quint8* output_y_ptr) {
#ifdef QUANTIZED_RESIZE_BILINEAR_USE_NEON
  const int16x8_t y_lerpsx = vmovq_n_s16(ys_ilerp);

  const uint8x8_t x0x7 =
      ComputeLerpx8Tmpl<RES, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0>(
          ys_input_lower_ptr, ys_input_upper_ptr, &xs.lower[x_start],
          &xs.upper[x_start], &xs.ilerp[x_start], y_lerpsx);

  vst1_u8(reinterpret_cast<uint8_t*>(output_y_ptr + x_start), x0x7);

#else
  for (int x = x_start; x < x_start + 8; ++x) {
    OutputLerpForChannels<RES, quint8, int16, int16>(
        xs, x, ys_ilerp, 1, min, max, ys_input_lower_ptr, ys_input_upper_ptr,
        output_y_ptr);
  }
#endif
}

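// Produces 8 output pixels x 3 channels = 24 contiguous output bytes per
// call. The three ComputeLerpx8Tmpl instantiations below enumerate the 24
// (pixel, channel) pairs in memory order, eight at a time, so each result
// can be written with a single 8-byte NEON store.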
template <int RES>
inline void OutputLerp8x8x3(const InterpolationCache<int16>& xs,
                            const int64_t x_start, const int16_t ys_ilerp,
                            const float min, const float max,
                            const quint8* const ys_input_lower_ptr,
                            const quint8* const ys_input_upper_ptr,
                            quint8* output_y_ptr) {
#ifdef QUANTIZED_RESIZE_BILINEAR_USE_NEON
  const int16x8_t y_lerpsx = vmovq_n_s16(ys_ilerp);

  const uint8x8_t x0c0x2c1 =
      ComputeLerpx8Tmpl<RES, 0, 0, 0, 1, 0, 2, 1, 0, 1, 1, 1, 2, 2, 0, 2, 1>(
          ys_input_lower_ptr, ys_input_upper_ptr, &xs.lower[x_start],
          &xs.upper[x_start], &xs.ilerp[x_start], y_lerpsx);

  vst1_u8(reinterpret_cast<uint8_t*>(output_y_ptr + x_start * 3), x0c0x2c1);

  const uint8x8_t x2c2x5c0 =
      ComputeLerpx8Tmpl<RES, 2, 2, 3, 0, 3, 1, 3, 2, 4, 0, 4, 1, 4, 2, 5, 0>(
          ys_input_lower_ptr, ys_input_upper_ptr, &xs.lower[x_start],
          &xs.upper[x_start], &xs.ilerp[x_start], y_lerpsx);

  vst1_u8(reinterpret_cast<uint8_t*>(output_y_ptr + x_start * 3 + 8), x2c2x5c0);

  const uint8x8_t x5c1x7c2 =
      ComputeLerpx8Tmpl<RES, 5, 1, 5, 2, 6, 0, 6, 1, 6, 2, 7, 0, 7, 1, 7, 2>(
          ys_input_lower_ptr, ys_input_upper_ptr, &xs.lower[x_start],
          &xs.upper[x_start], &xs.ilerp[x_start], y_lerpsx);

  vst1_u8(reinterpret_cast<uint8_t*>(output_y_ptr + x_start * 3 + 16),
          x5c1x7c2);

#else
  for (int x = x_start; x < x_start + 8; ++x) {
    OutputLerpForChannels<RES, quint8, int16, int16>(
        xs, x, ys_ilerp, 3, min, max, ys_input_lower_ptr, ys_input_upper_ptr,
        output_y_ptr);
  }
#endif
}

template <int RESOLUTION>
inline void OutputLerp32x4x1(const InterpolationCache<int32>& xs,
                             const int64_t x_start, const int32_t ys_ilerp,
                             const float min, const float max,
                             const qint32* const ys_input_lower_ptr,
                             const qint32* const ys_input_upper_ptr,
                             qint32* output_y_ptr) {
#ifdef QUANTIZED_RESIZE_BILINEAR_USE_NEON
  const int64 xs_lower0 = xs.lower[x_start];
  const int64 xs_upper0 = xs.upper[x_start];
  const int32* const xs_ilerp0 = &xs.ilerp[x_start];
  const int64 xs_lower1 = xs.lower[x_start + 1];
  const int64 xs_upper1 = xs.upper[x_start + 1];
  const int64 xs_lower2 = xs.lower[x_start + 2];
  const int64 xs_upper2 = xs.upper[x_start + 2];
  const int32* const xs_ilerp2 = &xs.ilerp[x_start + 2];
  const int64 xs_lower3 = xs.lower[x_start + 3];
  const int64 xs_upper3 = xs.upper[x_start + 3];

  const int32x2_t y_lerpsx = vmov_n_s32(ys_ilerp);

  const int32x2_t x0x1 = ComputeLerpx2<RESOLUTION, false>(
      ys_input_lower_ptr + xs_lower0, ys_input_lower_ptr + xs_upper0,
      ys_input_upper_ptr + xs_lower0, ys_input_upper_ptr + xs_upper0,
      ys_input_lower_ptr + xs_lower1, ys_input_lower_ptr + xs_upper1,
      ys_input_upper_ptr + xs_lower1, ys_input_upper_ptr + xs_upper1, xs_ilerp0,
      y_lerpsx);

  const int32x2_t x2x3 = ComputeLerpx2<RESOLUTION, false>(
      ys_input_lower_ptr + xs_lower2, ys_input_lower_ptr + xs_upper2,
      ys_input_upper_ptr + xs_lower2, ys_input_upper_ptr + xs_upper2,
      ys_input_lower_ptr + xs_lower3, ys_input_lower_ptr + xs_upper3,
      ys_input_upper_ptr + xs_lower3, ys_input_upper_ptr + xs_upper3, xs_ilerp2,
      y_lerpsx);

  const int32x4_t x0x1x2x3 = vcombine_s32(x0x1, x2x3);

  vst1q_s32(reinterpret_cast<int32*>(output_y_ptr + x_start), x0x1x2x3);

#else
  for (int x = x_start; x < x_start + 4; ++x) {
    OutputLerpForChannels<RESOLUTION, qint32, int32, int64_t>(
        xs, x, ys_ilerp, 1, min, max, ys_input_lower_ptr, ys_input_upper_ptr,
        output_y_ptr);
  }
#endif
}

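// The qint32 analogue of OutputLerp8x8x3: 4 output pixels x 3 channels =
// 12 contiguous int32 values per call, computed two lanes at a time and
// written with three 4-lane NEON stores. X_LERP_SAME is true whenever both
// lanes of a ComputeLerpx2 call belong to the same output pixel.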
template <int RESOLUTION>
inline void OutputLerp32x4x3(const InterpolationCache<int32>& xs,
                             const int64_t x_start, const int32_t ys_ilerp,
                             const float min, const float max,
                             const qint32* const ys_input_lower_ptr,
                             const qint32* const ys_input_upper_ptr,
                             qint32* output_y_ptr) {
#ifdef QUANTIZED_RESIZE_BILINEAR_USE_NEON
  const int64 xs_lower0 = xs.lower[x_start];
  const int64 xs_upper0 = xs.upper[x_start];
  const int32* const xs_ilerp0 = &xs.ilerp[x_start];
  const int64 xs_lower1 = xs.lower[x_start + 1];
  const int64 xs_upper1 = xs.upper[x_start + 1];
  const int32* const xs_ilerp1 = &xs.ilerp[x_start + 1];
  const int64 xs_lower2 = xs.lower[x_start + 2];
  const int64 xs_upper2 = xs.upper[x_start + 2];
  const int32* const xs_ilerp2 = &xs.ilerp[x_start + 2];
  const int64 xs_lower3 = xs.lower[x_start + 3];
  const int64 xs_upper3 = xs.upper[x_start + 3];
  const int32* const xs_ilerp3 = &xs.ilerp[x_start + 3];

  const int32x2_t y_lerpsx = vmov_n_s32(ys_ilerp);

  const int32x2_t x0c0x0c1 = ComputeLerpx2<RESOLUTION, true>(
      ys_input_lower_ptr + xs_lower0, ys_input_lower_ptr + xs_upper0,
      ys_input_upper_ptr + xs_lower0, ys_input_upper_ptr + xs_upper0,
      ys_input_lower_ptr + xs_lower0 + 1, ys_input_lower_ptr + xs_upper0 + 1,
      ys_input_upper_ptr + xs_lower0 + 1, ys_input_upper_ptr + xs_upper0 + 1,
      xs_ilerp0, y_lerpsx);

  const int32x2_t x0c2x1c0 = ComputeLerpx2<RESOLUTION, false>(
      ys_input_lower_ptr + xs_lower0 + 2, ys_input_lower_ptr + xs_upper0 + 2,
      ys_input_upper_ptr + xs_lower0 + 2, ys_input_upper_ptr + xs_upper0 + 2,
      ys_input_lower_ptr + xs_lower1, ys_input_lower_ptr + xs_upper1,
      ys_input_upper_ptr + xs_lower1, ys_input_upper_ptr + xs_upper1, xs_ilerp0,
      y_lerpsx);

  const int32x2_t x1c1x1c2 = ComputeLerpx2<RESOLUTION, true>(
      ys_input_lower_ptr + xs_lower1 + 1, ys_input_lower_ptr + xs_upper1 + 1,
      ys_input_upper_ptr + xs_lower1 + 1, ys_input_upper_ptr + xs_upper1 + 1,
      ys_input_lower_ptr + xs_lower1 + 2, ys_input_lower_ptr + xs_upper1 + 2,
      ys_input_upper_ptr + xs_lower1 + 2, ys_input_upper_ptr + xs_upper1 + 2,
      xs_ilerp1, y_lerpsx);

  const int32x2_t x2c0x2c1 = ComputeLerpx2<RESOLUTION, true>(
      ys_input_lower_ptr + xs_lower2, ys_input_lower_ptr + xs_upper2,
      ys_input_upper_ptr + xs_lower2, ys_input_upper_ptr + xs_upper2,
      ys_input_lower_ptr + xs_lower2 + 1, ys_input_lower_ptr + xs_upper2 + 1,
      ys_input_upper_ptr + xs_lower2 + 1, ys_input_upper_ptr + xs_upper2 + 1,
      xs_ilerp2, y_lerpsx);

  const int32x2_t x2c2x3c0 = ComputeLerpx2<RESOLUTION, false>(
      ys_input_lower_ptr + xs_lower2 + 2, ys_input_lower_ptr + xs_upper2 + 2,
      ys_input_upper_ptr + xs_lower2 + 2, ys_input_upper_ptr + xs_upper2 + 2,
      ys_input_lower_ptr + xs_lower3, ys_input_lower_ptr + xs_upper3,
      ys_input_upper_ptr + xs_lower3, ys_input_upper_ptr + xs_upper3, xs_ilerp2,
      y_lerpsx);

  const int32x2_t x3c1x3c2 = ComputeLerpx2<RESOLUTION, true>(
      ys_input_lower_ptr + xs_lower3 + 1, ys_input_lower_ptr + xs_upper3 + 1,
      ys_input_upper_ptr + xs_lower3 + 1, ys_input_upper_ptr + xs_upper3 + 1,
      ys_input_lower_ptr + xs_lower3 + 2, ys_input_lower_ptr + xs_upper3 + 2,
      ys_input_upper_ptr + xs_lower3 + 2, ys_input_upper_ptr + xs_upper3 + 2,
      xs_ilerp3, y_lerpsx);

  const int32x4_t x0c0x0c1x0c2x1c0 = vcombine_s32(x0c0x0c1, x0c2x1c0);
  const int32x4_t x1c1x1c2x2c0x2c1 = vcombine_s32(x1c1x1c2, x2c0x2c1);
  const int32x4_t x2c2x3c0x3c1x3c2 = vcombine_s32(x2c2x3c0, x3c1x3c2);

  vst1q_s32(reinterpret_cast<int32*>(output_y_ptr + x_start * 3),
            x0c0x0c1x0c2x1c0);
  vst1q_s32(reinterpret_cast<int32*>(output_y_ptr + x_start * 3 + 4),
            x1c1x1c2x2c0x2c1);
  vst1q_s32(reinterpret_cast<int32*>(output_y_ptr + x_start * 3 + 8),
            x2c2x3c0x3c1x3c2);

#else
  for (int x = x_start; x < x_start + 4; ++x) {
    OutputLerpForChannels<RESOLUTION, qint32, int32, int64_t>(
        xs, x, ys_ilerp, 3, min, max, ys_input_lower_ptr, ys_input_upper_ptr,
        output_y_ptr);
  }
#endif
}

template <typename T>
void ResizeImageReference(typename TTypes<T, 4>::ConstTensor images,
                          const int batch_size, const int64_t in_height,
                          const int64_t in_width, const int64_t out_height,
                          const int64_t out_width, const int channels,
                          const float height_scale, const float width_scale,
                          const float in_min, const float in_max,
                          const bool half_pixel_centers,
                          typename TTypes<T, 4>::Tensor* output) {
  CHECK_NOTNULL(output);

  const InterpolationCache<float> xs = BuildLerpCache<float>(
      out_width, in_width, width_scale, channels, 0, half_pixel_centers);
  const InterpolationCache<float> ys = BuildLerpCache<float>(
      out_height, in_height, height_scale, 1, 0, half_pixel_centers);

  const int64_t in_row_size = in_width * channels;
  const int64_t in_batch_num_values = in_height * in_row_size;
  const int64_t out_row_size = out_width * channels;

  const T* input_b_ptr = images.data();

  T* output_y_ptr = output->data();
  for (int b = 0; b < batch_size; ++b) {
    for (int64_t y = 0; y < out_height; ++y) {
      const T* ys_input_lower_ptr = input_b_ptr + ys.lower[y] * in_row_size;
      const T* ys_input_upper_ptr = input_b_ptr + ys.upper[y] * in_row_size;
      const float ys_lerp = ys.lerp[y];
      for (int64_t x = 0; x < out_width; ++x) {
        const int64_t xs_lower = xs.lower[x];
        const int64_t xs_upper = xs.upper[x];
        const float xs_lerp = xs.lerp[x];
        for (int c = 0; c < channels; ++c) {
          const T top_left = ys_input_lower_ptr[xs_lower + c];
          const T top_right = ys_input_lower_ptr[xs_upper + c];
          const T bottom_left = ys_input_upper_ptr[xs_lower + c];
          const T bottom_right = ys_input_upper_ptr[xs_upper + c];
          const T val = ComputeLerpReference<T>(
              top_left, top_right, bottom_left, bottom_right, xs_lerp, ys_lerp,
              in_min, in_max);
          output_y_ptr[x * channels + c] = val;
        }
      }
      output_y_ptr += out_row_size;
    }
    input_b_ptr += in_batch_num_values;
  }
}

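// Generic fallback: types without a specialization below are resized via
// the float reference implementation.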
template <typename T>
void ResizeImage(typename TTypes<T, 4>::ConstTensor images,
                 const int batch_size, const int64_t in_height,
                 const int64_t in_width, const int64_t out_height,
                 const int64_t out_width, const int channels,
                 const float height_scale, const float width_scale,
                 const float in_min, const float in_max,
                 const bool half_pixel_centers,
                 typename TTypes<T, 4>::Tensor* output) {
  ResizeImageReference<T>(images, batch_size, in_height, in_width, out_height,
                          out_width, channels, height_scale, width_scale,
                          in_min, in_max, half_pixel_centers, output);
}

template <>
void ResizeImage<qint32>(typename TTypes<qint32, 4>::ConstTensor images,
                         const int batch_size, const int64_t in_height,
                         const int64_t in_width, const int64_t out_height,
                         const int64_t out_width, const int channels,
                         const float height_scale, const float width_scale,
                         const float in_min, const float in_max,
                         const bool half_pixel_centers,
                         typename TTypes<qint32, 4>::Tensor* output) {
  // 30 is the maximum resolution: with more fractional bits, the scaled
  // interpolation weights (and 1 << RESOLUTION) would overflow a signed
  // 32-bit int.
  constexpr int RESOLUTION = 30;
  constexpr int SIMD_STEP = 4;

  CHECK_NOTNULL(output);

  const InterpolationCache<int32> xs =
      BuildLerpCache<int32>(out_width, in_width, width_scale, channels,
                            RESOLUTION, half_pixel_centers);
  const InterpolationCache<int32> ys = BuildLerpCache<int32>(
      out_height, in_height, height_scale, 1, RESOLUTION, half_pixel_centers);

  const int64_t in_row_size = in_width * channels;
  const int64_t in_batch_num_values = in_height * in_row_size;
  const int64_t out_row_size = out_width * channels;

  const qint32* input_b_ptr = images.data();

  qint32* output_y_ptr = output->data();

  for (int b = 0; b < batch_size; ++b) {
    for (int64_t y = 0; y < out_height; ++y) {
      const qint32* ys_input_lower_ptr =
          input_b_ptr + ys.lower[y] * in_row_size;
      const qint32* ys_input_upper_ptr =
          input_b_ptr + ys.upper[y] * in_row_size;
      const int32_t ys_ilerp = ys.ilerp[y];
      // Optimized for channels == 1 and channels == 3, which are the most
      // common cases.
      int64_t x = 0;
      if (channels == 1) {
        for (; x < out_width - SIMD_STEP + 1; x += SIMD_STEP) {
          OutputLerp32x4x1<RESOLUTION>(xs, x, ys_ilerp, in_min, in_max,
                                       ys_input_lower_ptr, ys_input_upper_ptr,
                                       output_y_ptr);
        }
      } else if (channels == 3) {
        for (; x < out_width - SIMD_STEP + 1; x += SIMD_STEP) {
          OutputLerp32x4x3<RESOLUTION>(xs, x, ys_ilerp, in_min, in_max,
                                       ys_input_lower_ptr, ys_input_upper_ptr,
                                       output_y_ptr);
        }
      }
      for (; x < out_width; ++x) {
        OutputLerpForChannels<RESOLUTION, qint32, int32, int64_t>(
            xs, x, ys_ilerp, channels, in_min, in_max, ys_input_lower_ptr,
            ys_input_upper_ptr, output_y_ptr);
      }
      output_y_ptr += out_row_size;
    }
    input_b_ptr += in_batch_num_values;
  }
}

template <>
void ResizeImage<quint8>(typename TTypes<quint8, 4>::ConstTensor images,
                         const int batch_size, const int64_t in_height,
                         const int64_t in_width, const int64_t out_height,
                         const int64_t out_width, const int channels,
                         const float height_scale, const float width_scale,
                         const float in_min, const float in_max,
                         const bool half_pixel_centers,
                         typename TTypes<quint8, 4>::Tensor* output) {
  // 7 is the maximum resolution for an unsigned byte: an 8-bit pixel value
  // scaled by 1 << 7 still fits in a signed 16-bit int (255 * 128 <= 32767).
  constexpr int RESOLUTION = 7;
  constexpr int SIMD_STEP = 8;

  CHECK_NOTNULL(output);

  const InterpolationCache<int16> xs =
      BuildLerpCache<int16>(out_width, in_width, width_scale, channels,
                            RESOLUTION, half_pixel_centers);
  const InterpolationCache<int16> ys = BuildLerpCache<int16>(
      out_height, in_height, height_scale, 1, RESOLUTION, half_pixel_centers);

  const int64_t in_row_size = in_width * channels;
  const int64_t in_batch_num_values = in_height * in_row_size;
  const int64_t out_row_size = out_width * channels;

  const quint8* input_b_ptr = images.data();

  quint8* output_y_ptr = output->data();

  for (int b = 0; b < batch_size; ++b) {
    for (int64_t y = 0; y < out_height; ++y) {
      const quint8* ys_input_lower_ptr =
          input_b_ptr + ys.lower[y] * in_row_size;
      const quint8* ys_input_upper_ptr =
          input_b_ptr + ys.upper[y] * in_row_size;
      const int32_t ys_ilerp = ys.ilerp[y];
      // Optimized for channels == 1 and channels == 3, which are the most
      // common cases.
      // TODO(satok): Support a more generic NEON-optimized implementation
      // for other channel counts.
      int64_t x = 0;
      if (channels == 1) {
        for (; x < out_width - SIMD_STEP + 1; x += SIMD_STEP) {
          OutputLerp8x8x1<RESOLUTION>(xs, x, ys_ilerp, in_min, in_max,
                                      ys_input_lower_ptr, ys_input_upper_ptr,
                                      output_y_ptr);
        }
      } else if (channels == 3) {
        for (; x < out_width - SIMD_STEP + 1; x += SIMD_STEP) {
          OutputLerp8x8x3<RESOLUTION>(xs, x, ys_ilerp, in_min, in_max,
                                      ys_input_lower_ptr, ys_input_upper_ptr,
                                      output_y_ptr);
        }
      }
      for (; x < out_width; ++x) {
        OutputLerpForChannels<RESOLUTION, quint8, int16, int16>(
            xs, x, ys_ilerp, channels, in_min, in_max, ys_input_lower_ptr,
            ys_input_upper_ptr, output_y_ptr);
      }
      output_y_ptr += out_row_size;
    }
    input_b_ptr += in_batch_num_values;
  }
}

template <typename T>
void ResizeBilinear(const typename TTypes<T, 4>::ConstTensor& images,
                    const float height_scale, const float width_scale,
                    const float in_min, const float in_max,
                    const bool half_pixel_centers,
                    typename TTypes<T, 4>::Tensor* output) {
  CHECK_NOTNULL(output);

  const int batch_size = images.dimension(0);
  const int64_t in_height = images.dimension(1);
  const int64_t in_width = images.dimension(2);
  const int channels = images.dimension(3);

  const int64_t out_height = output->dimension(1);
  const int64_t out_width = output->dimension(2);

  // Handle no-op resizes efficiently.
  if (out_height == in_height && out_width == in_width) {
    *output = images.template cast<T>();
    return;
  }

  if (USE_REFERENCE) {
    ResizeImageReference<T>(images, batch_size, in_height, in_width, out_height,
                            out_width, channels, height_scale, width_scale,
                            in_min, in_max, half_pixel_centers, output);
  } else {
    ResizeImage<T>(images, batch_size, in_height, in_width, out_height,
                   out_width, channels, height_scale, width_scale, in_min,
                   in_max, half_pixel_centers, output);
  }
}

}  // namespace

template <class T>
class QuantizedResizeBilinearOp : public OpKernel {
 public:
  explicit QuantizedResizeBilinearOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
    OP_REQUIRES_OK(
        context, context->GetAttr("half_pixel_centers", &half_pixel_centers_));
  }

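  // Inputs: 0 = images, 1 = size (consumed by ImageResizerState), 2 = min,
  // 3 = max. Outputs: 0 = resized images, 1 = out_min, 2 = out_max. The
  // quantization range is passed through unchanged.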
  void Compute(OpKernelContext* context) override {
    const auto& in_min_tensor = context->input(2);
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(in_min_tensor.shape()),
                errors::InvalidArgument("min must be a scalar"));
    const float in_min = in_min_tensor.flat<float>()(0);
    const auto& in_max_tensor = context->input(3);
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(in_max_tensor.shape()),
                errors::InvalidArgument("max must be a scalar"));
    const float in_max = in_max_tensor.flat<float>()(0);

    ImageResizerState st(align_corners_, false);
    st.ValidateAndCreateOutput(context);

    if (!context->status().ok()) return;

    // Return if the output is empty.
    if (st.output->NumElements() == 0) return;

    typename TTypes<T, 4>::ConstTensor image_data(
        context->input(0).tensor<T, 4>());
    typename TTypes<T, 4>::Tensor output_data(st.output->tensor<T, 4>());

    ResizeBilinear<T>(image_data, st.height_scale, st.width_scale, in_min,
                      in_max, half_pixel_centers_, &output_data);
    Tensor* out_min = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, {}, &out_min));
    out_min->flat<float>()(0) = in_min;

    Tensor* out_max = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(2, {}, &out_max));
    out_max->flat<float>()(0) = in_max;
  }

 private:
  bool align_corners_;
  bool half_pixel_centers_;

  TF_DISALLOW_COPY_AND_ASSIGN(QuantizedResizeBilinearOp<T>);
};

#define REGISTER_CPU_KERNEL(type)                         \
  REGISTER_KERNEL_BUILDER(Name("QuantizedResizeBilinear") \
                              .Device(DEVICE_CPU)         \
                              .HostMemory("size")         \
                              .TypeConstraint<type>("T"), \
                          QuantizedResizeBilinearOp<type>)

REGISTER_CPU_KERNEL(::tensorflow::quint8);
REGISTER_CPU_KERNEL(::tensorflow::qint32);
REGISTER_CPU_KERNEL(float);

}  // namespace tensorflow