libaom/test/hiprec_convolve_test_util.cc

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "test/hiprec_convolve_test_util.h"

#include <memory>
#include <new>

#include "av1/common/restoration.h"

using std::make_tuple;
using std::tuple;

namespace libaom_test {

// Generate a random pair of filter kernels, using the ranges
// of possible values from the loop-restoration experiment
static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
                             InterpKernel vkernel, int kernel_type = 2) {
  if (kernel_type == 0) {
    // Low possible values for filter coefficients, 7-tap kernel
    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MINV;
    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
    hkernel[7] = vkernel[7] = 0;
  } else if (kernel_type == 1) {
    // Max possible values for filter coefficients, 7-tap kernel
    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MAXV;
    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
    hkernel[7] = vkernel[7] = 0;
  } else if (kernel_type == 2) {
    // Randomly generated values for filter coefficients, 7-tap kernel
    hkernel[0] = hkernel[6] =
        WIENER_FILT_TAP0_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
    hkernel[1] = hkernel[5] =
        WIENER_FILT_TAP1_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
    hkernel[2] = hkernel[4] =
        WIENER_FILT_TAP2_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
    hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
    hkernel[7] = 0;

    vkernel[0] = vkernel[6] =
        WIENER_FILT_TAP0_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 2 - WIENER_FILT_TAP0_MINV);
    vkernel[1] = vkernel[5] =
        WIENER_FILT_TAP1_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 2 - WIENER_FILT_TAP1_MINV);
    vkernel[2] = vkernel[4] =
        WIENER_FILT_TAP2_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
    vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
    vkernel[7] = 0;
  } else if (kernel_type == 3) {
    // Low possible values for filter coefficients, 5-tap kernel
    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = 0;
    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
    hkernel[7] = vkernel[7] = 0;
  } else if (kernel_type == 4) {
    // Max possible values for filter coefficients, 5-tap kernel
    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = 0;
    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
    hkernel[7] = vkernel[7] = 0;
  } else {
    // Randomly generated values for filter coefficients, 5-tap kernel
    hkernel[0] = hkernel[6] = 0;
    hkernel[1] = hkernel[5] =
        WIENER_FILT_TAP1_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
    hkernel[2] = hkernel[4] =
        WIENER_FILT_TAP2_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
    hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
    hkernel[7] = 0;

    vkernel[0] = vkernel[6] = 0;
    vkernel[1] = vkernel[5] =
        WIENER_FILT_TAP1_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 2 - WIENER_FILT_TAP1_MINV);
    vkernel[2] = vkernel[4] =
        WIENER_FILT_TAP2_MINV +
        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
    vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
    vkernel[7] = 0;
  }
}

namespace AV1HiprecConvolve {

::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
    hiprec_convolve_func filter) {
  const HiprecConvolveParam params[] = {
    make_tuple(8, 8, 50000, filter),   make_tuple(8, 4, 50000, filter),
    make_tuple(64, 24, 1000, filter),  make_tuple(64, 64, 1000, filter),
    make_tuple(64, 56, 1000, filter),  make_tuple(32, 8, 10000, filter),
    make_tuple(32, 28, 10000, filter), make_tuple(32, 32, 10000, filter),
    make_tuple(16, 34, 10000, filter), make_tuple(32, 34, 10000, filter),
    make_tuple(64, 34, 1000, filter),  make_tuple(8, 17, 10000, filter),
    make_tuple(16, 17, 10000, filter), make_tuple(32, 17, 10000, filter)
  };
  return ::testing::ValuesIn(params);
}

AV1HiprecConvolveTest::~AV1HiprecConvolveTest() = default;
void AV1HiprecConvolveTest::SetUp() {
  rnd_.Reset(ACMRandom::DeterministicSeed());
}

void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
  const int w = 128, h = 128;
  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
  const int num_iters = GET_PARAM(2);
  int i, j, k, m;
  const WienerConvolveParams conv_params = get_conv_params_wiener(8);

  std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
  ASSERT_NE(input_, nullptr);
  uint8_t *input = input_.get();

  // The AVX2 convolve functions always write rows with widths that are
  // multiples of 16. So to avoid a buffer overflow, we may need to pad
  // rows to a multiple of 16.
  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
  std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
  ASSERT_NE(output, nullptr);
  std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
  ASSERT_NE(output2, nullptr);

  // Generate random filter kernels
  DECLARE_ALIGNED(16, InterpKernel, hkernel);
  DECLARE_ALIGNED(16, InterpKernel, vkernel);

  for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
    generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
    for (i = 0; i < num_iters; ++i) {
      for (k = 0; k < h; ++k)
        for (m = 0; m < w; ++m) input[k * w + m] = rnd_.Rand8();
      // Choose random locations within the source block
      int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
      int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
      av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w,
                                    output.get(), out_w, hkernel, 16, vkernel,
                                    16, out_w, out_h, &conv_params);
      test_impl(input + offset_r * w + offset_c, w, output2.get(), out_w,
                hkernel, 16, vkernel, 16, out_w, out_h, &conv_params);

      for (j = 0; j < out_w * out_h; ++j)
        ASSERT_EQ(output[j], output2[j])
            << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
            << (j / out_w) << ") on iteration " << i;
    }
  }
}

void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
  const int w = 128, h = 128;
  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
  const int num_iters = GET_PARAM(2) / 500;
  int i, j, k;
  const WienerConvolveParams conv_params = get_conv_params_wiener(8);

  std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
  ASSERT_NE(input_, nullptr);
  uint8_t *input = input_.get();

  // The AVX2 convolve functions always write rows with widths that are
  // multiples of 16. So to avoid a buffer overflow, we may need to pad
  // rows to a multiple of 16.
  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
  std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
  ASSERT_NE(output, nullptr);
  std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
  ASSERT_NE(output2, nullptr);

  // Generate random filter kernels
  DECLARE_ALIGNED(16, InterpKernel, hkernel);
  DECLARE_ALIGNED(16, InterpKernel, vkernel);

  generate_kernels(&rnd_, hkernel, vkernel);

  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();

  aom_usec_timer ref_timer;
  aom_usec_timer_start(&ref_timer);
  for (i = 0; i < num_iters; ++i) {
    for (j = 3; j < h - out_h - 4; j++) {
      for (k = 3; k < w - out_w - 4; k++) {
        av1_wiener_convolve_add_src_c(input + j * w + k, w, output.get(), out_w,
                                      hkernel, 16, vkernel, 16, out_w, out_h,
                                      &conv_params);
      }
    }
  }
  aom_usec_timer_mark(&ref_timer);
  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);

  aom_usec_timer tst_timer;
  aom_usec_timer_start(&tst_timer);
  for (i = 0; i < num_iters; ++i) {
    for (j = 3; j < h - out_h - 4; j++) {
      for (k = 3; k < w - out_w - 4; k++) {
        test_impl(input + j * w + k, w, output2.get(), out_w, hkernel, 16,
                  vkernel, 16, out_w, out_h, &conv_params);
      }
    }
  }
  aom_usec_timer_mark(&tst_timer);
  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);

  std::cout << "[          ] C time = " << ref_time / 1000
            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";

  EXPECT_GT(ref_time, tst_time)
      << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
      << "C time: " << ref_time << " us\n"
      << "SIMD time: " << tst_time << " us\n";
}
}  // namespace AV1HiprecConvolve

#if CONFIG_AV1_HIGHBITDEPTH
namespace AV1HighbdHiprecConvolve {

::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
    highbd_hiprec_convolve_func filter) {
  const HighbdHiprecConvolveParam params[] = {
    make_tuple(8, 8, 50000, 8, filter),   make_tuple(64, 64, 1000, 8, filter),
    make_tuple(32, 8, 10000, 8, filter),  make_tuple(8, 8, 50000, 10, filter),
    make_tuple(64, 64, 1000, 10, filter), make_tuple(32, 8, 10000, 10, filter),
    make_tuple(8, 8, 50000, 12, filter),  make_tuple(64, 64, 1000, 12, filter),
    make_tuple(32, 8, 10000, 12, filter),
  };
  return ::testing::ValuesIn(params);
}

AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() = default;
void AV1HighbdHiprecConvolveTest::SetUp() {
  rnd_.Reset(ACMRandom::DeterministicSeed());
}

void AV1HighbdHiprecConvolveTest::RunCheckOutput(
    highbd_hiprec_convolve_func test_impl) {
  const int w = 128, h = 128;
  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
  const int num_iters = GET_PARAM(2);
  const int bd = GET_PARAM(3);
  int i, j;
  const WienerConvolveParams conv_params = get_conv_params_wiener(bd);

  std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
  ASSERT_NE(input, nullptr);

  // The AVX2 convolve functions always write rows with widths that are
  // multiples of 16. So to avoid a buffer overflow, we may need to pad
  // rows to a multiple of 16.
  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
  std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
  ASSERT_NE(output, nullptr);
  std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
  ASSERT_NE(output2, nullptr);

  // Generate random filter kernels
  DECLARE_ALIGNED(16, InterpKernel, hkernel);
  DECLARE_ALIGNED(16, InterpKernel, vkernel);

  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);

  uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input.get());
  uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output.get());
  uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2.get());
  for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
    generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
    for (i = 0; i < num_iters; ++i) {
      // Choose random locations within the source block
      int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
      int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
      av1_highbd_wiener_convolve_add_src_c(
          input_ptr + offset_r * w + offset_c, w, output_ptr, out_w, hkernel,
          16, vkernel, 16, out_w, out_h, &conv_params, bd);
      test_impl(input_ptr + offset_r * w + offset_c, w, output2_ptr, out_w,
                hkernel, 16, vkernel, 16, out_w, out_h, &conv_params, bd);

      for (j = 0; j < out_w * out_h; ++j)
        ASSERT_EQ(output[j], output2[j])
            << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
            << (j / out_w) << ") on iteration " << i;
    }
  }
}

void AV1HighbdHiprecConvolveTest::RunSpeedTest(
    highbd_hiprec_convolve_func test_impl) {
  const int w = 128, h = 128;
  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
  const int num_iters = GET_PARAM(2) / 500;
  const int bd = GET_PARAM(3);
  int i, j, k;
  const WienerConvolveParams conv_params = get_conv_params_wiener(bd);

  std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
  ASSERT_NE(input, nullptr);

  // The AVX2 convolve functions always write rows with widths that are
  // multiples of 16. So to avoid a buffer overflow, we may need to pad
  // rows to a multiple of 16.
  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
  std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
  ASSERT_NE(output, nullptr);
  std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
  ASSERT_NE(output2, nullptr);

  // Generate random filter kernels
  DECLARE_ALIGNED(16, InterpKernel, hkernel);
  DECLARE_ALIGNED(16, InterpKernel, vkernel);

  generate_kernels(&rnd_, hkernel, vkernel);

  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);

  uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input.get());
  uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output.get());
  uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2.get());

  aom_usec_timer ref_timer;
  aom_usec_timer_start(&ref_timer);
  for (i = 0; i < num_iters; ++i) {
    for (j = 3; j < h - out_h - 4; j++) {
      for (k = 3; k < w - out_w - 4; k++) {
        av1_highbd_wiener_convolve_add_src_c(
            input_ptr + j * w + k, w, output_ptr, out_w, hkernel, 16, vkernel,
            16, out_w, out_h, &conv_params, bd);
      }
    }
  }
  aom_usec_timer_mark(&ref_timer);
  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);

  aom_usec_timer tst_timer;
  aom_usec_timer_start(&tst_timer);
  for (i = 0; i < num_iters; ++i) {
    for (j = 3; j < h - out_h - 4; j++) {
      for (k = 3; k < w - out_w - 4; k++) {
        test_impl(input_ptr + j * w + k, w, output2_ptr, out_w, hkernel, 16,
                  vkernel, 16, out_w, out_h, &conv_params, bd);
      }
    }
  }
  aom_usec_timer_mark(&tst_timer);
  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);

  std::cout << "[          ] C time = " << ref_time / 1000
            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";

  EXPECT_GT(ref_time, tst_time)
      << "Error: AV1HighbdHiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
      << "C time: " << ref_time << " us\n"
      << "SIMD time: " << tst_time << " us\n";
}
}  // namespace AV1HighbdHiprecConvolve
#endif  // CONFIG_AV1_HIGHBITDEPTH
}  // namespace libaom_test