1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "test/hiprec_convolve_test_util.h"
13
14 #include <memory>
15 #include <new>
16
17 #include "av1/common/restoration.h"
18
19 using std::make_tuple;
20 using std::tuple;
21
22 namespace libaom_test {
23
24 // Generate a random pair of filter kernels, using the ranges
25 // of possible values from the loop-restoration experiment
generate_kernels(ACMRandom * rnd,InterpKernel hkernel,InterpKernel vkernel,int kernel_type=2)26 static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
27 InterpKernel vkernel, int kernel_type = 2) {
28 if (kernel_type == 0) {
29 // Low possible values for filter coefficients, 7-tap kernel
30 hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MINV;
31 hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
32 hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
33 hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
34 hkernel[7] = vkernel[7] = 0;
35 } else if (kernel_type == 1) {
36 // Max possible values for filter coefficients, 7-tap kernel
37 hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MAXV;
38 hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
39 hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
40 hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
41 hkernel[7] = vkernel[7] = 0;
42 } else if (kernel_type == 2) {
43 // Randomly generated values for filter coefficients, 7-tap kernel
44 hkernel[0] = hkernel[6] =
45 WIENER_FILT_TAP0_MINV +
46 rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
47 hkernel[1] = hkernel[5] =
48 WIENER_FILT_TAP1_MINV +
49 rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
50 hkernel[2] = hkernel[4] =
51 WIENER_FILT_TAP2_MINV +
52 rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
53 hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
54 hkernel[7] = 0;
55
56 vkernel[0] = vkernel[6] =
57 WIENER_FILT_TAP0_MINV +
58 rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 2 - WIENER_FILT_TAP0_MINV);
59 vkernel[1] = vkernel[5] =
60 WIENER_FILT_TAP1_MINV +
61 rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 2 - WIENER_FILT_TAP1_MINV);
62 vkernel[2] = vkernel[4] =
63 WIENER_FILT_TAP2_MINV +
64 rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
65 vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
66 vkernel[7] = 0;
67 } else if (kernel_type == 3) {
68 // Low possible values for filter coefficients, 5-tap kernel
69 hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = 0;
70 hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
71 hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
72 hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
73 hkernel[7] = vkernel[7] = 0;
74 } else if (kernel_type == 4) {
75 // Max possible values for filter coefficients, 5-tap kernel
76 hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = 0;
77 hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
78 hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
79 hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
80 hkernel[7] = vkernel[7] = 0;
81 } else {
82 // Randomly generated values for filter coefficients, 5-tap kernel
83 hkernel[0] = hkernel[6] = 0;
84 hkernel[1] = hkernel[5] =
85 WIENER_FILT_TAP1_MINV +
86 rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
87 hkernel[2] = hkernel[4] =
88 WIENER_FILT_TAP2_MINV +
89 rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
90 hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
91 hkernel[7] = 0;
92
93 vkernel[0] = vkernel[6] = 0;
94 vkernel[1] = vkernel[5] =
95 WIENER_FILT_TAP1_MINV +
96 rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 2 - WIENER_FILT_TAP1_MINV);
97 vkernel[2] = vkernel[4] =
98 WIENER_FILT_TAP2_MINV +
99 rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
100 vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
101 vkernel[7] = 0;
102 }
103 }
104
105 namespace AV1HiprecConvolve {
106
BuildParams(hiprec_convolve_func filter)107 ::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
108 hiprec_convolve_func filter) {
109 const HiprecConvolveParam params[] = {
110 make_tuple(8, 8, 50000, filter), make_tuple(8, 4, 50000, filter),
111 make_tuple(64, 24, 1000, filter), make_tuple(64, 64, 1000, filter),
112 make_tuple(64, 56, 1000, filter), make_tuple(32, 8, 10000, filter),
113 make_tuple(32, 28, 10000, filter), make_tuple(32, 32, 10000, filter),
114 make_tuple(16, 34, 10000, filter), make_tuple(32, 34, 10000, filter),
115 make_tuple(64, 34, 1000, filter), make_tuple(8, 17, 10000, filter),
116 make_tuple(16, 17, 10000, filter), make_tuple(32, 17, 10000, filter)
117 };
118 return ::testing::ValuesIn(params);
119 }
120
121 AV1HiprecConvolveTest::~AV1HiprecConvolveTest() = default;
SetUp()122 void AV1HiprecConvolveTest::SetUp() {
123 rnd_.Reset(ACMRandom::DeterministicSeed());
124 }
125
RunCheckOutput(hiprec_convolve_func test_impl)126 void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
127 const int w = 128, h = 128;
128 const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
129 const int num_iters = GET_PARAM(2);
130 int i, j, k, m;
131 const WienerConvolveParams conv_params = get_conv_params_wiener(8);
132
133 std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
134 ASSERT_NE(input_, nullptr);
135 uint8_t *input = input_.get();
136
137 // The AVX2 convolve functions always write rows with widths that are
138 // multiples of 16. So to avoid a buffer overflow, we may need to pad
139 // rows to a multiple of 16.
140 int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
141 std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
142 ASSERT_NE(output, nullptr);
143 std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
144 ASSERT_NE(output2, nullptr);
145
146 // Generate random filter kernels
147 DECLARE_ALIGNED(16, InterpKernel, hkernel);
148 DECLARE_ALIGNED(16, InterpKernel, vkernel);
149
150 for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
151 generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
152 for (i = 0; i < num_iters; ++i) {
153 for (k = 0; k < h; ++k)
154 for (m = 0; m < w; ++m) input[k * w + m] = rnd_.Rand8();
155 // Choose random locations within the source block
156 int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
157 int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
158 av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w,
159 output.get(), out_w, hkernel, 16, vkernel,
160 16, out_w, out_h, &conv_params);
161 test_impl(input + offset_r * w + offset_c, w, output2.get(), out_w,
162 hkernel, 16, vkernel, 16, out_w, out_h, &conv_params);
163
164 for (j = 0; j < out_w * out_h; ++j)
165 ASSERT_EQ(output[j], output2[j])
166 << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
167 << (j / out_w) << ") on iteration " << i;
168 }
169 }
170 }
171
RunSpeedTest(hiprec_convolve_func test_impl)172 void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
173 const int w = 128, h = 128;
174 const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
175 const int num_iters = GET_PARAM(2) / 500;
176 int i, j, k;
177 const WienerConvolveParams conv_params = get_conv_params_wiener(8);
178
179 std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
180 ASSERT_NE(input_, nullptr);
181 uint8_t *input = input_.get();
182
183 // The AVX2 convolve functions always write rows with widths that are
184 // multiples of 16. So to avoid a buffer overflow, we may need to pad
185 // rows to a multiple of 16.
186 int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
187 std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
188 ASSERT_NE(output, nullptr);
189 std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
190 ASSERT_NE(output2, nullptr);
191
192 // Generate random filter kernels
193 DECLARE_ALIGNED(16, InterpKernel, hkernel);
194 DECLARE_ALIGNED(16, InterpKernel, vkernel);
195
196 generate_kernels(&rnd_, hkernel, vkernel);
197
198 for (i = 0; i < h; ++i)
199 for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
200
201 aom_usec_timer ref_timer;
202 aom_usec_timer_start(&ref_timer);
203 for (i = 0; i < num_iters; ++i) {
204 for (j = 3; j < h - out_h - 4; j++) {
205 for (k = 3; k < w - out_w - 4; k++) {
206 av1_wiener_convolve_add_src_c(input + j * w + k, w, output.get(), out_w,
207 hkernel, 16, vkernel, 16, out_w, out_h,
208 &conv_params);
209 }
210 }
211 }
212 aom_usec_timer_mark(&ref_timer);
213 const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
214
215 aom_usec_timer tst_timer;
216 aom_usec_timer_start(&tst_timer);
217 for (i = 0; i < num_iters; ++i) {
218 for (j = 3; j < h - out_h - 4; j++) {
219 for (k = 3; k < w - out_w - 4; k++) {
220 test_impl(input + j * w + k, w, output2.get(), out_w, hkernel, 16,
221 vkernel, 16, out_w, out_h, &conv_params);
222 }
223 }
224 }
225 aom_usec_timer_mark(&tst_timer);
226 const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
227
228 std::cout << "[ ] C time = " << ref_time / 1000
229 << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
230
231 EXPECT_GT(ref_time, tst_time)
232 << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
233 << "C time: " << ref_time << " us\n"
234 << "SIMD time: " << tst_time << " us\n";
235 }
236 } // namespace AV1HiprecConvolve
237
238 #if CONFIG_AV1_HIGHBITDEPTH
239 namespace AV1HighbdHiprecConvolve {
240
BuildParams(highbd_hiprec_convolve_func filter)241 ::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
242 highbd_hiprec_convolve_func filter) {
243 const HighbdHiprecConvolveParam params[] = {
244 make_tuple(8, 8, 50000, 8, filter), make_tuple(64, 64, 1000, 8, filter),
245 make_tuple(32, 8, 10000, 8, filter), make_tuple(8, 8, 50000, 10, filter),
246 make_tuple(64, 64, 1000, 10, filter), make_tuple(32, 8, 10000, 10, filter),
247 make_tuple(8, 8, 50000, 12, filter), make_tuple(64, 64, 1000, 12, filter),
248 make_tuple(32, 8, 10000, 12, filter),
249 };
250 return ::testing::ValuesIn(params);
251 }
252
253 AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() = default;
SetUp()254 void AV1HighbdHiprecConvolveTest::SetUp() {
255 rnd_.Reset(ACMRandom::DeterministicSeed());
256 }
257
RunCheckOutput(highbd_hiprec_convolve_func test_impl)258 void AV1HighbdHiprecConvolveTest::RunCheckOutput(
259 highbd_hiprec_convolve_func test_impl) {
260 const int w = 128, h = 128;
261 const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
262 const int num_iters = GET_PARAM(2);
263 const int bd = GET_PARAM(3);
264 int i, j;
265 const WienerConvolveParams conv_params = get_conv_params_wiener(bd);
266
267 std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
268 ASSERT_NE(input, nullptr);
269
270 // The AVX2 convolve functions always write rows with widths that are
271 // multiples of 16. So to avoid a buffer overflow, we may need to pad
272 // rows to a multiple of 16.
273 int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
274 std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
275 ASSERT_NE(output, nullptr);
276 std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
277 ASSERT_NE(output2, nullptr);
278
279 // Generate random filter kernels
280 DECLARE_ALIGNED(16, InterpKernel, hkernel);
281 DECLARE_ALIGNED(16, InterpKernel, vkernel);
282
283 for (i = 0; i < h; ++i)
284 for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
285
286 uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input.get());
287 uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output.get());
288 uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2.get());
289 for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
290 generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
291 for (i = 0; i < num_iters; ++i) {
292 // Choose random locations within the source block
293 int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
294 int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
295 av1_highbd_wiener_convolve_add_src_c(
296 input_ptr + offset_r * w + offset_c, w, output_ptr, out_w, hkernel,
297 16, vkernel, 16, out_w, out_h, &conv_params, bd);
298 test_impl(input_ptr + offset_r * w + offset_c, w, output2_ptr, out_w,
299 hkernel, 16, vkernel, 16, out_w, out_h, &conv_params, bd);
300
301 for (j = 0; j < out_w * out_h; ++j)
302 ASSERT_EQ(output[j], output2[j])
303 << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
304 << (j / out_w) << ") on iteration " << i;
305 }
306 }
307 }
308
RunSpeedTest(highbd_hiprec_convolve_func test_impl)309 void AV1HighbdHiprecConvolveTest::RunSpeedTest(
310 highbd_hiprec_convolve_func test_impl) {
311 const int w = 128, h = 128;
312 const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
313 const int num_iters = GET_PARAM(2) / 500;
314 const int bd = GET_PARAM(3);
315 int i, j, k;
316 const WienerConvolveParams conv_params = get_conv_params_wiener(bd);
317
318 std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
319 ASSERT_NE(input, nullptr);
320
321 // The AVX2 convolve functions always write rows with widths that are
322 // multiples of 16. So to avoid a buffer overflow, we may need to pad
323 // rows to a multiple of 16.
324 int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
325 std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
326 ASSERT_NE(output, nullptr);
327 std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
328 ASSERT_NE(output2, nullptr);
329
330 // Generate random filter kernels
331 DECLARE_ALIGNED(16, InterpKernel, hkernel);
332 DECLARE_ALIGNED(16, InterpKernel, vkernel);
333
334 generate_kernels(&rnd_, hkernel, vkernel);
335
336 for (i = 0; i < h; ++i)
337 for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
338
339 uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input.get());
340 uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output.get());
341 uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2.get());
342
343 aom_usec_timer ref_timer;
344 aom_usec_timer_start(&ref_timer);
345 for (i = 0; i < num_iters; ++i) {
346 for (j = 3; j < h - out_h - 4; j++) {
347 for (k = 3; k < w - out_w - 4; k++) {
348 av1_highbd_wiener_convolve_add_src_c(
349 input_ptr + j * w + k, w, output_ptr, out_w, hkernel, 16, vkernel,
350 16, out_w, out_h, &conv_params, bd);
351 }
352 }
353 }
354 aom_usec_timer_mark(&ref_timer);
355 const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
356
357 aom_usec_timer tst_timer;
358 aom_usec_timer_start(&tst_timer);
359 for (i = 0; i < num_iters; ++i) {
360 for (j = 3; j < h - out_h - 4; j++) {
361 for (k = 3; k < w - out_w - 4; k++) {
362 test_impl(input_ptr + j * w + k, w, output2_ptr, out_w, hkernel, 16,
363 vkernel, 16, out_w, out_h, &conv_params, bd);
364 }
365 }
366 }
367 aom_usec_timer_mark(&tst_timer);
368 const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
369
370 std::cout << "[ ] C time = " << ref_time / 1000
371 << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
372
373 EXPECT_GT(ref_time, tst_time)
374 << "Error: AV1HighbdHiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
375 << "C time: " << ref_time << " us\n"
376 << "SIMD time: " << tst_time << " us\n";
377 }
378 } // namespace AV1HighbdHiprecConvolve
379 #endif // CONFIG_AV1_HIGHBITDEPTH
380 } // namespace libaom_test
381