/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#ifndef VPX_VPX_DSP_X86_CONVOLVE_H_
#define VPX_VPX_DSP_X86_CONVOLVE_H_

#include <assert.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/compiler_attributes.h"

// TODO([email protected]): Refactor the code here. Currently this is pretty
// hacky and awful to read. Note that there is a filter_x[3] == 128 check in
// HIGH_FUN_CONV_2D to avoid a seg fault due to the fact that the C function
// assumes the filter is always 8 tap.

// Function type of a 1-D filter kernel working on 8-bit samples.
//   src_ptr / src_pitch:    source samples and their row pitch.
//   output_ptr / out_pitch: destination samples and their row pitch.
//   output_height:          number of rows to produce.
//   filter:                 pointer to the int16_t filter coefficients.
// NOTE(review): presumably this is the type of the vpx_filter_block1d*
// kernels invoked by FUN_CONV_1D, whose calls pass (source, src_stride, dst,
// dst_stride, h, filter_row) in this order -- confirm against the kernel
// declarations.
typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                uint8_t *output_ptr, ptrdiff_t out_pitch,
                                uint32_t output_height, const int16_t *filter);

// TODO([email protected]): Remove the is_avg argument to the MACROS once we
// have 4-tap vert avg filter.
29 #define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \ 30 void vpx_convolve8_##name##_##opt( \ 31 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ 32 ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ 33 int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ 34 const int16_t *filter_row = filter[offset]; \ 35 (void)x0_q4; \ 36 (void)x_step_q4; \ 37 (void)y0_q4; \ 38 (void)y_step_q4; \ 39 assert(filter_row[3] != 128); \ 40 assert(step_q4 == 16); \ 41 if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ 42 const int num_taps = 8; \ 43 while (w >= 16) { \ 44 vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ 45 dst_stride, h, filter_row); \ 46 src += 16; \ 47 dst += 16; \ 48 w -= 16; \ 49 } \ 50 if (w == 8) { \ 51 vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ 52 dst_stride, h, filter_row); \ 53 } else if (w == 4) { \ 54 vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ 55 dst_stride, h, filter_row); \ 56 } \ 57 (void)num_taps; \ 58 } else if (filter_row[2] | filter_row[5]) { \ 59 const int num_taps = is_avg ? 
8 : 4; \ 60 while (w >= 16) { \ 61 vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ 62 dst_stride, h, filter_row); \ 63 src += 16; \ 64 dst += 16; \ 65 w -= 16; \ 66 } \ 67 if (w == 8) { \ 68 vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ 69 dst_stride, h, filter_row); \ 70 } else if (w == 4) { \ 71 vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ 72 dst_stride, h, filter_row); \ 73 } \ 74 (void)num_taps; \ 75 } else { \ 76 const int num_taps = 2; \ 77 while (w >= 16) { \ 78 vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ 79 dst_stride, h, filter_row); \ 80 src += 16; \ 81 dst += 16; \ 82 w -= 16; \ 83 } \ 84 if (w == 8) { \ 85 vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ 86 dst_stride, h, filter_row); \ 87 } else if (w == 4) { \ 88 vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ 89 dst_stride, h, filter_row); \ 90 } \ 91 (void)num_taps; \ 92 } \ 93 } 94 95 #define FUN_CONV_2D(avg, opt, is_avg) \ 96 void vpx_convolve8_##avg##opt( \ 97 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ 98 ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ 99 int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ 100 const int16_t *filter_x = filter[x0_q4]; \ 101 const int16_t *filter_y = filter[y0_q4]; \ 102 (void)filter_y; \ 103 assert(filter_x[3] != 128); \ 104 assert(filter_y[3] != 128); \ 105 assert(w <= 64); \ 106 assert(h <= 64); \ 107 assert(x_step_q4 == 16); \ 108 assert(y_step_q4 == 16); \ 109 if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \ 110 DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ 111 vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ 112 filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ 113 h + 7); \ 114 vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ 115 filter, x0_q4, x_step_q4, y0_q4, \ 116 y_step_q4, w, h); \ 117 } else if 
(filter_x[2] | filter_x[5]) { \ 118 const int num_taps = is_avg ? 8 : 4; \ 119 DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ 120 vpx_convolve8_horiz_##opt( \ 121 src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ 122 filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \ 123 vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \ 124 dst, dst_stride, filter, x0_q4, \ 125 x_step_q4, y0_q4, y_step_q4, w, h); \ 126 } else { \ 127 DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ 128 vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ 129 x_step_q4, y0_q4, y_step_q4, w, h + 1); \ 130 vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ 131 x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ 132 h); \ 133 } \ 134 } 135 136 #if CONFIG_VP9_HIGHBITDEPTH 137 138 typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, 139 const ptrdiff_t src_pitch, 140 uint16_t *output_ptr, 141 ptrdiff_t out_pitch, 142 unsigned int output_height, 143 const int16_t *filter, int bd); 144 145 #define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \ 146 is_avg) \ 147 void vpx_highbd_convolve8_##name##_##opt( \ 148 const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ 149 ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ 150 int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ 151 const int16_t *filter_row = filter_kernel[offset]; \ 152 if (step_q4 == 16 && filter_row[3] != 128) { \ 153 if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ 154 const int num_taps = 8; \ 155 while (w >= 16) { \ 156 vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ 157 src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ 158 src += 16; \ 159 dst += 16; \ 160 w -= 16; \ 161 } \ 162 while (w >= 8) { \ 163 vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ 164 src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ 
165 src += 8; \ 166 dst += 8; \ 167 w -= 8; \ 168 } \ 169 while (w >= 4) { \ 170 vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ 171 src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ 172 src += 4; \ 173 dst += 4; \ 174 w -= 4; \ 175 } \ 176 (void)num_taps; \ 177 } else if (filter_row[2] | filter_row[5]) { \ 178 const int num_taps = is_avg ? 8 : 4; \ 179 while (w >= 16) { \ 180 vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ 181 src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ 182 src += 16; \ 183 dst += 16; \ 184 w -= 16; \ 185 } \ 186 while (w >= 8) { \ 187 vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ 188 src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ 189 src += 8; \ 190 dst += 8; \ 191 w -= 8; \ 192 } \ 193 while (w >= 4) { \ 194 vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ 195 src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ 196 src += 4; \ 197 dst += 4; \ 198 w -= 4; \ 199 } \ 200 (void)num_taps; \ 201 } else { \ 202 const int num_taps = 2; \ 203 while (w >= 16) { \ 204 vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ 205 src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ 206 src += 16; \ 207 dst += 16; \ 208 w -= 16; \ 209 } \ 210 while (w >= 8) { \ 211 vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ 212 src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ 213 src += 8; \ 214 dst += 8; \ 215 w -= 8; \ 216 } \ 217 while (w >= 4) { \ 218 vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ 219 src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ 220 src += 4; \ 221 dst += 4; \ 222 w -= 4; \ 223 } \ 224 (void)num_taps; \ 225 } \ 226 } \ 227 if (w) { \ 228 vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ 229 filter_kernel, x0_q4, x_step_q4, y0_q4, \ 230 y_step_q4, w, h, bd); \ 231 } \ 232 } 233 234 #define HIGH_FUN_CONV_2D(avg, opt, is_avg) \ 235 void vpx_highbd_convolve8_##avg##opt( \ 236 const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ 
237 ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ 238 int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ 239 const int16_t *filter_x = filter[x0_q4]; \ 240 assert(w <= 64); \ 241 assert(h <= 64); \ 242 if (x_step_q4 == 16 && y_step_q4 == 16) { \ 243 if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \ 244 filter_x[3] == 128) { \ 245 DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ 246 vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ 247 fdata2, 64, filter, x0_q4, x_step_q4, \ 248 y0_q4, y_step_q4, w, h + 7, bd); \ 249 vpx_highbd_convolve8_##avg##vert_##opt( \ 250 fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ 251 y0_q4, y_step_q4, w, h, bd); \ 252 } else if (filter_x[2] | filter_x[5]) { \ 253 const int num_taps = is_avg ? 8 : 4; \ 254 DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ 255 vpx_highbd_convolve8_horiz_##opt( \ 256 src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ 257 filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \ 258 bd); \ 259 vpx_highbd_convolve8_##avg##vert_##opt( \ 260 fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \ 261 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ 262 } else { \ 263 DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ 264 vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ 265 x0_q4, x_step_q4, y0_q4, y_step_q4, \ 266 w, h + 1, bd); \ 267 vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ 268 filter, x0_q4, x_step_q4, \ 269 y0_q4, y_step_q4, w, h, bd); \ 270 } \ 271 } else { \ 272 vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ 273 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ 274 bd); \ 275 } \ 276 } 277 278 #endif // CONFIG_VP9_HIGHBITDEPTH 279 #endif // VPX_VPX_DSP_X86_CONVOLVE_H_ 280