/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <math.h>
#include <assert.h>

#include "config/av1_rtcd.h"

#include "av1/common/av1_common_int.h"
#include "av1/common/warped_motion.h"
#include "av1/common/scale.h"

// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
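// Layout of the table, one 8-tap row per fractional position:
//   rows [0, 63]    cover fractional positions in [-1, 0)
//   rows [64, 127]  cover fractional positions in [0, 1)
//   rows [128, 191] cover fractional positions in [1, 2)
//   row 192 duplicates row 191, so that rounding up in the index
//   computation (see av1_warp_affine_c) cannot read out of bounds.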
/* clang-format off */
const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
  // [-1, 0)
  { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
  { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
  { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 },
  { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 },
  { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 },
  { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 },
  { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 },
  { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
  { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
  { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
  { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
  { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
  { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
  { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
  { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
  { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
  { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
  { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
  { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
  { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
  { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
  { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
  { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
  { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
  { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
  { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
  { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
  { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
  { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
  { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
  { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
  { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0},
  { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0},
  { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1},
  {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1},
  {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1},
  {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1},
  {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1},
  {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1},
  {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2},
  {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2},
  {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2},
  {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2},
  {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2},
  {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2},
  {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2},
  {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2},
  {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2},
  {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2},
  {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2},
  {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2},
  {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2},
  {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2},
  {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2},
  {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2},
  {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2},
  {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1},
  {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2},
  {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1},
  {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1},
  {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1},
  { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0},
  { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0},

  // [1, 2)
  { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 },
  { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 },
  { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 },
  { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 },
  { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 },
  { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 },
  { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 },
  { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
  { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
  { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
  { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
  { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
  { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
  { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
  { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
  { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
  { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
  { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
  { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
  { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
  { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
  { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
  { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
  { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
  { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
  { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 },
  { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 },
  { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 },
  { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 },
  { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
  { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
  { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0, 0, 2, 127, - 1, 0 },
};

/* clang-format on */

#define DIV_LUT_PREC_BITS 14
#define DIV_LUT_BITS 8
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)

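// Each entry is round(2^(DIV_LUT_PREC_BITS + DIV_LUT_BITS) /
// (2^DIV_LUT_BITS + f)) for f in [0, DIV_LUT_NUM], i.e. the fixed-point
// reciprocal of a normalized value whose top DIV_LUT_BITS mantissa bits
// (below the leading 1) are f.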
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
  9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
  9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
  9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
  9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
  8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
  8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
  8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
  8240, 8224, 8208, 8192,
};

// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
// at precision of DIV_LUT_PREC_BITS along with the shift.
static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
  int64_t f;
  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                               : get_msb((unsigned int)D));
  // e is obtained from D after resetting the most significant 1 bit.
  const int64_t e = D - ((uint64_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}

static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
  int32_t f;
  *shift = get_msb(D);
  // e is obtained from D after resetting the most significant 1 bit.
  const int32_t e = D - ((uint32_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}
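
// Worked example (illustrative): for D = 1000, get_msb gives *shift = 9 and
// e = 1000 - 512 = 488. Since *shift > DIV_LUT_BITS, f = round(488 / 2) = 244
// and div_lut[244] = 8389. After adding DIV_LUT_PREC_BITS, *shift = 23, so
// 1/1000 is approximated as 8389 / 2^23 ~= 0.00100005.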

static int is_affine_valid(const WarpedMotionParams *const wm) {
  const int32_t *mat = wm->wmmat;
  return (mat[2] > 0);
}

static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
                                   int16_t delta) {
  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
    return 0;
  else
    return 1;
}

#ifndef NDEBUG
// Check that the given warp model satisfies the relevant constraints for
// its stated model type
static void check_model_consistency(WarpedMotionParams *wm) {
  switch (wm->wmtype) {
    case IDENTITY:
      assert(wm->wmmat[0] == 0);
      assert(wm->wmmat[1] == 0);
      AOM_FALLTHROUGH_INTENDED;
    case TRANSLATION:
      assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
      assert(wm->wmmat[3] == 0);
      AOM_FALLTHROUGH_INTENDED;
    case ROTZOOM:
      assert(wm->wmmat[4] == -wm->wmmat[3]);
      assert(wm->wmmat[5] == wm->wmmat[2]);
      AOM_FALLTHROUGH_INTENDED;
    case AFFINE: break;
    default: assert(0 && "Bad wmtype");
  }
}
#endif  // NDEBUG

// Returns 1 on success or 0 on an invalid affine set
int av1_get_shear_params(WarpedMotionParams *wm) {
#ifndef NDEBUG
  // Check that models have been constructed sensibly
  // This is a good place to check, because this function does not need to
  // be called until after model construction is complete, but must be called
  // before the model can be used for prediction.
  check_model_consistency(wm);
#endif  // NDEBUG

  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;

  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  int16_t shift;
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  v = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);

  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);

  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  return 1;
}

#if CONFIG_AV1_HIGHBITDEPTH
/* Note: For an explanation of the warp algorithm, and some notes on bit widths
   for hardware implementations, see the comments above av1_warp_affine_c.
*/
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride,
                              uint16_t *pred, int p_col, int p_row, int p_width,
                              int p_height, int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);
        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            const int sample_x = clamp(ix + m, 0, width - 1);
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint16_t *dst16 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst16 =
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
            } else {
              *p = sum;
            }
          } else {
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
          }
          sy += gamma;
        }
      }
    }
  }
}

void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
                       int width, int height, int stride, uint16_t *const pred,
                       int p_col, int p_row, int p_width, int p_height,
                       int p_stride, int subsampling_x, int subsampling_y,
                       int bd, ConvolveParams *conv_params) {
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;

  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                         p_width, p_height, p_stride, subsampling_x,
                         subsampling_y, bd, conv_params, alpha, beta, gamma,
                         delta);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

/* The warp filter for ROTZOOM and AFFINE models works as follows:
   * Split the input into 8x8 blocks
   * For each block, project the point (4, 4) within the block, to get the
     overall block position. Split into integer and fractional coordinates,
     maintaining full WARPEDMODEL precision
   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
     variable horizontal offset. This means that, while the rows of the
     intermediate buffer align with the rows of the *reference* image, the
     columns align with the columns of the *destination* image.
   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
     destination is too small we crop the output at this stage). Each pixel has
     a variable vertical offset, so that the resulting rows are aligned with
     the rows of the destination image.

   To accomplish these alignments, we factor the warp matrix as a
   product of two shear / asymmetric zoom matrices:
   / a b \   / 1      0       \   / 1+alpha  beta \
   \ c d / = \ gamma  1+delta / * \ 0        1    /
   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
   The horizontal shear (with alpha and beta) is applied first,
   then the vertical shear (with gamma and delta) is applied second.
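
   Multiplying out this factorization (with alpha, beta, gamma, delta in units
   of 2^-WARPEDMODEL_PREC_BITS) gives
     a = 1 + alpha,            b = beta,
     c = gamma * (1 + alpha),  d = gamma * beta + 1 + delta,
   which is exactly the system av1_get_shear_params solves:
     alpha = a - 1,  beta = b,  gamma = c / a,  delta = d - b * c / a - 1.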

   The only limitation is that, to fit this in a fixed 8-tap filter size,
   the fractional pixel offsets must be at most +-1. Since the horizontal
   filter generates 15 rows of 8 columns, and the initial point we project
   is at (4, 4) within the block, the parameters must satisfy
   4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
   for this filter to be applicable.

   Note: This function assumes that the caller has done all of the relevant
   checks, i.e. that we have a ROTZOOM or AFFINE model, that wmmat[4] and
   wmmat[5] are set appropriately (if using a ROTZOOM model), and that alpha,
   beta, gamma, delta are all in range.

   TODO(rachelbarker): Maybe support scaled references?
*/
/* A note on hardware implementation:
   The warp filter is intended to be implementable using the same hardware as
   the high-precision convolve filters from the loop-restoration and
   convolve-round experiments.

   For a single filter stage, considering all of the coefficient sets for the
   warp filter and the regular convolution filter, an input in the range
   [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
   before rounding.

   Allowing for some changes to the filter coefficient sets, call the range
   [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
   we can replace this by the range [0, 256 * 2^k], which can be stored in an
   unsigned value with 8 + k bits.

   This allows the derivation of the appropriate bit widths and offsets for
   the various intermediate values: If

   F := FILTER_BITS = 7 (or else the above ranges need adjusting)
        so a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
        intermediate value
   H := ROUND0_BITS
   V := VERSHEAR_REDUCE_PREC_BITS
        (and note that we must have H + V = 2*F for the output to have the
        same scale as the input)

   then we end up with the following offsets and ranges:
   Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
                      uint{bd + F + 1}
   After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
   Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
                    uint{bd + 2*F + 2 - H}
   After rounding: The final value, before undoing the offset, fits into a
                   uint{bd + 2}.

   Then we need to undo the offsets before clamping to a pixel. Note that,
   if we do this at the end, the amount to subtract is actually independent
   of H and V:

   offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
                        (1 << ((bd + 2*F - H) - V))
                     == (1 << (bd - 1)) + (1 << bd)
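
   (To check the identity, use H + V = 2*F: the first exponent is
    (bd + F - 1) - H + F - V = bd + 2*F - 1 - (H + V) = bd - 1, and the
    second is (bd + 2*F - H) - V = bd + 2*F - (H + V) = bd.)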

   This allows us to entirely avoid clamping in both the warp filter and
   the convolve-round experiment. As of the time of writing, the Wiener filter
   from loop-restoration can encode a central coefficient up to 216, which
   leads to a maximum value of about 282 * 2^k after applying the offset.
   So in that case we still need to clamp.
*/
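
/* Worked example (illustrative, assuming the common configuration bd = 8,
   H = conv_params->round_0 = 3, and V = reduce_bits_vert = 11):
   the horizontal filter applies an offset of 1 << 14 and its sums fit in a
   uint16; after rounding by H, the 'tmp' values fit in a uint13. The vertical
   filter applies an offset of 1 << 19 and its sums fit in a uint21; after
   rounding by V, the result fits in a uint10 (= bd + 2), and the offset to
   subtract is (1 << 7) + (1 << 8). */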
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);

        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            const int sample_x = clamp(ix + m, 0, width - 1);

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint8_t *dst8 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
            } else {
              *p = sum;
            }
          } else {
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
          }
          sy += gamma;
        }
      }
    }
  }
}

void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
                int height, int stride, uint8_t *pred, int p_col, int p_row,
                int p_width, int p_height, int p_stride, int subsampling_x,
                int subsampling_y, ConvolveParams *conv_params) {
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;
  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
                  alpha, beta, gamma, delta);
}

void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, ConvolveParams *conv_params) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd)
    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
                      p_height, p_stride, subsampling_x, subsampling_y, bd,
                      conv_params);
  else
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#else
  (void)use_hbd;
  (void)bd;
  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#endif
}
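
/* Typical call shape (illustrative only; 'wm', 'ref_buf', 'dst',
   'conv_params' and the size/stride variables are hypothetical locals):
   warp an 8x8 block whose top-left corner is at (col, row) of a
   non-subsampled (luma) destination plane:

     av1_warp_plane(&wm, use_hbd, bd, ref_buf, ref_width, ref_height,
                    ref_stride, dst, col, row, 8, 8, dst_stride,
                    0, 0, &conv_params);
*/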

#define LS_MV_MAX 256  // max mv in 1/8-pel
// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
#define LS_STEP 8

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//       [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// With MAX_SB_SIZE_LOG2 == 7 and LEAST_SQUARES_SAMPLES_MAX_BITS == 3, the
// value is 25.
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full range
#define LS_MAT_DOWN_BITS 2

// Bit range of A, Bx and By after downshifting
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)

// By setting LS_STEP = 8, the least significant 2 bits of every element in
// A, Bx, By are 0. So we can safely drop those 2 bits, in addition to
// LS_MAT_DOWN_BITS, in the shifts below.
#define LS_SQUARE(a)                                          \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT1(a, b)                                           \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT2(a, b)                                               \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
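
// Note: the numerators above factor as
//   LS_SQUARE(a):      (2a + LS_STEP)^2 + LS_STEP^2
//   LS_PRODUCT1(a, b): (2a + LS_STEP) * (2b + LS_STEP)
//   LS_PRODUCT2(a, b): (2a + LS_STEP) * (2b + LS_STEP) + LS_STEP^2
// i.e. each macro is a product of "centered" sample values (plus an
// LS_STEP^2 offset term for LS_SQUARE and LS_PRODUCT2), downshifted by
// 2 + LS_MAT_DOWN_BITS.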

#define USE_LIMITED_PREC_MULT 0

#if USE_LIMITED_PREC_MULT

#define MUL_PREC_BITS 16
static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
  int msb = 0;
  uint16_t mult = 0;
  *shift = 0;
  if (D != 0) {
    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                              : get_msb((unsigned int)D));
    if (msb >= MUL_PREC_BITS) {
      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
      *shift = msb + 1 - MUL_PREC_BITS;
    } else {
      mult = (uint16_t)D;
      *shift = 0;
    }
  }
  return mult;
}

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(v * (1 << (-shift)),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(
        ROUND_POWER_OF_TWO_SIGNED(v, shift),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(
        v * (1 << (-shift)),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

#else

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(
      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}
#endif  // USE_LIMITED_PREC_MULT

static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  const int rsuy = bh / 2 - 1;
  const int rsux = bw / 2 - 1;
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, ..., n-1.
  // Then if  P = [x0, y0,
  //               x1, y1,
  //               x2, y2,
  //               ...   ]
  //          q = [x0', x1', x2', ... ]'
  //          r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We then just need to compute inv(A).Bx and inv(A).By for the solutions.
  // Contribution from neighbor block
  for (int i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // TODO(yunqing): This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, the global offset can be
    // removed while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  // Compute the determinant of A
  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;

  int16_t shift;
  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  int64_t Px[2], Py[2];
  // These, divided by Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                      isuy * wm->wmmat[3]);
  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * wm->wmmat[4] +
                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  return 0;
}

int av1_find_projection(int np, const int *pts1, const int *pts2,
                        BLOCK_SIZE bsize, int mvy, int mvx,
                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
  assert(wm_params->wmtype == AFFINE);

  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
                      mi_col))
    return 1;

  // Check compatibility with the fast warp filter
  if (!av1_get_shear_params(wm_params)) return 1;

  return 0;
}