xref: /aosp_15_r20/external/libaom/av1/common/warped_motion.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <memory.h>
15 #include <math.h>
16 #include <assert.h>
17 
18 #include "config/av1_rtcd.h"
19 
20 #include "av1/common/av1_common_int.h"
21 #include "av1/common/warped_motion.h"
22 #include "av1/common/scale.h"
23 
24 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
25 // at a time. The zoom/rotation/shear in the model are applied to the
26 // "fractional" position of each pixel, which therefore varies within
27 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
28 // We need an extra 2 taps to fit this in, for a total of 8 taps.
29 /* clang-format off */
// Warp filter coefficient table, indexed by the rounded fractional pixel
// position (in units of 1/WARPEDPIXEL_PREC_SHIFTS of a pixel, offset by +1
// pixel so the index is non-negative). Three 64-entry segments cover the
// fractional ranges [-1, 0), [0, 1) and [1, 2); one extra row replicates the
// last entry so that rounding up at the top of the range stays in bounds.
// Each row holds 8 taps which sum to 128 (Q7 normalization).
const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
  // [-1, 0)
  { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
  { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
  { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
  { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
  { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
  { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
  { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
  { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
  { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
  { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
  { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
  { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
  { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
  { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
  { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
  { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
  { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
  { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
  { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
  { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
  { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
  { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
  { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
  { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
  { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
  { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
  { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
  { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
  { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
  { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
  { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
  { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  // Only this middle segment uses the full 8 taps; the outer segments shift
  // the 6 significant taps toward one end and zero-pad the other.
  { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
  { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
  { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
  {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
  {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
  {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
  {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
  {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
  {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
  {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
  {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
  {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
  {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
  {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
  {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
  {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
  {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
  {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
  {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
  {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
  {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
  {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
  {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
  {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
  {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
  {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
  {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
  {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
  {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
  {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
  { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
  { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},

  // [1, 2)
  { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
  { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
  { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
  { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
  { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
  { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
  { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
  { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
  { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
  { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
  { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
  { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
  { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
  { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
  { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
  { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
  { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
  { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
  { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
  { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
  { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
  { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
  { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
  { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
  { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
  { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
  { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
  { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
  { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
  { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
  { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
  { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0,   0,   2, 127, - 1, 0 },
};
135 
136 /* clang-format on */
137 
// Precision of the reciprocal values stored in div_lut.
#define DIV_LUT_PREC_BITS 14
// Number of fraction bits used to index div_lut.
#define DIV_LUT_BITS 8
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)

// Reciprocal lookup table: div_lut[f] = round(2^DIV_LUT_PREC_BITS * 256 /
// (256 + f)) for f in [0, DIV_LUT_NUM]. Used by resolve_divisor_32/64 to
// replace a division by D with a multiply and a shift. Entries range from
// 16384 (f = 0, i.e. 1.0 in Q14) down to 8192 (f = 256, i.e. 0.5 in Q14).
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
  8240,  8224,  8208,  8192,
};
168 
169 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
170 // at precision of DIV_LUT_PREC_BITS along with the shift.
resolve_divisor_64(uint64_t D,int16_t * shift)171 static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
172   int64_t f;
173   *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
174                                : get_msb((unsigned int)D));
175   // e is obtained from D after resetting the most significant 1 bit.
176   const int64_t e = D - ((uint64_t)1 << *shift);
177   // Get the most significant DIV_LUT_BITS (8) bits of e into f
178   if (*shift > DIV_LUT_BITS)
179     f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
180   else
181     f = e << (DIV_LUT_BITS - *shift);
182   assert(f <= DIV_LUT_NUM);
183   *shift += DIV_LUT_PREC_BITS;
184   // Use f as lookup into the precomputed table of multipliers
185   return div_lut[f];
186 }
187 
resolve_divisor_32(uint32_t D,int16_t * shift)188 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
189   int32_t f;
190   *shift = get_msb(D);
191   // e is obtained from D after resetting the most significant 1 bit.
192   const int32_t e = D - ((uint32_t)1 << *shift);
193   // Get the most significant DIV_LUT_BITS (8) bits of e into f
194   if (*shift > DIV_LUT_BITS)
195     f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
196   else
197     f = e << (DIV_LUT_BITS - *shift);
198   assert(f <= DIV_LUT_NUM);
199   *shift += DIV_LUT_PREC_BITS;
200   // Use f as lookup into the precomputed table of multipliers
201   return div_lut[f];
202 }
203 
is_affine_valid(const WarpedMotionParams * const wm)204 static int is_affine_valid(const WarpedMotionParams *const wm) {
205   const int32_t *mat = wm->wmmat;
206   return (mat[2] > 0);
207 }
208 
is_affine_shear_allowed(int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)209 static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
210                                    int16_t delta) {
211   if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
212       (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
213     return 0;
214   else
215     return 1;
216 }
217 
#ifndef NDEBUG
// Check that the given warp model satisfies the relevant constraints for
// its stated model type
static void check_model_consistency(WarpedMotionParams *wm) {
  // The model types form a hierarchy (IDENTITY is a special case of
  // TRANSLATION, which is a special case of ROTZOOM, which is a special case
  // of AFFINE). Each case asserts the constraints specific to its level and
  // deliberately falls through to check the more general levels too.
  switch (wm->wmtype) {
    case IDENTITY:
      // No translation component.
      assert(wm->wmmat[0] == 0);
      assert(wm->wmmat[1] == 0);
      AOM_FALLTHROUGH_INTENDED;
    case TRANSLATION:
      // Unit diagonal scale, zero horizontal shear.
      assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
      assert(wm->wmmat[3] == 0);
      AOM_FALLTHROUGH_INTENDED;
    case ROTZOOM:
      // Rotation/zoom matrices have the form [a b; -b a].
      assert(wm->wmmat[4] == -wm->wmmat[3]);
      assert(wm->wmmat[5] == wm->wmmat[2]);
      AOM_FALLTHROUGH_INTENDED;
    case AFFINE: break;
    default: assert(0 && "Bad wmtype");
  }
}
#endif  // NDEBUG
240 
// Derives the shear parameters (alpha, beta, gamma, delta) from wm->wmmat,
// storing them back into wm.
// Returns 1 on success or 0 on an invalid affine set
int av1_get_shear_params(WarpedMotionParams *wm) {
#ifndef NDEBUG
  // Check that models have been constructed sensibly
  // This is a good place to check, because this function does not need to
  // be called until after model construction is complete, but must be called
  // before the model can be used for prediction.
  check_model_consistency(wm);
#endif  // NDEBUG

  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;

  // Factor the 2x2 part [a b; c d] = wmmat[2..5] into a horizontal shear
  // followed by a vertical shear (see the comment above av1_warp_affine_c):
  //   alpha = a - 1, beta = b, gamma = c / a, delta = d - b*c/a - 1,
  // all in WARPEDMODEL_PREC_BITS fixed-point precision.
  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  int16_t shift;
  // y / 2^shift approximates 1 / |mat[2]|; the sign of mat[2] is applied to y.
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  v = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);

  // Drop the bottom WARP_PARAM_REDUCE_BITS of each parameter (round, then
  // scale back up), reducing the precision the per-pixel filter must handle.
  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);

  // Reject models whose shears are too large for the 8-tap warp filter.
  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  return 1;
}
281 
282 #if CONFIG_AV1_HIGHBITDEPTH
/* Note: For an explanation of the warp algorithm, and some notes on bit widths
    for hardware implementations, see the comments above av1_warp_affine_c
*/
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  // Intermediate buffer: 15 rows x 8 columns of horizontally-filtered
  // samples (8 output rows plus 7 extra rows for the 8-tap vertical filter).
  int32_t tmp[15 * 8];
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  // Process the prediction area in 8x8 blocks.
  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      // Split into integer (ix4, iy4) and fractional (sx4, sy4) parts.
      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Step back from the block center to the top-left filter position.
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // Clear the low bits to match the reduced parameter precision
      // (see av1_get_shear_params).
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame.
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);
        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // Round the per-pixel fractional offset to a filter index.
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          // Offset keeps the accumulator non-negative (see the bit-width
          // notes above av1_warp_affine_c).
          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame.
            const int sample_x = clamp(ix + m, 0, width - 1);
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter (cropped to the destination size)
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            // Compound path: store into the CONV_BUF_TYPE buffer, optionally
            // averaging with the value already there.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint16_t *dst16 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                // Distance-weighted average of the two predictions.
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                // Plain average.
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              // Undo the accumulated offsets, then round and clamp to a pixel.
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst16 =
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
            } else {
              *p = sum;
            }
          } else {
            // Single-reference path: round, undo the offsets and clamp.
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
          }
          sy += gamma;
        }
      }
    }
  }
}
413 
highbd_warp_plane(WarpedMotionParams * wm,const uint16_t * const ref,int width,int height,int stride,uint16_t * const pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,ConvolveParams * conv_params)414 void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
415                        int width, int height, int stride, uint16_t *const pred,
416                        int p_col, int p_row, int p_width, int p_height,
417                        int p_stride, int subsampling_x, int subsampling_y,
418                        int bd, ConvolveParams *conv_params) {
419   const int32_t *const mat = wm->wmmat;
420   const int16_t alpha = wm->alpha;
421   const int16_t beta = wm->beta;
422   const int16_t gamma = wm->gamma;
423   const int16_t delta = wm->delta;
424 
425   av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
426                          p_width, p_height, p_stride, subsampling_x,
427                          subsampling_y, bd, conv_params, alpha, beta, gamma,
428                          delta);
429 }
430 #endif  // CONFIG_AV1_HIGHBITDEPTH
431 
432 /* The warp filter for ROTZOOM and AFFINE models works as follows:
433    * Split the input into 8x8 blocks
434    * For each block, project the point (4, 4) within the block, to get the
435      overall block position. Split into integer and fractional coordinates,
436      maintaining full WARPEDMODEL precision
437    * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
438      variable horizontal offset. This means that, while the rows of the
439      intermediate buffer align with the rows of the *reference* image, the
440      columns align with the columns of the *destination* image.
441    * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
442      destination is too small we crop the output at this stage). Each pixel has
443      a variable vertical offset, so that the resulting rows are aligned with
444      the rows of the destination image.
445 
446    To accomplish these alignments, we factor the warp matrix as a
447    product of two shear / asymmetric zoom matrices:
448    / a b \  = /   1       0    \ * / 1+alpha  beta \
449    \ c d /    \ gamma  1+delta /   \    0      1   /
450    where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
451    The horizontal shear (with alpha and beta) is applied first,
452    then the vertical shear (with gamma and delta) is applied second.
453 
454    The only limitation is that, to fit this in a fixed 8-tap filter size,
455    the fractional pixel offsets must be at most +-1. Since the horizontal filter
456    generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
457    within the block, the parameters must satisfy
458    4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
459    for this filter to be applicable.
460 
461    Note: This function assumes that the caller has done all of the relevant
462    checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
463    are set appropriately (if using a ROTZOOM model), and that alpha, beta,
464    gamma, delta are all in range.
465 
466    TODO(rachelbarker): Maybe support scaled references?
467 */
468 /* A note on hardware implementation:
469     The warp filter is intended to be implementable using the same hardware as
470     the high-precision convolve filters from the loop-restoration and
471     convolve-round experiments.
472 
473     For a single filter stage, considering all of the coefficient sets for the
474     warp filter and the regular convolution filter, an input in the range
475     [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
476     before rounding.
477 
478     Allowing for some changes to the filter coefficient sets, call the range
479     [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
480     we can replace this by the range [0, 256 * 2^k], which can be stored in an
481     unsigned value with 8 + k bits.
482 
483     This allows the derivation of the appropriate bit widths and offsets for
484     the various intermediate values: If
485 
486     F := FILTER_BITS = 7 (or else the above ranges need adjusting)
487          So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
488          intermediate value.
489     H := ROUND0_BITS
490     V := VERSHEAR_REDUCE_PREC_BITS
491     (and note that we must have H + V = 2*F for the output to have the same
492      scale as the input)
493 
494     then we end up with the following offsets and ranges:
495     Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
496                        uint{bd + F + 1}
497     After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
498     Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
499                      uint{bd + 2*F + 2 - H}
500     After rounding: The final value, before undoing the offset, fits into a
501                     uint{bd + 2}.
502 
503     Then we need to undo the offsets before clamping to a pixel. Note that,
504     if we do this at the end, the amount to subtract is actually independent
505     of H and V:
506 
507     offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
508                          (1 << ((bd + 2*F - H) - V))
509                       == (1 << (bd - 1)) + (1 << bd)
510 
511     This allows us to entirely avoid clamping in both the warp filter and
512     the convolve-round experiment. As of the time of writing, the Wiener filter
513     from loop-restoration can encode a central coefficient up to 216, which
514     leads to a maximum value of about 282 * 2^k after applying the offset.
515     So in that case we still need to clamp.
516 */
av1_warp_affine_c(const int32_t * mat,const uint8_t * ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)517 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
518                        int height, int stride, uint8_t *pred, int p_col,
519                        int p_row, int p_width, int p_height, int p_stride,
520                        int subsampling_x, int subsampling_y,
521                        ConvolveParams *conv_params, int16_t alpha, int16_t beta,
522                        int16_t gamma, int16_t delta) {
523   int32_t tmp[15 * 8];
524   const int bd = 8;
525   const int reduce_bits_horiz = conv_params->round_0;
526   const int reduce_bits_vert = conv_params->is_compound
527                                    ? conv_params->round_1
528                                    : 2 * FILTER_BITS - reduce_bits_horiz;
529   const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
530   const int offset_bits_horiz = bd + FILTER_BITS - 1;
531   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
532   const int round_bits =
533       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
534   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
535   (void)max_bits_horiz;
536   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
537   assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
538 
539   for (int i = p_row; i < p_row + p_height; i += 8) {
540     for (int j = p_col; j < p_col + p_width; j += 8) {
541       // Calculate the center of this 8x8 block,
542       // project to luma coordinates (if in a subsampled chroma plane),
543       // apply the affine transformation,
544       // then convert back to the original coordinates (if necessary)
545       const int32_t src_x = (j + 4) << subsampling_x;
546       const int32_t src_y = (i + 4) << subsampling_y;
547       const int64_t dst_x =
548           (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
549       const int64_t dst_y =
550           (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
551       const int64_t x4 = dst_x >> subsampling_x;
552       const int64_t y4 = dst_y >> subsampling_y;
553 
554       int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
555       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
556       int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
557       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
558 
559       sx4 += alpha * (-4) + beta * (-4);
560       sy4 += gamma * (-4) + delta * (-4);
561 
562       sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
563       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
564 
565       // Horizontal filter
566       for (int k = -7; k < 8; ++k) {
567         // Clamp to top/bottom edge of the frame
568         const int iy = clamp(iy4 + k, 0, height - 1);
569 
570         int sx = sx4 + beta * (k + 4);
571 
572         for (int l = -4; l < 4; ++l) {
573           int ix = ix4 + l - 3;
574           // At this point, sx = sx4 + alpha * l + beta * k
575           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
576                            WARPEDPIXEL_PREC_SHIFTS;
577           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
578           const int16_t *coeffs = av1_warped_filter[offs];
579 
580           int32_t sum = 1 << offset_bits_horiz;
581           for (int m = 0; m < 8; ++m) {
582             // Clamp to left/right edge of the frame
583             const int sample_x = clamp(ix + m, 0, width - 1);
584 
585             sum += ref[iy * stride + sample_x] * coeffs[m];
586           }
587           sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
588           assert(0 <= sum && sum < (1 << max_bits_horiz));
589           tmp[(k + 7) * 8 + (l + 4)] = sum;
590           sx += alpha;
591         }
592       }
593 
594       // Vertical filter
595       for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
596         int sy = sy4 + delta * (k + 4);
597         for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
598           // At this point, sy = sy4 + gamma * l + delta * k
599           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
600                            WARPEDPIXEL_PREC_SHIFTS;
601           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
602           const int16_t *coeffs = av1_warped_filter[offs];
603 
604           int32_t sum = 1 << offset_bits_vert;
605           for (int m = 0; m < 8; ++m) {
606             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
607           }
608 
609           if (conv_params->is_compound) {
610             CONV_BUF_TYPE *p =
611                 &conv_params
612                      ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
613                            (j - p_col + l + 4)];
614             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
615             if (conv_params->do_average) {
616               uint8_t *dst8 =
617                   &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
618               int32_t tmp32 = *p;
619               if (conv_params->use_dist_wtd_comp_avg) {
620                 tmp32 = tmp32 * conv_params->fwd_offset +
621                         sum * conv_params->bck_offset;
622                 tmp32 = tmp32 >> DIST_PRECISION_BITS;
623               } else {
624                 tmp32 += sum;
625                 tmp32 = tmp32 >> 1;
626               }
627               tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
628                       (1 << (offset_bits - conv_params->round_1 - 1));
629               *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
630             } else {
631               *p = sum;
632             }
633           } else {
634             uint8_t *p =
635                 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
636             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
637             assert(0 <= sum && sum < (1 << (bd + 2)));
638             *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
639           }
640           sy += gamma;
641         }
642       }
643     }
644   }
645 }
646 
// Thin 8-bit wrapper: forwards a warp request to av1_warp_affine, pulling the
// affine matrix and the precomputed shear parameters (alpha/beta/gamma/delta)
// straight out of |wm|.
void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
                int height, int stride, uint8_t *pred, int p_col, int p_row,
                int p_width, int p_height, int p_stride, int subsampling_x,
                int subsampling_y, ConvolveParams *conv_params) {
  av1_warp_affine(wm->wmmat, ref, width, height, stride, pred, p_col, p_row,
                  p_width, p_height, p_stride, subsampling_x, subsampling_y,
                  conv_params, wm->alpha, wm->beta, wm->gamma, wm->delta);
}
660 
// Dispatch a warped prediction to the high-bitdepth path (when built with
// CONFIG_AV1_HIGHBITDEPTH and |use_hbd| is set) or the 8-bit path otherwise.
void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, ConvolveParams *conv_params) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
                      p_height, p_stride, subsampling_x, subsampling_y, bd,
                      conv_params);
    return;
  }
#else
  // High bitdepth support compiled out; bd is irrelevant for 8-bit frames.
  (void)use_hbd;
  (void)bd;
#endif
  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
}
682 
683 #define LS_MV_MAX 256  // max mv in 1/8-pel
684 // Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By.
685 #define LS_STEP 8
686 
687 // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
688 // the precision needed is:
689 //   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
690 //   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
691 //   1 [for sign] +
692 //   LEAST_SQUARES_SAMPLES_MAX_BITS
693 //        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
694 // The value is 23
695 #define LS_MAT_RANGE_BITS \
696   ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
697 
698 // Bit-depth reduction from the full-range
699 #define LS_MAT_DOWN_BITS 2
700 
701 // bits range of A, Bx and By after downshifting
702 #define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
703 #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
704 #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
705 
// By setting LS_STEP = 8, the least significant 2 bits of every element in A,
// Bx and By are 0, so we can shift out 2 extra bits (on top of
// LS_MAT_DOWN_BITS) here.
708 #define LS_SQUARE(a)                                          \
709   (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
710    (2 + LS_MAT_DOWN_BITS))
711 #define LS_PRODUCT1(a, b)                                           \
712   (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
713    (2 + LS_MAT_DOWN_BITS))
714 #define LS_PRODUCT2(a, b)                                               \
715   (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
716    (2 + LS_MAT_DOWN_BITS))
717 
718 #define USE_LIMITED_PREC_MULT 0
719 
720 #if USE_LIMITED_PREC_MULT
721 
722 #define MUL_PREC_BITS 16
resolve_multiplier_64(uint64_t D,int16_t * shift)723 static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
724   int msb = 0;
725   uint16_t mult = 0;
726   *shift = 0;
727   if (D != 0) {
728     msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
729                               : get_msb((unsigned int)D));
730     if (msb >= MUL_PREC_BITS) {
731       mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
732       *shift = msb + 1 - MUL_PREC_BITS;
733     } else {
734       mult = (uint16_t)D;
735       *shift = 0;
736     }
737   }
738   return mult;
739 }
740 
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)741 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
742   int32_t ret;
743   int16_t mshift;
744   uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
745   int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
746   shift -= mshift;
747   if (shift > 0) {
748     return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
749                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
750                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
751   } else {
752     return (int32_t)clamp(v * (1 << (-shift)),
753                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
754                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
755   }
756   return ret;
757 }
758 
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)759 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
760   int16_t mshift;
761   uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
762   int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
763   shift -= mshift;
764   if (shift > 0) {
765     return (int32_t)clamp(
766         ROUND_POWER_OF_TWO_SIGNED(v, shift),
767         (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
768         (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
769   } else {
770     return (int32_t)clamp(
771         v * (1 << (-shift)),
772         (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
773         (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
774   }
775 }
776 
777 #else
778 
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)779 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
780   int64_t v = Px * (int64_t)iDet;
781   return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
782                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
783                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
784 }
785 
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)786 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
787   int64_t v = Px * (int64_t)iDet;
788   return (int32_t)clamp64(
789       ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
790       (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
791       (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
792 }
793 #endif  // USE_LIMITED_PREC_MULT
794 
// Fit an affine model (wm->wmmat[0..5]) to the motion-vector sample pairs
// (pts1 -> pts2) by an integer least-squares solve, anchored so that the
// block center moves by exactly (mvx, mvy). Sample coordinates are in
// 1/8-pel units. Returns 0 on success, 1 if the normal matrix is singular.
// Fix vs. original: `iDet <<= (-shift)` left-shifted a possibly negative
// value (iDet is negative when Det < 0), which is undefined behavior in C;
// replaced with an equivalent multiply.
static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  // Normal equations: A = P'P (symmetric; A[1][0] never stored), Bx = P'q,
  // By = P'r (see derivation in the comment below).
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  const int rsuy = bh / 2 - 1;
  const int rsux = bw / 2 - 1;
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, .... n-1.
  // Then if  P = [x0, y0,
  //               x1, y1
  //               x2, y2,
  //                ....
  //              ]
  //          q = [x0', x1', x2', ... ]'
  //          r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
  // Contribution from neighbor block
  for (int i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, global offset can be removed
    // while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  // Compute Determinant of A
  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;

  int16_t shift;
  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    // iDet may be negative here; multiply by the power of two instead of
    // using '<<', since left-shifting a negative value is undefined behavior.
    iDet = (int16_t)(iDet * (1 << (-shift)));
    shift = 0;
  }

  int64_t Px[2], Py[2];
  // These divided by the Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                      isuy * wm->wmmat[3]);
  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * wm->wmmat[4] +
                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  return 0;
}
904 
// Derive an affine warp model for the block from the collected neighbor
// samples. Returns 0 on success; 1 if the least-squares fit fails or the
// resulting model is not representable by the fast shear-based warp filter.
int av1_find_projection(int np, const int *pts1, const int *pts2,
                        BLOCK_SIZE bsize, int mvy, int mvx,
                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
  assert(wm_params->wmtype == AFFINE);

  const int lsq_failed = find_affine_int(np, pts1, pts2, bsize, mvy, mvx,
                                         wm_params, mi_row, mi_col);
  if (lsq_failed) return 1;

  // Check compatibility with the fast warp filter.
  return av1_get_shear_params(wm_params) ? 0 : 1;
}
919