1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/variance.h"
13 #include "vpx_ports/mem.h"
14 #include "vpx/vpx_integer.h"
15 #include "vpx_ports/asmdefs_mmi.h"
16 
17 static const uint8_t bilinear_filters[8][2] = {
18   { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
19   { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
20 };
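/* Each filter pair sums to 128 (FILTER_WEIGHT); the row index is the 1/8-pel
   sub-pixel offset used to select the horizontal (x_offset) and vertical
   (y_offset) taps in the sub-pixel variance functions below. */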
21 
22 /* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32, and
23    vpx_variance32x64; VARIANCE_SSE_SUM_8 would lead to sum overflow. */
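/* A rough bound on why the widened variant is needed: the sum path of
   VARIANCE_SSE_SUM_8 accumulates zero-extended src and ref bytes into 16-bit
   halfword lanes with paddh, so each lane can reach roughly 255 * W * H / 4.
   For a 32x32 block that is 65280, which still fits in an unsigned halfword,
   but for 64x32, 32x64 and 64x64 it reaches 130560 or more and overflows.
   VARIANCE_SSE_SUM_8_FOR_W64 therefore widens the per-pixel differences to
   32-bit words and accumulates the signed sum with paddw. */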
24 #define VARIANCE_SSE_SUM_8_FOR_W64                                  \
25   /* sse */                                                         \
26   "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
27   "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
28   "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
29   "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
30   "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
31   "paddw      %[ftmp10],  %[ftmp10],      %[ftmp6]            \n\t" \
32   "paddw      %[ftmp10],  %[ftmp10],      %[ftmp7]            \n\t" \
33                                                                     \
34   /* sum */                                                         \
35   "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
36   "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
37   "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t" \
38   "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t" \
39   "punpcklhw  %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t" \
40   "punpckhhw  %[ftmp2],   %[ftmp3],       %[ftmp0]            \n\t" \
41   "punpcklhw  %[ftmp7],   %[ftmp5],       %[ftmp0]            \n\t" \
42   "punpckhhw  %[ftmp8],   %[ftmp5],       %[ftmp0]            \n\t" \
43   "psubw      %[ftmp3],   %[ftmp1],       %[ftmp7]            \n\t" \
44   "psubw      %[ftmp5],   %[ftmp2],       %[ftmp8]            \n\t" \
45   "punpcklhw  %[ftmp1],   %[ftmp4],       %[ftmp0]            \n\t" \
46   "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp0]            \n\t" \
47   "punpcklhw  %[ftmp7],   %[ftmp6],       %[ftmp0]            \n\t" \
48   "punpckhhw  %[ftmp8],   %[ftmp6],       %[ftmp0]            \n\t" \
49   "psubw      %[ftmp4],   %[ftmp1],       %[ftmp7]            \n\t" \
50   "psubw      %[ftmp6],   %[ftmp2],       %[ftmp8]            \n\t" \
51   "paddw      %[ftmp9],   %[ftmp9],       %[ftmp3]            \n\t" \
52   "paddw      %[ftmp9],   %[ftmp9],       %[ftmp4]            \n\t" \
53   "paddw      %[ftmp9],   %[ftmp9],       %[ftmp5]            \n\t" \
54   "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t"
55 
56 #define VARIANCE_SSE_SUM_4                                          \
57   /* sse */                                                         \
58   "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
59   "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
60   "pmaddhw    %[ftmp5],   %[ftmp4],       %[ftmp4]            \n\t" \
61   "paddw      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t" \
62                                                                     \
63   /* sum */                                                         \
64   "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
65   "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t" \
66   "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t" \
67   "paddh      %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
68 
69 #define VARIANCE_SSE_SUM_8                                          \
70   /* sse */                                                         \
71   "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
72   "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
73   "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
74   "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
75   "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
76   "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \
77   "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t" \
78                                                                     \
79   /* sum */                                                         \
80   "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
81   "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
82   "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t" \
83   "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t" \
84   "paddh      %[ftmp10],  %[ftmp10],      %[ftmp3]            \n\t" \
85   "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t" \
86   "paddh      %[ftmp12],  %[ftmp12],      %[ftmp5]            \n\t" \
87   "paddh      %[ftmp12],  %[ftmp12],      %[ftmp6]            \n\t"
88 
89 #define VARIANCE_SSE_8                                              \
90   "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
91   "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
92   "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t" \
93   "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t" \
94   "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
95   "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
96   "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
97   "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
98   "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
99   "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \
100   "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t"
101 
102 #define VARIANCE_SSE_16                                             \
103   VARIANCE_SSE_8                                                    \
104   "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t" \
105   "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
106   "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t" \
107   "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t" \
108   "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
109   "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
110   "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
111   "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
112   "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
113   "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \
114   "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t"
115 
116 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A                       \
117   /* calculate fdata3[0]~fdata3[3], store at ftmp2*/                \
118   "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
119   "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
120   "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t" \
121   "gsldlc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
122   "gsldrc1    %[ftmp1],   0x01(%[src_ptr])                    \n\t" \
123   "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
124   "pmullh     %[ftmp2],   %[ftmp2],       %[filter_x0]        \n\t" \
125   "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
126   "pmullh     %[ftmp3],   %[ftmp3],       %[filter_x1]        \n\t" \
127   "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \
128   "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
129 
130 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B                       \
131   /* calculate fdata3[0]~fdata3[3], store at ftmp4*/                \
132   "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
133   "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
134   "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
135   "gsldlc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
136   "gsldrc1    %[ftmp1],   0x01(%[src_ptr])                    \n\t" \
137   "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \
138   "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x0]        \n\t" \
139   "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
140   "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x1]        \n\t" \
141   "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \
142   "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t"
143 
144 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A                      \
145   /* calculate: temp2[0] ~ temp2[3] */                              \
146   "pmullh     %[ftmp2],   %[ftmp2],       %[filter_y0]        \n\t" \
147   "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
148   "pmullh     %[ftmp1],   %[ftmp4],       %[filter_y1]        \n\t" \
149   "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t" \
150   "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t" \
151                                                                     \
152   /* store: temp2[0] ~ temp2[3] */                                  \
153   "pand       %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
154   "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" \
155   "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"
156 
157 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B                      \
158   /* calculate: temp2[0] ~ temp2[3] */                              \
159   "pmullh     %[ftmp4],   %[ftmp4],       %[filter_y0]        \n\t" \
160   "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
161   "pmullh     %[ftmp1],   %[ftmp2],       %[filter_y1]        \n\t" \
162   "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]            \n\t" \
163   "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \
164                                                                     \
165   /* store: temp2[0] ~ temp2[3] */                                  \
166   "pand       %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
167   "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" \
168   "gssdrc1    %[ftmp4],   0x00(%[temp2_ptr])                  \n\t"
169 
170 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A                       \
171   /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/      \
172   "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
173   "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
174   "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t" \
175   "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
176   "gsldlc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
177   "gsldrc1    %[ftmp1],   0x01(%[src_ptr])                    \n\t" \
178   "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
179   "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \
180   "pmullh     %[ftmp2],   %[ftmp2],       %[filter_x0]        \n\t" \
181   "pmullh     %[ftmp3],   %[ftmp3],       %[filter_x0]        \n\t" \
182   "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
183   "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_40]         \n\t" \
184   "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x1]        \n\t" \
185   "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x1]        \n\t" \
186   "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
187   "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t" \
188   "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp14]           \n\t" \
189   "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t"
190 
191 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B                       \
192   /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/      \
193   "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
194   "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
195   "punpcklbh  %[ftmp8],   %[ftmp1],       %[ftmp0]            \n\t" \
196   "punpckhbh  %[ftmp9],   %[ftmp1],       %[ftmp0]            \n\t" \
197   "gsldlc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
198   "gsldrc1    %[ftmp1],   0x01(%[src_ptr])                    \n\t" \
199   "punpcklbh  %[ftmp10],  %[ftmp1],       %[ftmp0]            \n\t" \
200   "punpckhbh  %[ftmp11],  %[ftmp1],       %[ftmp0]            \n\t" \
201   "pmullh     %[ftmp8],   %[ftmp8],       %[filter_x0]        \n\t" \
202   "pmullh     %[ftmp9],   %[ftmp9],       %[filter_x0]        \n\t" \
203   "paddh      %[ftmp8],   %[ftmp8],       %[ff_ph_40]         \n\t" \
204   "paddh      %[ftmp9],   %[ftmp9],       %[ff_ph_40]         \n\t" \
205   "pmullh     %[ftmp10],  %[ftmp10],      %[filter_x1]        \n\t" \
206   "pmullh     %[ftmp11],  %[ftmp11],      %[filter_x1]        \n\t" \
207   "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t" \
208   "paddh      %[ftmp9],   %[ftmp9],       %[ftmp11]           \n\t" \
209   "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp14]           \n\t" \
210   "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t"
211 
212 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A                      \
213   /* calculate: temp2[0] ~ temp2[3] */                              \
214   "pmullh     %[ftmp2],   %[ftmp2],       %[filter_y0]        \n\t" \
215   "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
216   "pmullh     %[ftmp1],   %[ftmp8],       %[filter_y1]        \n\t" \
217   "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t" \
218   "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp14]           \n\t" \
219                                                                     \
220   /* calculate: temp2[4] ~ temp2[7] */                              \
221   "pmullh     %[ftmp3],   %[ftmp3],       %[filter_y0]        \n\t" \
222   "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_40]         \n\t" \
223   "pmullh     %[ftmp1],   %[ftmp9],       %[filter_y1]        \n\t" \
224   "paddh      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t" \
225   "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t" \
226                                                                     \
227   /* store: temp2[0] ~ temp2[7] */                                  \
228   "pand       %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
229   "pand       %[ftmp3],   %[ftmp3],       %[mask]             \n\t" \
230   "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \
231   "gssdlc1    %[ftmp2],   0x07(%[temp2_ptr])                  \n\t" \
232   "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"
233 
234 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B                      \
235   /* calculate: temp2[0] ~ temp2[3] */                              \
236   "pmullh     %[ftmp8],   %[ftmp8],       %[filter_y0]        \n\t" \
237   "paddh      %[ftmp8],   %[ftmp8],       %[ff_ph_40]         \n\t" \
238   "pmullh     %[ftmp1],   %[ftmp2],       %[filter_y1]        \n\t" \
239   "paddh      %[ftmp8],   %[ftmp8],       %[ftmp1]            \n\t" \
240   "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp14]           \n\t" \
241                                                                     \
242   /* calculate: temp2[4] ~ temp2[7] */                              \
243   "pmullh     %[ftmp9],   %[ftmp9],       %[filter_y0]        \n\t" \
244   "paddh      %[ftmp9],   %[ftmp9],       %[ff_ph_40]         \n\t" \
245   "pmullh     %[ftmp1],   %[ftmp3],       %[filter_y1]        \n\t" \
246   "paddh      %[ftmp9],   %[ftmp9],       %[ftmp1]            \n\t" \
247   "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t" \
248                                                                     \
249   /* store: temp2[0] ~ temp2[7] */                                  \
250   "pand       %[ftmp8],   %[ftmp8],       %[mask]             \n\t" \
251   "pand       %[ftmp9],   %[ftmp9],       %[mask]             \n\t" \
252   "packushb   %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t" \
253   "gssdlc1    %[ftmp8],   0x07(%[temp2_ptr])                  \n\t" \
254   "gssdrc1    %[ftmp8],   0x00(%[temp2_ptr])                  \n\t"
255 
256 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A                      \
257   /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/      \
258   VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A                             \
259                                                                     \
260   /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/     \
261   "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t" \
262   "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
263   "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
264   "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \
265   "gsldlc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t" \
266   "gsldrc1    %[ftmp1],   0x09(%[src_ptr])                    \n\t" \
267   "punpcklbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t" \
268   "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t" \
269   "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x0]        \n\t" \
270   "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x0]        \n\t" \
271   "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
272   "paddh      %[ftmp5],   %[ftmp5],       %[ff_ph_40]         \n\t" \
273   "pmullh     %[ftmp6],   %[ftmp6],       %[filter_x1]        \n\t" \
274   "pmullh     %[ftmp7],   %[ftmp7],       %[filter_x1]        \n\t" \
275   "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \
276   "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t" \
277   "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp14]           \n\t" \
278   "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t"
279 
280 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B                      \
281   /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/      \
282   VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B                             \
283                                                                     \
284   /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/   \
285   "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t" \
286   "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
287   "punpcklbh  %[ftmp10],  %[ftmp1],       %[ftmp0]            \n\t" \
288   "punpckhbh  %[ftmp11],  %[ftmp1],       %[ftmp0]            \n\t" \
289   "gsldlc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t" \
290   "gsldrc1    %[ftmp1],   0x09(%[src_ptr])                    \n\t" \
291   "punpcklbh  %[ftmp12],  %[ftmp1],       %[ftmp0]            \n\t" \
292   "punpckhbh  %[ftmp13],  %[ftmp1],       %[ftmp0]            \n\t" \
293   "pmullh     %[ftmp10],  %[ftmp10],      %[filter_x0]        \n\t" \
294   "pmullh     %[ftmp11],  %[ftmp11],      %[filter_x0]        \n\t" \
295   "paddh      %[ftmp10],  %[ftmp10],      %[ff_ph_40]         \n\t" \
296   "paddh      %[ftmp11],  %[ftmp11],      %[ff_ph_40]         \n\t" \
297   "pmullh     %[ftmp12],  %[ftmp12],      %[filter_x1]        \n\t" \
298   "pmullh     %[ftmp13],  %[ftmp13],      %[filter_x1]        \n\t" \
299   "paddh      %[ftmp10],  %[ftmp10],      %[ftmp12]           \n\t" \
300   "paddh      %[ftmp11],  %[ftmp11],      %[ftmp13]           \n\t" \
301   "psrlh      %[ftmp10],  %[ftmp10],      %[ftmp14]           \n\t" \
302   "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t"
303 
304 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A                     \
305   VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A                            \
306                                                                     \
307   /* calculate: temp2[8] ~ temp2[11] */                             \
308   "pmullh     %[ftmp4],   %[ftmp4],       %[filter_y0]        \n\t" \
309   "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
310   "pmullh     %[ftmp1],   %[ftmp10],      %[filter_y1]        \n\t" \
311   "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]            \n\t" \
312   "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp14]           \n\t" \
313                                                                     \
314   /* calculate: temp2[12] ~ temp2[15] */                            \
315   "pmullh     %[ftmp5],   %[ftmp5],       %[filter_y0]        \n\t" \
316   "paddh      %[ftmp5],   %[ftmp5],       %[ff_ph_40]         \n\t" \
317   "pmullh     %[ftmp1],   %[ftmp11],      %[filter_y1]        \n\t" \
318   "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
319   "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t" \
320                                                                     \
321   /* store: temp2[8] ~ temp2[15] */                                 \
322   "pand       %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
323   "pand       %[ftmp5],   %[ftmp5],       %[mask]             \n\t" \
324   "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \
325   "gssdlc1    %[ftmp4],   0x0f(%[temp2_ptr])                  \n\t" \
326   "gssdrc1    %[ftmp4],   0x08(%[temp2_ptr])                  \n\t"
327 
328 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B                     \
329   VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B                            \
330                                                                     \
331   /* calculate: temp2[8] ~ temp2[11] */                             \
332   "pmullh     %[ftmp10],  %[ftmp10],      %[filter_y0]        \n\t" \
333   "paddh      %[ftmp10],  %[ftmp10],      %[ff_ph_40]         \n\t" \
334   "pmullh     %[ftmp1],   %[ftmp4],       %[filter_y1]        \n\t" \
335   "paddh      %[ftmp10],  %[ftmp10],      %[ftmp1]            \n\t" \
336   "psrlh      %[ftmp10],  %[ftmp10],      %[ftmp14]           \n\t" \
337                                                                     \
338   /* calculate: temp2[12] ~ temp2[15] */                            \
339   "pmullh     %[ftmp11],  %[ftmp11],      %[filter_y0]        \n\t" \
340   "paddh      %[ftmp11],  %[ftmp11],      %[ff_ph_40]         \n\t" \
341   "pmullh     %[ftmp1],   %[ftmp5],       %[filter_y1]        \n\t" \
342   "paddh      %[ftmp11],  %[ftmp11],      %[ftmp1]            \n\t" \
343   "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t" \
344                                                                     \
345   /* store: temp2[8] ~ temp2[15] */                                 \
346   "pand       %[ftmp10],  %[ftmp10],      %[mask]             \n\t" \
347   "pand       %[ftmp11],  %[ftmp11],      %[mask]             \n\t" \
348   "packushb   %[ftmp10],  %[ftmp10],      %[ftmp11]           \n\t" \
349   "gssdlc1    %[ftmp10],  0x0f(%[temp2_ptr])                  \n\t" \
350   "gssdrc1    %[ftmp10],  0x08(%[temp2_ptr])                  \n\t"
351 
352 // Applies a 1-D 2-tap bilinear filter to the source block in either the
353 // horizontal or the vertical direction to produce the filtered output block.
354 // Used to implement the first pass of the 2-D separable filter.
355 //
356 // Produces 16-bit intermediate output to retain precision for the next pass.
357 // The two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether
358 // the filter is applied horizontally (pixel_step = 1) or vertically
359 // (pixel_step = stride), i.e. the offset to move from one input to the next.
360 static void var_filter_block2d_bil_first_pass(
361     const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line,
362     int pixel_step, unsigned int output_height, unsigned int output_width,
363     const uint8_t *filter) {
364   unsigned int i, j;
365 
366   for (i = 0; i < output_height; ++i) {
367     for (j = 0; j < output_width; ++j) {
368       ref_ptr[j] = ROUND_POWER_OF_TWO(
369           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
370           FILTER_BITS);
371 
372       ++src_ptr;
373     }
374 
375     src_ptr += src_pixels_per_line - output_width;
376     ref_ptr += output_width;
377   }
378 }
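/* Worked example for var_filter_block2d_bil_first_pass (a sketch; FILTER_BITS
   is assumed to be 7, since every filter pair above sums to 128 = 1 << 7):
   with x_offset = 2 the taps are bilinear_filters[2] = { 96, 32 }, so one
   horizontally filtered sample is
     ROUND_POWER_OF_TWO(96 * src[0] + 32 * src[1], 7)
       = (96 * src[0] + 32 * src[1] + 64) >> 7,
   e.g. src[0] = 100, src[1] = 200 gives (9600 + 6400 + 64) >> 7 = 125. */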
379 
380 // Applies a 1-D 2-tap bilinear filter to the source block in either the
381 // horizontal or the vertical direction to produce the filtered output block.
382 // Used to implement the second pass of the 2-D separable filter.
383 //
384 // Requires the 16-bit input produced by var_filter_block2d_bil_first_pass.
385 // The two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether
386 // the filter is applied horizontally (pixel_step = 1) or vertically
387 // (pixel_step = stride), i.e. the offset to move from one input to the next.
388 // Output is 8-bit.
389 static void var_filter_block2d_bil_second_pass(
390     const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line,
391     unsigned int pixel_step, unsigned int output_height,
392     unsigned int output_width, const uint8_t *filter) {
393   unsigned int i, j;
394 
395   for (i = 0; i < output_height; ++i) {
396     for (j = 0; j < output_width; ++j) {
397       ref_ptr[j] = ROUND_POWER_OF_TWO(
398           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
399           FILTER_BITS);
400       ++src_ptr;
401     }
402 
403     src_ptr += src_pixels_per_line - output_width;
404     ref_ptr += output_width;
405   }
406 }
407 
408 static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
409                                        const uint8_t *ref_ptr, int ref_stride,
410                                        uint32_t *sse, int high) {
411   int sum;
412   double ftmp[12];
413   uint32_t tmp[3];
414 
415   *sse = 0;
416 
417   /* clang-format off */
418   __asm__ volatile (
419     "li         %[tmp0],    0x20                                \n\t"
420     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
421     MMI_L(%[tmp0], %[high], 0x00)
422     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
423     "pxor       %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
424     "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
425     "1:                                                         \n\t"
426     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
427     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
428     "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
429     "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
430     VARIANCE_SSE_SUM_8_FOR_W64
431 
432     "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t"
433     "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t"
434     "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t"
435     "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t"
436     VARIANCE_SSE_SUM_8_FOR_W64
437 
438     "gsldlc1    %[ftmp1],   0x17(%[src_ptr])                    \n\t"
439     "gsldrc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t"
440     "gsldlc1    %[ftmp2],   0x17(%[ref_ptr])                    \n\t"
441     "gsldrc1    %[ftmp2],   0x10(%[ref_ptr])                    \n\t"
442     VARIANCE_SSE_SUM_8_FOR_W64
443 
444     "gsldlc1    %[ftmp1],   0x1f(%[src_ptr])                    \n\t"
445     "gsldrc1    %[ftmp1],   0x18(%[src_ptr])                    \n\t"
446     "gsldlc1    %[ftmp2],   0x1f(%[ref_ptr])                    \n\t"
447     "gsldrc1    %[ftmp2],   0x18(%[ref_ptr])                    \n\t"
448     VARIANCE_SSE_SUM_8_FOR_W64
449 
450     "gsldlc1    %[ftmp1],   0x27(%[src_ptr])                    \n\t"
451     "gsldrc1    %[ftmp1],   0x20(%[src_ptr])                    \n\t"
452     "gsldlc1    %[ftmp2],   0x27(%[ref_ptr])                    \n\t"
453     "gsldrc1    %[ftmp2],   0x20(%[ref_ptr])                    \n\t"
454     VARIANCE_SSE_SUM_8_FOR_W64
455 
456     "gsldlc1    %[ftmp1],   0x2f(%[src_ptr])                    \n\t"
457     "gsldrc1    %[ftmp1],   0x28(%[src_ptr])                    \n\t"
458     "gsldlc1    %[ftmp2],   0x2f(%[ref_ptr])                    \n\t"
459     "gsldrc1    %[ftmp2],   0x28(%[ref_ptr])                    \n\t"
460     VARIANCE_SSE_SUM_8_FOR_W64
461 
462     "gsldlc1    %[ftmp1],   0x37(%[src_ptr])                    \n\t"
463     "gsldrc1    %[ftmp1],   0x30(%[src_ptr])                    \n\t"
464     "gsldlc1    %[ftmp2],   0x37(%[ref_ptr])                    \n\t"
465     "gsldrc1    %[ftmp2],   0x30(%[ref_ptr])                    \n\t"
466     VARIANCE_SSE_SUM_8_FOR_W64
467 
468     "gsldlc1    %[ftmp1],   0x3f(%[src_ptr])                    \n\t"
469     "gsldrc1    %[ftmp1],   0x38(%[src_ptr])                    \n\t"
470     "gsldlc1    %[ftmp2],   0x3f(%[ref_ptr])                    \n\t"
471     "gsldrc1    %[ftmp2],   0x38(%[ref_ptr])                    \n\t"
472     VARIANCE_SSE_SUM_8_FOR_W64
473 
474     "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
475     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
476     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
477     "bnez       %[tmp0],    1b                                  \n\t"
478 
479     "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
480     "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
481     "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
482     "ssrld      %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
483     "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
484     "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
485     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
486       [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
487       [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
488       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
489       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
490       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
491       [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
492       [tmp2]"=&r"(tmp[2]),
493       [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr),
494       [sum]"=&r"(sum)
495     : [src_stride]"r"((mips_reg)src_stride),
496       [ref_stride]"r"((mips_reg)ref_stride),
497       [high]"r"(&high), [sse]"r"(sse)
498     : "memory"
499   );
500   /* clang-format on */
501 
502   return *sse - (((int64_t)sum * sum) / (64 * high));
503 }
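/* The return value uses the usual shortcut variance = SSE - sum^2 / N, where
   N = 64 * high is the pixel count (4096 for 64x64, 2048 for 64x32); the
   narrower variants below apply the same identity with their own N. */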
504 
505 #define VPX_VARIANCE64XN(n)                                                   \
506   uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
507                                     const uint8_t *ref_ptr, int ref_stride,   \
508                                     uint32_t *sse) {                          \
509     return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
510   }
511 
512 VPX_VARIANCE64XN(64)
513 VPX_VARIANCE64XN(32)
514 
515 uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
516                                const uint8_t *ref_ptr, int ref_stride,
517                                uint32_t *sse) {
518   int sum;
519   double ftmp[12];
520   uint32_t tmp[3];
521 
522   *sse = 0;
523 
524   /* clang-format off */
525   __asm__ volatile (
526     "li         %[tmp0],    0x20                                \n\t"
527     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
528     "li         %[tmp0],    0x40                                \n\t"
529     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
530     "pxor       %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
531     "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
532     "1:                                                         \n\t"
533     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
534     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
535     "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
536     "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
537     VARIANCE_SSE_SUM_8_FOR_W64
538 
539     "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t"
540     "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t"
541     "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t"
542     "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t"
543     VARIANCE_SSE_SUM_8_FOR_W64
544 
545     "gsldlc1    %[ftmp1],   0x17(%[src_ptr])                    \n\t"
546     "gsldrc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t"
547     "gsldlc1    %[ftmp2],   0x17(%[ref_ptr])                    \n\t"
548     "gsldrc1    %[ftmp2],   0x10(%[ref_ptr])                    \n\t"
549     VARIANCE_SSE_SUM_8_FOR_W64
550 
551     "gsldlc1    %[ftmp1],   0x1f(%[src_ptr])                    \n\t"
552     "gsldrc1    %[ftmp1],   0x18(%[src_ptr])                    \n\t"
553     "gsldlc1    %[ftmp2],   0x1f(%[ref_ptr])                    \n\t"
554     "gsldrc1    %[ftmp2],   0x18(%[ref_ptr])                    \n\t"
555     VARIANCE_SSE_SUM_8_FOR_W64
556 
557     "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
558     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
559     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
560     "bnez       %[tmp0],    1b                                  \n\t"
561 
562     "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
563     "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
564     "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
565     "ssrld      %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
566     "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
567     "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
568     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
569       [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
570       [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
571       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
572       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
573       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
574       [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
575       [tmp2]"=&r"(tmp[2]),
576       [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr),
577       [sum]"=&r"(sum)
578     : [src_stride]"r"((mips_reg)src_stride),
579       [ref_stride]"r"((mips_reg)ref_stride),
580       [sse]"r"(sse)
581     : "memory"
582   );
583   /* clang-format on */
584 
585   return *sse - (((int64_t)sum * sum) / 2048);
586 }
587 
588 static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
589                                        const uint8_t *ref_ptr, int ref_stride,
590                                        uint32_t *sse, int high) {
591   int sum;
592   double ftmp[13];
593   uint32_t tmp[3];
594 
595   *sse = 0;
596 
597   /* clang-format off */
598   __asm__ volatile (
599     "li         %[tmp0],    0x20                                \n\t"
600     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
601     MMI_L(%[tmp0], %[high], 0x00)
602     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
603     "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
604     "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
605     "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
606     "1:                                                         \n\t"
607     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
608     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
609     "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
610     "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
611     VARIANCE_SSE_SUM_8
612     "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t"
613     "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t"
614     "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t"
615     "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t"
616     VARIANCE_SSE_SUM_8
617     "gsldlc1    %[ftmp1],   0x17(%[src_ptr])                    \n\t"
618     "gsldrc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t"
619     "gsldlc1    %[ftmp2],   0x17(%[ref_ptr])                    \n\t"
620     "gsldrc1    %[ftmp2],   0x10(%[ref_ptr])                    \n\t"
621     VARIANCE_SSE_SUM_8
622     "gsldlc1    %[ftmp1],   0x1f(%[src_ptr])                    \n\t"
623     "gsldrc1    %[ftmp1],   0x18(%[src_ptr])                    \n\t"
624     "gsldlc1    %[ftmp2],   0x1f(%[ref_ptr])                    \n\t"
625     "gsldrc1    %[ftmp2],   0x18(%[ref_ptr])                    \n\t"
626     VARIANCE_SSE_SUM_8
627 
628     "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
629     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
630     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
631     "bnez       %[tmp0],    1b                                  \n\t"
632 
633     "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
634     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
635     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
636 
637     "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"
638     "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"
639     "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"
640     "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"
641     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
642     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
643     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
644     "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
645     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
646     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
647 
648     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
649       [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
650       [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
651       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
652       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
653       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
654       [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
655       [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
656     : [src_stride]"r"((mips_reg)src_stride),
657       [ref_stride]"r"((mips_reg)ref_stride),
658       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
659     : "memory"
660   );
661   /* clang-format on */
662 
663   return *sse - (((int64_t)sum * sum) / (32 * high));
664 }
665 
666 #define VPX_VARIANCE32XN(n)                                                   \
667   uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
668                                     const uint8_t *ref_ptr, int ref_stride,   \
669                                     uint32_t *sse) {                          \
670     return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
671   }
672 
673 VPX_VARIANCE32XN(32)
674 VPX_VARIANCE32XN(16)
675 
676 static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
677                                        const uint8_t *ref_ptr, int ref_stride,
678                                        uint32_t *sse, int high) {
679   int sum;
680   double ftmp[13];
681   uint32_t tmp[3];
682 
683   *sse = 0;
684 
685   /* clang-format off */
686   __asm__ volatile (
687     "li         %[tmp0],    0x20                                \n\t"
688     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
689     MMI_L(%[tmp0], %[high], 0x00)
690     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
691     "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
692     "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
693     "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
694     "1:                                                         \n\t"
695     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
696     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
697     "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
698     "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
699     VARIANCE_SSE_SUM_8
700     "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t"
701     "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t"
702     "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t"
703     "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t"
704     VARIANCE_SSE_SUM_8
705 
706     "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
707     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
708     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
709     "bnez       %[tmp0],    1b                                  \n\t"
710 
711     "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
712     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
713     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
714 
715     "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"
716     "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"
717     "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"
718     "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"
719     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
720     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
721     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
722     "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
723     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
724     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
725 
726     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
727       [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
728       [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
729       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
730       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
731       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
732       [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
733       [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
734     : [src_stride]"r"((mips_reg)src_stride),
735       [ref_stride]"r"((mips_reg)ref_stride),
736       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
737     : "memory"
738   );
739   /* clang-format on */
740 
741   return *sse - (((int64_t)sum * sum) / (16 * high));
742 }
743 
744 #define VPX_VARIANCE16XN(n)                                                   \
745   uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
746                                     const uint8_t *ref_ptr, int ref_stride,   \
747                                     uint32_t *sse) {                          \
748     return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
749   }
750 
751 VPX_VARIANCE16XN(32)
752 VPX_VARIANCE16XN(16)
753 VPX_VARIANCE16XN(8)
754 
755 static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
756                                       const uint8_t *ref_ptr, int ref_stride,
757                                       uint32_t *sse, int high) {
758   int sum;
759   double ftmp[13];
760   uint32_t tmp[3];
761 
762   *sse = 0;
763 
764   /* clang-format off */
765   __asm__ volatile (
766     "li         %[tmp0],    0x20                                \n\t"
767     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
768     MMI_L(%[tmp0], %[high], 0x00)
769     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
770     "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
771     "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
772     "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
773     "1:                                                         \n\t"
774     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
775     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
776     "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
777     "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
778     VARIANCE_SSE_SUM_8
779 
780     "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
781     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
782     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
783     "bnez       %[tmp0],    1b                                  \n\t"
784 
785     "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
786     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
787     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
788 
789     "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"
790     "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"
791     "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"
792     "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"
793     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
794     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
795     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
796     "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
797     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
798     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
799 
800     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
801       [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
802       [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
803       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
804       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
805       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
806       [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
807       [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
808     : [src_stride]"r"((mips_reg)src_stride),
809       [ref_stride]"r"((mips_reg)ref_stride),
810       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
811     : "memory"
812   );
813   /* clang-format on */
814 
815   return *sse - (((int64_t)sum * sum) / (8 * high));
816 }
817 
818 #define VPX_VARIANCE8XN(n)                                                   \
819   uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
820                                    const uint8_t *ref_ptr, int ref_stride,   \
821                                    uint32_t *sse) {                          \
822     return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
823   }
824 
825 VPX_VARIANCE8XN(16)
826 VPX_VARIANCE8XN(8)
827 VPX_VARIANCE8XN(4)
828 
829 static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
830                                       const uint8_t *ref_ptr, int ref_stride,
831                                       uint32_t *sse, int high) {
832   int sum;
833   double ftmp[12];
834   uint32_t tmp[3];
835 
836   *sse = 0;
837 
838   /* clang-format off */
839   __asm__ volatile (
840     "li         %[tmp0],    0x20                                \n\t"
841     "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
842     MMI_L(%[tmp0], %[high], 0x00)
843     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
844     "pxor       %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
845     "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
846     "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
847     "1:                                                         \n\t"
848     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
849     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
850     "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
851     "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
852     VARIANCE_SSE_SUM_4
853 
854     "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
855     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
856     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
857     "bnez       %[tmp0],    1b                                  \n\t"
858 
859     "ssrld      %[ftmp9],   %[ftmp6],       %[ftmp10]           \n\t"
860     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t"
861     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
862 
863     "punpcklhw  %[ftmp3],   %[ftmp7],       %[ftmp0]            \n\t"
864     "punpckhhw  %[ftmp4],   %[ftmp7],       %[ftmp0]            \n\t"
865     "punpcklhw  %[ftmp5],   %[ftmp8],       %[ftmp0]            \n\t"
866     "punpckhhw  %[ftmp6],   %[ftmp8],       %[ftmp0]            \n\t"
867     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
868     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
869     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
870     "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp10]           \n\t"
871     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
872     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
873     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
874       [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
875       [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
876       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
877       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
878       [ftmp10]"=&f"(ftmp[10]),
879       [tmp0]"=&r"(tmp[0]),
880       [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
881     : [src_stride]"r"((mips_reg)src_stride),
882       [ref_stride]"r"((mips_reg)ref_stride),
883       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
884     : "memory"
885   );
886   /* clang-format on */
887 
888   return *sse - (((int64_t)sum * sum) / (4 * high));
889 }
890 
891 #define VPX_VARIANCE4XN(n)                                                   \
892   uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
893                                    const uint8_t *ref_ptr, int ref_stride,   \
894                                    uint32_t *sse) {                          \
895     return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
896   }
897 
898 VPX_VARIANCE4XN(8)
899 VPX_VARIANCE4XN(4)
900 
901 static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
902                                   const uint8_t *ref_ptr, int ref_stride,
903                                   uint32_t *sse, uint64_t high) {
904   double ftmp[12];
905   uint32_t tmp[1];
906 
907   *sse = 0;
908 
909   /* clang-format off */
910   __asm__ volatile (
911     "li         %[tmp0],    0x20                                \n\t"
912     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
913     MMI_L(%[tmp0], %[high], 0x00)
914     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
915     "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
916 
917     "1:                                                         \n\t"
918     VARIANCE_SSE_16
919 
920     "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
921     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
922     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
923     "bnez       %[tmp0],    1b                                  \n\t"
924 
925     "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
926     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
927     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
928     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
929       [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
930       [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
931       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
932       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
933       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
934       [tmp0]"=&r"(tmp[0]),
935       [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
936     : [src_stride]"r"((mips_reg)src_stride),
937       [ref_stride]"r"((mips_reg)ref_stride),
938       [high]"r"(&high), [sse]"r"(sse)
939     : "memory"
940   );
941   /* clang-format on */
942 
943   return *sse;
944 }
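/* Unlike the variance helpers above, the MSE helpers do not subtract
   sum^2 / N; they simply return the accumulated SSE. */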
945 
946 #define vpx_mse16xN(n)                                                   \
947   uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
948                                const uint8_t *ref_ptr, int ref_stride,   \
949                                uint32_t *sse) {                          \
950     return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
951   }
952 
953 vpx_mse16xN(16);
954 vpx_mse16xN(8);
955 
956 static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
957                                  const uint8_t *ref_ptr, int ref_stride,
958                                  uint32_t *sse, uint64_t high) {
959   double ftmp[12];
960   uint32_t tmp[1];
961 
962   *sse = 0;
963 
964   /* clang-format off */
965   __asm__ volatile (
966     "li         %[tmp0],    0x20                                \n\t"
967     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
968     MMI_L(%[tmp0], %[high], 0x00)
969     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
970     "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
971 
972     "1:                                                         \n\t"
973     VARIANCE_SSE_8
974 
975     "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
976     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
977     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
978     "bnez       %[tmp0],    1b                                  \n\t"
979 
980     "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
981     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
982     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
983     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
984       [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
985       [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
986       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
987       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
988       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
989       [tmp0]"=&r"(tmp[0]),
990       [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
991     : [src_stride]"r"((mips_reg)src_stride),
992       [ref_stride]"r"((mips_reg)ref_stride),
993       [high]"r"(&high), [sse]"r"(sse)
994     : "memory"
995   );
996   /* clang-format on */
997 
998   return *sse;
999 }
1000 
1001 #define vpx_mse8xN(n)                                                   \
1002   uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
1003                               const uint8_t *ref_ptr, int ref_stride,   \
1004                               uint32_t *sse) {                          \
1005     return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
1006   }
1007 
1008 vpx_mse8xN(16);
1009 vpx_mse8xN(8);
1010 
1011 #define SUBPIX_VAR(W, H)                                                       \
1012   uint32_t vpx_sub_pixel_variance##W##x##H##_mmi(                              \
1013       const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
1014       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                 \
1015     uint16_t fdata3[((H) + 1) * (W)];                                          \
1016     uint8_t temp2[(H) * (W)];                                                  \
1017                                                                                \
1018     var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
1019                                       W, bilinear_filters[x_offset]);          \
1020     var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,              \
1021                                        bilinear_filters[y_offset]);            \
1022                                                                                \
1023     return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse);    \
1024   }
1025 
1026 SUBPIX_VAR(64, 64)
1027 SUBPIX_VAR(64, 32)
1028 SUBPIX_VAR(32, 64)
1029 SUBPIX_VAR(32, 32)
1030 SUBPIX_VAR(32, 16)
1031 SUBPIX_VAR(16, 32)
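
/* A minimal sketch of the bilinear tap arithmetic both filter passes are
 * assumed to implement (the standard vpx formulation with FILTER_BITS == 7).
 * The 0x40 rounding constant (ff_ph_40) and the shift count of 7 loaded in
 * the MMI helpers below perform the same rounding step.
 *
 *   horizontal (first) pass, 16-bit intermediates:
 *     fdata3[j] = (src[j] * fx0 + src[j + 1] * fx1 + 64) >> 7;
 *   vertical (second) pass, 8-bit output:
 *     temp2[j]  = (fdata3[j] * fy0 + fdata3[j + W] * fy1 + 64) >> 7;
 *
 *   where (fx0, fx1) = bilinear_filters[x_offset] and
 *         (fy0, fy1) = bilinear_filters[y_offset].
 */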
1032 
1033 static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
1034                                               int src_stride, int x_offset,
1035                                               int y_offset, uint8_t *temp2,
1036                                               int counter) {
1037   uint8_t *temp2_ptr = temp2;
1038   mips_reg l_counter = counter;
1039   double ftmp[15];
1040   double ff_ph_40, mask;
1041   double filter_x0, filter_x1, filter_y0, filter_y1;
1042   mips_reg tmp[2];
1043   uint64_t x0, x1, y0, y1, all;
1044 
1045   const uint8_t *filter_x = bilinear_filters[x_offset];
1046   const uint8_t *filter_y = bilinear_filters[y_offset];
1047   x0 = (uint64_t)filter_x[0];
1048   x1 = (uint64_t)filter_x[1];
1049   y0 = (uint64_t)filter_y[0];
1050   y1 = (uint64_t)filter_y[1];
1051   all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
1052 
1053   /* clang-format off */
1054   __asm__ volatile (
1055     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1056     MMI_MTC1(%[all], %[ftmp14])
1057     "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
1058     "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
1059     MMI_LI(%[tmp0], 0x10)
1060     MMI_MTC1(%[tmp0], %[mask])
1061     "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
1062     "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
1063     "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
1064     "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
1065     "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
1066     "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
1067     MMI_LI(%[tmp0], 0x07)
1068     MMI_MTC1(%[tmp0], %[ftmp14])
1069     MMI_LI(%[tmp0], 0x0040004000400040)
1070     MMI_MTC1(%[tmp0], %[ff_ph_40])
1071     MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
1072     MMI_MTC1(%[tmp0], %[mask])
1073     // fdata3: fdata3[0] ~ fdata3[15]
1074     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
1075 
1076     // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15]
1077     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1078     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
1079     // temp2: temp2[0] ~ temp2[15]
1080     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
1081 
1082     // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15]
1083     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1084     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
1085     // temp2+16*1: temp2[0] ~ temp2[15]
1086     MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
1087     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
1088 
1089     "1:                                                         \n\t"
1090     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1091     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
1092     MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
1093     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
1094 
1095     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1096     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
1097     MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
1098     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
1099     "addiu      %[counter], %[counter],     -0x01               \n\t"
1100     "bnez       %[counter], 1b                                  \n\t"
1101     : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
1102       [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
1103       [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
1104       [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
1105       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
1106       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
1107       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
1108       [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
1109       [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
1110       [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
1111     : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
1112     : "memory"
1113   );
1114   /* clang-format on */
1115 }
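
/* Row accounting for the loop above (and for the 8-wide and 4-wide variants
 * below): two output rows are produced before the "1:" label and two more per
 * loop iteration, so a counter of ((H) - 2) / 2 yields H output rows in
 * total, e.g. H == 16 gives 2 + 2 * 7 == 16.
 */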
1116 
1117 #define SUBPIX_VAR16XN(H)                                                      \
1118   uint32_t vpx_sub_pixel_variance16x##H##_mmi(                                 \
1119       const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
1120       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                 \
1121     uint8_t temp2[16 * (H)];                                                   \
1122     var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \
1123                                ((H)-2) / 2);                                   \
1124                                                                                \
1125     return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse);      \
1126   }
1127 
1128 SUBPIX_VAR16XN(16)
1129 SUBPIX_VAR16XN(8)
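
/* Usage note (illustrative, not part of the build): x_offset and y_offset are
 * eighth-pel positions, i.e. indices 0..7 into bilinear_filters[] above.  A
 * hypothetical caller evaluating the half-pel (4, 4) position of a 16x16
 * block might look like:
 *
 *   uint32_t sse;
 *   uint32_t var = vpx_sub_pixel_variance16x16_mmi(src, src_stride, 4, 4,
 *                                                  ref, ref_stride, &sse);
 */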
1130 
1131 static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
1132                                              int src_stride, int x_offset,
1133                                              int y_offset, uint8_t *temp2,
1134                                              int counter) {
1135   uint8_t *temp2_ptr = temp2;
1136   mips_reg l_counter = counter;
1137   double ftmp[15];
1138   mips_reg tmp[2];
1139   double ff_ph_40, mask;
1140   uint64_t x0, x1, y0, y1, all;
1141   double filter_x0, filter_x1, filter_y0, filter_y1;
1142   const uint8_t *filter_x = bilinear_filters[x_offset];
1143   const uint8_t *filter_y = bilinear_filters[y_offset];
1144   x0 = (uint64_t)filter_x[0];
1145   x1 = (uint64_t)filter_x[1];
1146   y0 = (uint64_t)filter_y[0];
1147   y1 = (uint64_t)filter_y[1];
1148   all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
1149 
1150   /* clang-format off */
1151   __asm__ volatile (
1152     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1153     MMI_MTC1(%[all], %[ftmp14])
1154     "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
1155     "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
1156     MMI_LI(%[tmp0], 0x10)
1157     MMI_MTC1(%[tmp0], %[mask])
1158     "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
1159     "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
1160     "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
1161     "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
1162     "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
1163     "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
1164     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1165     MMI_LI(%[tmp0], 0x07)
1166     MMI_MTC1(%[tmp0], %[ftmp14])
1167     MMI_LI(%[tmp0], 0x0040004000400040)
1168     MMI_MTC1(%[tmp0], %[ff_ph_40])
1169     MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
1170     MMI_MTC1(%[tmp0], %[mask])
1171 
1172     // fdata3: fdata3[0] ~ fdata3[7]
1173     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
1174 
1175     // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7]
1176     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1177     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
1178     // temp2: temp2[0] ~ temp2[7]
1179     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
1180 
1181     // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7]
1182     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1183     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
1184     // temp2+8*1: temp2[0] ~ temp2[7]
1185     MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
1186     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
1187 
1188     "1:                                                         \n\t"
1189     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1190     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
1191     MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
1192     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
1193 
1194     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1195     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
1196     MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
1197     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
1198     "addiu      %[counter], %[counter],     -0x01               \n\t"
1199     "bnez       %[counter], 1b                                  \n\t"
1200     : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
1201       [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
1202       [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
1203       [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
1204       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
1205       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
1206       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
1207       [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
1208       [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
1209       [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
1210     : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
1211     : "memory"
1212   );
1213   /* clang-format on */
1214 }
1215 
1216 #define SUBPIX_VAR8XN(H)                                                      \
1217   uint32_t vpx_sub_pixel_variance8x##H##_mmi(                                 \
1218       const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,     \
1219       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                \
1220     uint8_t temp2[8 * (H)];                                                   \
1221     var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \
1222                               ((H)-2) / 2);                                   \
1223                                                                               \
1224     return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse);       \
1225   }
1226 
1227 SUBPIX_VAR8XN(16)
1228 SUBPIX_VAR8XN(8)
1229 SUBPIX_VAR8XN(4)
1230 
1231 static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
1232                                              int src_stride, int x_offset,
1233                                              int y_offset, uint8_t *temp2,
1234                                              int counter) {
1235   uint8_t *temp2_ptr = temp2;
1236   mips_reg l_counter = counter;
1237   double ftmp[7];
1238   mips_reg tmp[2];
1239   double ff_ph_40, mask;
1240   uint64_t x0, x1, y0, y1, all;
1241   double filter_x0, filter_x1, filter_y0, filter_y1;
1242   const uint8_t *filter_x = bilinear_filters[x_offset];
1243   const uint8_t *filter_y = bilinear_filters[y_offset];
1244   x0 = (uint64_t)filter_x[0];
1245   x1 = (uint64_t)filter_x[1];
1246   y0 = (uint64_t)filter_y[0];
1247   y1 = (uint64_t)filter_y[1];
1248   all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
1249 
1250   /* clang-format off */
1251   __asm__ volatile (
1252     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1253     MMI_MTC1(%[all], %[ftmp6])
1254     "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1255     "pshufh     %[filter_x0], %[ftmp6],     %[ftmp0]            \n\t"
1256     MMI_LI(%[tmp0], 0x10)
1257     MMI_MTC1(%[tmp0], %[mask])
1258     "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
1259     "pshufh     %[filter_x1], %[ftmp6],     %[ftmp0]            \n\t"
1260     "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
1261     "pshufh     %[filter_y0], %[ftmp6],     %[ftmp0]            \n\t"
1262     "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
1263     "pshufh     %[filter_y1], %[ftmp6],     %[ftmp0]            \n\t"
1264     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1265     MMI_LI(%[tmp0], 0x07)
1266     MMI_MTC1(%[tmp0], %[ftmp6])
1267     MMI_LI(%[tmp0], 0x0040004000400040)
1268     MMI_MTC1(%[tmp0], %[ff_ph_40])
1269     MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
1270     MMI_MTC1(%[tmp0], %[mask])
1271     // fdata3: fdata3[0] ~ fdata3[3]
1272     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
1273 
1274     // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3]
1275     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1276     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
1277     // temp2: temp2[0] ~ temp2[7]
1278     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
1279 
1280     // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3]
1281     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1282     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
1283     // temp2+4*1: temp2[0] ~ temp2[7]
1284     MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
1285     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
1286 
1287     "1:                                                         \n\t"
1288     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1289     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
1290     MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
1291     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
1292 
1293     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1294     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
1295     MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
1296     VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
1297     "addiu      %[counter], %[counter],     -0x01               \n\t"
1298     "bnez       %[counter], 1b                                  \n\t"
1299     : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
1300       [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
1301       [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
1302       [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
1303       [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
1304       [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
1305       [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
1306     : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
1307     : "memory"
1308   );
1309   /* clang-format on */
1310 }
1311 
1312 #define SUBPIX_VAR4XN(H)                                                      \
1313   uint32_t vpx_sub_pixel_variance4x##H##_mmi(                                 \
1314       const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,     \
1315       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                \
1316     uint8_t temp2[4 * (H)];                                                   \
1317     var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \
1318                               ((H)-2) / 2);                                   \
1319                                                                               \
1320     return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse);       \
1321   }
1322 
1323 SUBPIX_VAR4XN(8)
1324 SUBPIX_VAR4XN(4)
1325 
1326 #define SUBPIX_AVG_VAR(W, H)                                                   \
1327   uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi(                          \
1328       const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
1329       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,                   \
1330       const uint8_t *second_pred) {                                            \
1331     uint16_t fdata3[((H) + 1) * (W)];                                          \
1332     uint8_t temp2[(H) * (W)];                                                  \
1333     DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]);                            \
1334                                                                                \
1335     var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
1336                                       W, bilinear_filters[x_offset]);          \
1337     var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,              \
1338                                        bilinear_filters[y_offset]);            \
1339                                                                                \
1340     vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W);                   \
1341                                                                                \
1342     return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse);    \
1343   }
1344 
1345 SUBPIX_AVG_VAR(64, 64)
1346 SUBPIX_AVG_VAR(64, 32)
1347 SUBPIX_AVG_VAR(32, 64)
1348 SUBPIX_AVG_VAR(32, 32)
1349 SUBPIX_AVG_VAR(32, 16)
1350 SUBPIX_AVG_VAR(16, 32)
1351 SUBPIX_AVG_VAR(16, 16)
1352 SUBPIX_AVG_VAR(16, 8)
1353 SUBPIX_AVG_VAR(8, 16)
1354 SUBPIX_AVG_VAR(8, 8)
1355 SUBPIX_AVG_VAR(8, 4)
1356 SUBPIX_AVG_VAR(4, 8)
1357 SUBPIX_AVG_VAR(4, 4)
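
/* For reference only (not compiled): the compound-average step above relies
 * on vpx_comp_avg_pred_c, which, per the C reference in vpx_dsp, rounds the
 * average of the filtered block and second_pred to 8 bits:
 *
 *   temp3[j] = (temp2[j] + second_pred[j] + 1) >> 1;
 *
 * The variance is then taken against ref_ptr exactly as in SUBPIX_VAR.
 */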
1358