1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/variance.h"
13 #include "vpx_ports/mem.h"
14 #include "vpx/vpx_integer.h"
15 #include "vpx_ports/asmdefs_mmi.h"
16
17 static const uint8_t bilinear_filters[8][2] = {
18 { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
19 { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
20 };
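/* Each row of bilinear_filters holds the two taps for one eighth-pel offset;
 * the taps in every row sum to 128 (FILTER_WEIGHT, i.e. 1 << FILTER_BITS), so
 * each pair forms a weighted average of two neighbouring pixels.  As a worked
 * example with hypothetical inputs (not library data): filtering pixels 100
 * and 200 at offset 3 gives ROUND_POWER_OF_TWO(100 * 80 + 200 * 48,
 * FILTER_BITS) == (8000 + 9600 + 64) >> 7 == 138. */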
21
22 /* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32 and
23    vpx_variance32x64; VARIANCE_SSE_SUM_8 would overflow its 16-bit sums there (see the arithmetic note after this macro). */
24 #define VARIANCE_SSE_SUM_8_FOR_W64 \
25 /* sse */ \
26 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
27 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
28 "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
29 "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
30 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
31 "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \
32 "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \
33 \
34 /* sum */ \
35 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
36 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
37 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
38 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
39 "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
40 "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \
41 "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
42 "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
43 "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \
44 "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \
45 "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \
46 "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \
47 "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
48 "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
49 "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \
50 "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \
51 "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \
52 "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \
53 "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \
54 "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
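/* Rough worst-case arithmetic behind the comment above (assuming all-255
 * inputs): VARIANCE_SSE_SUM_8 accumulates the running pixel sums in 16-bit
 * lanes with paddh, and each lane picks up two bytes per invocation.  A 64x64
 * block feeds 2 * (64 / 8) * 64 * 255 = 261120 into a lane and 32x64 feeds
 * 2 * (32 / 8) * 64 * 255 = 130560, both beyond the 65535 a 16-bit lane can
 * hold, so this macro widens the differences to 32-bit words before
 * accumulating. */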
55
56 #define VARIANCE_SSE_SUM_4 \
57 /* sse */ \
58 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
59 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
60 "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \
61 "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \
62 \
63 /* sum */ \
64 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
65 "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
66 "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \
67 "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
68
69 #define VARIANCE_SSE_SUM_8 \
70 /* sse */ \
71 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
72 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
73 "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
74 "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
75 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
76 "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
77 "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \
78 \
79 /* sum */ \
80 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
81 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
82 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
83 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
84 "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \
85 "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \
86 "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \
87 "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t"
88
89 #define VARIANCE_SSE_8 \
90 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
91 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
92 "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \
93 "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \
94 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
95 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
96 "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
97 "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
98 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
99 "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
100 "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
101
102 #define VARIANCE_SSE_16 \
103 VARIANCE_SSE_8 \
104 "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
105 "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
106 "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \
107 "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \
108 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
109 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
110 "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
111 "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
112 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
113 "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
114 "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
115
116 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \
117 /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \
118 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
119 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
120 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
121 "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
122 "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
123 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
124 "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
125 "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
126 "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \
127 "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
128 "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
129
130 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \
131 /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \
132 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
133 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
134 "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
135 "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
136 "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
137 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
138 "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
139 "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
140 "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
141 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
142 "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
143
144 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \
145 /* calculate: temp2[0] ~ temp2[3] */ \
146 "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
147 "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
148 "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
149 "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
150 "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \
151 \
152 /* store: temp2[0] ~ temp2[3] */ \
153 "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \
154 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
155 "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
156
157 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \
158 /* calculate: temp2[0] ~ temp2[3] */ \
159 "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
160 "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
161 "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
162 "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
163 "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
164 \
165 /* store: temp2[0] ~ temp2[3] */ \
166 "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \
167 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
168 "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t"
169
170 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
171 /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
172 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
173 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
174 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
175 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
176 "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
177 "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
178 "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
179 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
180 "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
181 "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \
182 "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
183 "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
184 "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \
185 "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
186 "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
187 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
188 "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
189 "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t"
190
191 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
192 /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
193 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
194 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
195 "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \
196 "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \
197 "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
198 "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
199 "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
200 "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
201 "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \
202 "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \
203 "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
204 "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
205 "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \
206 "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \
207 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \
208 "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \
209 "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
210 "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t"
211
212 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
213 /* calculate: temp2[0] ~ temp2[3] */ \
214 "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
215 "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
216 "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \
217 "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
218 "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
219 \
220 /* calculate: temp2[4] ~ temp2[7] */ \
221 "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \
222 "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
223 "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \
224 "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \
225 "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \
226 \
227 /* store: temp2[0] ~ temp2[7] */ \
228 "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \
229 "pand %[ftmp3], %[ftmp3], %[mask] \n\t" \
230 "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
231 "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \
232 "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
233
234 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
235 /* calculate: temp2[0] ~ temp2[3] */ \
236 "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \
237 "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
238 "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
239 "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \
240 "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
241 \
242 /* calculate: temp2[4] ~ temp2[7] */ \
243 "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \
244 "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
245 "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \
246 "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \
247 "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \
248 \
249 /* store: temp2[0] ~ temp2[7] */ \
250 "pand %[ftmp8], %[ftmp8], %[mask] \n\t" \
251 "pand %[ftmp9], %[ftmp9], %[mask] \n\t" \
252 "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
253 "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \
254 "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t"
255
256 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \
257 /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
258 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
259 \
260 /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \
261 "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
262 "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
263 "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
264 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
265 "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \
266 "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \
267 "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
268 "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \
269 "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
270 "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \
271 "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
272 "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
273 "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \
274 "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \
275 "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
276 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
277 "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
278 "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t"
279
280 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \
281 /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
282 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
283 \
284 /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \
285 "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
286 "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
287 "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
288 "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
289 "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \
290 "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \
291 "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \
292 "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \
293 "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \
294 "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \
295 "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
296 "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
297 "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \
298 "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \
299 "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \
300 "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \
301 "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
302 "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t"
303
304 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \
305 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
306 \
307 /* calculate: temp2[8] ~ temp2[11] */ \
308 "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
309 "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
310 "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \
311 "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
312 "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
313 \
314 /* calculate: temp2[12] ~ temp2[15] */ \
315 "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \
316 "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
317 "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \
318 "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
319 "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \
320 \
321 /* store: temp2[8] ~ temp2[15] */ \
322 "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \
323 "pand %[ftmp5], %[ftmp5], %[mask] \n\t" \
324 "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
325 "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \
326 "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t"
327
328 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \
329 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
330 \
331 /* calculate: temp2[8] ~ temp2[11] */ \
332 "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \
333 "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
334 "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
335 "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
336 "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
337 \
338 /* calculate: temp2[12] ~ temp2[15] */ \
339 "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \
340 "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
341 "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \
342 "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \
343 "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \
344 \
345 /* store: temp2[8] ~ temp2[15] */ \
346 "pand %[ftmp10], %[ftmp10], %[mask] \n\t" \
347 "pand %[ftmp11], %[ftmp11], %[mask] \n\t" \
348 "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
349 "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \
350 "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t"
351
352 // Applies a 1-D, 2-tap bilinear filter to the source block in either the
353 // horizontal or the vertical direction to produce the filtered output block.
354 // Used to implement the first pass of the 2-D separable filter.
355 //
356 // Produces 16-bit (uint16_t) output to retain precision for the next pass.
357 // The two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether
358 // the filter is applied horizontally (pixel_step = 1) or vertically
359 // (pixel_step = stride), i.e. the offset needed to move from one input to the next.
360 static void var_filter_block2d_bil_first_pass(
361 const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line,
362 int pixel_step, unsigned int output_height, unsigned int output_width,
363 const uint8_t *filter) {
364 unsigned int i, j;
365
366 for (i = 0; i < output_height; ++i) {
367 for (j = 0; j < output_width; ++j) {
368 ref_ptr[j] = ROUND_POWER_OF_TWO(
369 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
370 FILTER_BITS);
371
372 ++src_ptr;
373 }
374
375 src_ptr += src_pixels_per_line - output_width;
376 ref_ptr += output_width;
377 }
378 }
379
380 // Applies a 1-D, 2-tap bilinear filter to the source block in either the
381 // horizontal or the vertical direction to produce the filtered output block.
382 // Used to implement the second pass of the 2-D separable filter.
383 //
384 // Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. The
385 // two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
386 // filter is applied horizontally (pixel_step = 1) or vertically
387 // (pixel_step = stride), i.e. the offset needed to move from one input to the
388 // next. Output is 8-bit.
389 static void var_filter_block2d_bil_second_pass(
390 const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line,
391 unsigned int pixel_step, unsigned int output_height,
392 unsigned int output_width, const uint8_t *filter) {
393 unsigned int i, j;
394
395 for (i = 0; i < output_height; ++i) {
396 for (j = 0; j < output_width; ++j) {
397 ref_ptr[j] = ROUND_POWER_OF_TWO(
398 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
399 FILTER_BITS);
400 ++src_ptr;
401 }
402
403 src_ptr += src_pixels_per_line - output_width;
404 ref_ptr += output_width;
405 }
406 }
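/* These two C helpers implement the generic separable bilinear filter used by
 * the SUBPIX_VAR and SUBPIX_AVG_VAR wrappers below; the
 * var_filter_block2d_bil_16x/8x/4x routines later in this file are MMI
 * variants that fuse both passes and write the 8-bit result directly. */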
407
408 static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
409 const uint8_t *ref_ptr, int ref_stride,
410 uint32_t *sse, int high) {
411 int sum;
412 double ftmp[12];
413 uint32_t tmp[3];
414
415 *sse = 0;
416
417 /* clang-format off */
418 __asm__ volatile (
419 "li %[tmp0], 0x20 \n\t"
420 "mtc1 %[tmp0], %[ftmp11] \n\t"
421 MMI_L(%[tmp0], %[high], 0x00)
422 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
423 "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
424 "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
425 "1: \n\t"
426 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
427 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
428 "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
429 "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
430 VARIANCE_SSE_SUM_8_FOR_W64
431
432 "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
433 "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
434 "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
435 "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
436 VARIANCE_SSE_SUM_8_FOR_W64
437
438 "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
439 "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
440 "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
441 "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
442 VARIANCE_SSE_SUM_8_FOR_W64
443
444 "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
445 "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
446 "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
447 "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
448 VARIANCE_SSE_SUM_8_FOR_W64
449
450 "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t"
451 "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t"
452 "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t"
453 "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t"
454 VARIANCE_SSE_SUM_8_FOR_W64
455
456 "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t"
457 "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t"
458 "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t"
459 "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t"
460 VARIANCE_SSE_SUM_8_FOR_W64
461
462 "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t"
463 "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t"
464 "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t"
465 "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t"
466 VARIANCE_SSE_SUM_8_FOR_W64
467
468 "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t"
469 "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t"
470 "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t"
471 "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t"
472 VARIANCE_SSE_SUM_8_FOR_W64
473
474 "addiu %[tmp0], %[tmp0], -0x01 \n\t"
475 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
476 MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
477 "bnez %[tmp0], 1b \n\t"
478
479 "mfc1 %[tmp1], %[ftmp9] \n\t"
480 "mfhc1 %[tmp2], %[ftmp9] \n\t"
481 "addu %[sum], %[tmp1], %[tmp2] \n\t"
482 "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
483 "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
484 "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
485 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
486 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
487 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
488 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
489 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
490 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
491 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
492 [tmp2]"=&r"(tmp[2]),
493 [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr),
494 [sum]"=&r"(sum)
495 : [src_stride]"r"((mips_reg)src_stride),
496 [ref_stride]"r"((mips_reg)ref_stride),
497 [high]"r"(&high), [sse]"r"(sse)
498 : "memory"
499 );
500 /* clang-format on */
501
502 return *sse - (((int64_t)sum * sum) / (64 * high));
503 }
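/* A scalar sketch of what the MMI kernel above computes, kept for readability
 * only (not compiled; the helper name is illustrative, not part of the
 * library): */
#if 0
static uint32_t variance64_scalar(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  uint32_t *sse, int h) {
  int i, j;
  int64_t sum = 0;  /* signed sum of differences */
  uint32_t sq = 0;  /* sum of squared differences */
  for (i = 0; i < h; ++i) {
    for (j = 0; j < 64; ++j) {
      const int d = src[j] - ref[j];
      sum += d;
      sq += (uint32_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sq;
  /* variance = SSE minus the mean-corrected term, as returned above. */
  return *sse - (uint32_t)(((int64_t)sum * sum) / (64 * h));
}
#endif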
504
505 #define VPX_VARIANCE64XN(n) \
506 uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
507 const uint8_t *ref_ptr, int ref_stride, \
508 uint32_t *sse) { \
509 return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
510 }
511
512 VPX_VARIANCE64XN(64)
513 VPX_VARIANCE64XN(32)
514
515 uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
516 const uint8_t *ref_ptr, int ref_stride,
517 uint32_t *sse) {
518 int sum;
519 double ftmp[12];
520 uint32_t tmp[3];
521
522 *sse = 0;
523
524 /* clang-format off */
525 __asm__ volatile (
526 "li %[tmp0], 0x20 \n\t"
527 "mtc1 %[tmp0], %[ftmp11] \n\t"
528 "li %[tmp0], 0x40 \n\t"
529 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
530 "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
531 "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
532 "1: \n\t"
533 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
534 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
535 "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
536 "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
537 VARIANCE_SSE_SUM_8_FOR_W64
538
539 "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
540 "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
541 "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
542 "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
543 VARIANCE_SSE_SUM_8_FOR_W64
544
545 "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
546 "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
547 "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
548 "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
549 VARIANCE_SSE_SUM_8_FOR_W64
550
551 "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
552 "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
553 "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
554 "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
555 VARIANCE_SSE_SUM_8_FOR_W64
556
557 "addiu %[tmp0], %[tmp0], -0x01 \n\t"
558 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
559 MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
560 "bnez %[tmp0], 1b \n\t"
561
562 "mfc1 %[tmp1], %[ftmp9] \n\t"
563 "mfhc1 %[tmp2], %[ftmp9] \n\t"
564 "addu %[sum], %[tmp1], %[tmp2] \n\t"
565 "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
566 "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
567 "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
568 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
569 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
570 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
571 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
572 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
573 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
574 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
575 [tmp2]"=&r"(tmp[2]),
576 [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr),
577 [sum]"=&r"(sum)
578 : [src_stride]"r"((mips_reg)src_stride),
579 [ref_stride]"r"((mips_reg)ref_stride),
580 [sse]"r"(sse)
581 : "memory"
582 );
583 /* clang-format on */
584
585 return *sse - (((int64_t)sum * sum) / 2048);
586 }
587
588 static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
589 const uint8_t *ref_ptr, int ref_stride,
590 uint32_t *sse, int high) {
591 int sum;
592 double ftmp[13];
593 uint32_t tmp[3];
594
595 *sse = 0;
596
597 /* clang-format off */
598 __asm__ volatile (
599 "li %[tmp0], 0x20 \n\t"
600 "mtc1 %[tmp0], %[ftmp11] \n\t"
601 MMI_L(%[tmp0], %[high], 0x00)
602 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
603 "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
604 "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
605 "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
606 "1: \n\t"
607 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
608 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
609 "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
610 "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
611 VARIANCE_SSE_SUM_8
612 "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
613 "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
614 "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
615 "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
616 VARIANCE_SSE_SUM_8
617 "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
618 "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
619 "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
620 "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
621 VARIANCE_SSE_SUM_8
622 "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
623 "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
624 "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
625 "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
626 VARIANCE_SSE_SUM_8
627
628 "addiu %[tmp0], %[tmp0], -0x01 \n\t"
629 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
630 MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
631 "bnez %[tmp0], 1b \n\t"
632
633 "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
634 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
635 "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
636
637 "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
638 "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
639 "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
640 "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
641 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
642 "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
643 "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
644 "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
645 "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
646 "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
647
648 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
649 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
650 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
651 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
652 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
653 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
654 [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
655 [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
656 : [src_stride]"r"((mips_reg)src_stride),
657 [ref_stride]"r"((mips_reg)ref_stride),
658 [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
659 : "memory"
660 );
661 /* clang-format on */
662
663 return *sse - (((int64_t)sum * sum) / (32 * high));
664 }
665
666 #define VPX_VARIANCE32XN(n) \
667 uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
668 const uint8_t *ref_ptr, int ref_stride, \
669 uint32_t *sse) { \
670 return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
671 }
672
673 VPX_VARIANCE32XN(32)
674 VPX_VARIANCE32XN(16)
675
676 static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
677 const uint8_t *ref_ptr, int ref_stride,
678 uint32_t *sse, int high) {
679 int sum;
680 double ftmp[13];
681 uint32_t tmp[3];
682
683 *sse = 0;
684
685 /* clang-format off */
686 __asm__ volatile (
687 "li %[tmp0], 0x20 \n\t"
688 "mtc1 %[tmp0], %[ftmp11] \n\t"
689 MMI_L(%[tmp0], %[high], 0x00)
690 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
691 "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
692 "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
693 "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
694 "1: \n\t"
695 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
696 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
697 "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
698 "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
699 VARIANCE_SSE_SUM_8
700 "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
701 "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
702 "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
703 "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
704 VARIANCE_SSE_SUM_8
705
706 "addiu %[tmp0], %[tmp0], -0x01 \n\t"
707 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
708 MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
709 "bnez %[tmp0], 1b \n\t"
710
711 "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
712 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
713 "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
714
715 "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
716 "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
717 "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
718 "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
719 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
720 "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
721 "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
722 "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
723 "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
724 "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
725
726 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
727 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
728 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
729 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
730 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
731 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
732 [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
733 [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
734 : [src_stride]"r"((mips_reg)src_stride),
735 [ref_stride]"r"((mips_reg)ref_stride),
736 [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
737 : "memory"
738 );
739 /* clang-format on */
740
741 return *sse - (((int64_t)sum * sum) / (16 * high));
742 }
743
744 #define VPX_VARIANCE16XN(n) \
745 uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
746 const uint8_t *ref_ptr, int ref_stride, \
747 uint32_t *sse) { \
748 return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
749 }
750
751 VPX_VARIANCE16XN(32)
752 VPX_VARIANCE16XN(16)
753 VPX_VARIANCE16XN(8)
754
755 static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
756 const uint8_t *ref_ptr, int ref_stride,
757 uint32_t *sse, int high) {
758 int sum;
759 double ftmp[13];
760 uint32_t tmp[3];
761
762 *sse = 0;
763
764 /* clang-format off */
765 __asm__ volatile (
766 "li %[tmp0], 0x20 \n\t"
767 "mtc1 %[tmp0], %[ftmp11] \n\t"
768 MMI_L(%[tmp0], %[high], 0x00)
769 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
770 "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
771 "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
772 "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
773 "1: \n\t"
774 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
775 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
776 "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
777 "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
778 VARIANCE_SSE_SUM_8
779
780 "addiu %[tmp0], %[tmp0], -0x01 \n\t"
781 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
782 MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
783 "bnez %[tmp0], 1b \n\t"
784
785 "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
786 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
787 "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
788
789 "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
790 "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
791 "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
792 "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
793 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
794 "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
795 "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
796 "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
797 "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
798 "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
799
800 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
801 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
802 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
803 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
804 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
805 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
806 [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
807 [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
808 : [src_stride]"r"((mips_reg)src_stride),
809 [ref_stride]"r"((mips_reg)ref_stride),
810 [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
811 : "memory"
812 );
813 /* clang-format on */
814
815 return *sse - (((int64_t)sum * sum) / (8 * high));
816 }
817
818 #define VPX_VARIANCE8XN(n) \
819 uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
820 const uint8_t *ref_ptr, int ref_stride, \
821 uint32_t *sse) { \
822 return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
823 }
824
825 VPX_VARIANCE8XN(16)
826 VPX_VARIANCE8XN(8)
827 VPX_VARIANCE8XN(4)
828
829 static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
830 const uint8_t *ref_ptr, int ref_stride,
831 uint32_t *sse, int high) {
832 int sum;
833 double ftmp[12];
834 uint32_t tmp[3];
835
836 *sse = 0;
837
838 /* clang-format off */
839 __asm__ volatile (
840 "li %[tmp0], 0x20 \n\t"
841 "mtc1 %[tmp0], %[ftmp10] \n\t"
842 MMI_L(%[tmp0], %[high], 0x00)
843 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
844 "pxor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
845 "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
846 "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
847 "1: \n\t"
848 "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
849 "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
850 "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
851 "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
852 VARIANCE_SSE_SUM_4
853
854 "addiu %[tmp0], %[tmp0], -0x01 \n\t"
855 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
856 MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
857 "bnez %[tmp0], 1b \n\t"
858
859 "ssrld %[ftmp9], %[ftmp6], %[ftmp10] \n\t"
860 "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
861 "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
862
863 "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t"
864 "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t"
865 "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t"
866 "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t"
867 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
868 "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
869 "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
870 "ssrld %[ftmp0], %[ftmp3], %[ftmp10] \n\t"
871 "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
872 "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
873 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
874 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
875 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
876 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
877 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
878 [ftmp10]"=&f"(ftmp[10]),
879 [tmp0]"=&r"(tmp[0]),
880 [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
881 : [src_stride]"r"((mips_reg)src_stride),
882 [ref_stride]"r"((mips_reg)ref_stride),
883 [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
884 : "memory"
885 );
886 /* clang-format on */
887
888 return *sse - (((int64_t)sum * sum) / (4 * high));
889 }
890
891 #define VPX_VARIANCE4XN(n) \
892 uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
893 const uint8_t *ref_ptr, int ref_stride, \
894 uint32_t *sse) { \
895 return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
896 }
897
898 VPX_VARIANCE4XN(8)
899 VPX_VARIANCE4XN(4)
900
901 static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
902 const uint8_t *ref_ptr, int ref_stride,
903 uint32_t *sse, uint64_t high) {
904 double ftmp[12];
905 uint32_t tmp[1];
906
907 *sse = 0;
908
909 /* clang-format off */
910 __asm__ volatile (
911 "li %[tmp0], 0x20 \n\t"
912 "mtc1 %[tmp0], %[ftmp11] \n\t"
913 MMI_L(%[tmp0], %[high], 0x00)
914 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
915 "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
916
917 "1: \n\t"
918 VARIANCE_SSE_16
919
920 "addiu %[tmp0], %[tmp0], -0x01 \n\t"
921 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
922 MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
923 "bnez %[tmp0], 1b \n\t"
924
925 "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
926 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
927 "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
928 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
929 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
930 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
931 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
932 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
933 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
934 [tmp0]"=&r"(tmp[0]),
935 [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
936 : [src_stride]"r"((mips_reg)src_stride),
937 [ref_stride]"r"((mips_reg)ref_stride),
938 [high]"r"(&high), [sse]"r"(sse)
939 : "memory"
940 );
941 /* clang-format on */
942
943 return *sse;
944 }
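/* Unlike the variance helpers above, MSE needs no mean correction: the block
 * mean is not subtracted, so these functions simply return the accumulated
 * sum of squared differences (*sse). */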
945
946 #define vpx_mse16xN(n) \
947 uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
948 const uint8_t *ref_ptr, int ref_stride, \
949 uint32_t *sse) { \
950 return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
951 }
952
953 vpx_mse16xN(16);
954 vpx_mse16xN(8);
955
956 static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
957 const uint8_t *ref_ptr, int ref_stride,
958 uint32_t *sse, uint64_t high) {
959 double ftmp[12];
960 uint32_t tmp[1];
961
962 *sse = 0;
963
964 /* clang-format off */
965 __asm__ volatile (
966 "li %[tmp0], 0x20 \n\t"
967 "mtc1 %[tmp0], %[ftmp11] \n\t"
968 MMI_L(%[tmp0], %[high], 0x00)
969 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
970 "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
971
972 "1: \n\t"
973 VARIANCE_SSE_8
974
975 "addiu %[tmp0], %[tmp0], -0x01 \n\t"
976 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
977 MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
978 "bnez %[tmp0], 1b \n\t"
979
980 "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
981 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
982 "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
983 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
984 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
985 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
986 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
987 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
988 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
989 [tmp0]"=&r"(tmp[0]),
990 [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
991 : [src_stride]"r"((mips_reg)src_stride),
992 [ref_stride]"r"((mips_reg)ref_stride),
993 [high]"r"(&high), [sse]"r"(sse)
994 : "memory"
995 );
996 /* clang-format on */
997
998 return *sse;
999 }
1000
1001 #define vpx_mse8xN(n) \
1002 uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
1003 const uint8_t *ref_ptr, int ref_stride, \
1004 uint32_t *sse) { \
1005 return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
1006 }
1007
1008 vpx_mse8xN(16);
1009 vpx_mse8xN(8);
1010
1011 #define SUBPIX_VAR(W, H) \
1012 uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \
1013 const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
1014 const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
1015 uint16_t fdata3[((H) + 1) * (W)]; \
1016 uint8_t temp2[(H) * (W)]; \
1017 \
1018 var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
1019 W, bilinear_filters[x_offset]); \
1020 var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
1021 bilinear_filters[y_offset]); \
1022 \
1023 return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \
1024 }
1025
1026 SUBPIX_VAR(64, 64)
1027 SUBPIX_VAR(64, 32)
1028 SUBPIX_VAR(32, 64)
1029 SUBPIX_VAR(32, 32)
1030 SUBPIX_VAR(32, 16)
1031 SUBPIX_VAR(16, 32)
1032
1033 static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
1034 int src_stride, int x_offset,
1035 int y_offset, uint8_t *temp2,
1036 int counter) {
1037 uint8_t *temp2_ptr = temp2;
1038 mips_reg l_counter = counter;
1039 double ftmp[15];
1040 double ff_ph_40, mask;
1041 double filter_x0, filter_x1, filter_y0, filter_y1;
1042 mips_reg tmp[2];
1043 uint64_t x0, x1, y0, y1, all;
1044
1045 const uint8_t *filter_x = bilinear_filters[x_offset];
1046 const uint8_t *filter_y = bilinear_filters[y_offset];
1047 x0 = (uint64_t)filter_x[0];
1048 x1 = (uint64_t)filter_x[1];
1049 y0 = (uint64_t)filter_y[0];
1050 y1 = (uint64_t)filter_y[1];
1051 all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
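  /* The four 8-bit taps are packed into one word so a single move puts them in
   * an FPU register; the asm below unpacks them to halfwords and broadcasts
   * each tap with pshufh.  For example, x_offset = 2 and y_offset = 5 select
   * the taps {96, 32} and {48, 80}, giving all = 0x50302060. */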
1052
1053 /* clang-format off */
1054 __asm__ volatile (
1055 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1056 MMI_MTC1(%[all], %[ftmp14])
1057 "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
1058 "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
1059 MMI_LI(%[tmp0], 0x10)
1060 MMI_MTC1(%[tmp0], %[mask])
1061 "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
1062 "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
1063 "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
1064 "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
1065 "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
1066 "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
1067 MMI_LI(%[tmp0], 0x07)
1068 MMI_MTC1(%[tmp0], %[ftmp14])
1069 MMI_LI(%[tmp0], 0x0040004000400040)
1070 MMI_MTC1(%[tmp0], %[ff_ph_40])
1071 MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
1072 MMI_MTC1(%[tmp0], %[mask])
1073 // fdata3: fdata3[0] ~ fdata3[15]
1074 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
1075
1076 // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15]
1077 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1078 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
1079 // temp2: temp2[0] ~ temp2[15]
1080 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
1081
1082 // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15]
1083 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1084 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
1085 // temp2+16*1: temp2[0] ~ temp2[15]
1086 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
1087 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
1088
1089 "1: \n\t"
1090 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1091 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
1092 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
1093 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
1094
1095 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1096 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
1097 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
1098 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
1099 "addiu %[counter], %[counter], -0x01 \n\t"
1100 "bnez %[counter], 1b \n\t"
1101 : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
1102 [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
1103 [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
1104 [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
1105 [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
1106 [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
1107 [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
1108 [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
1109 [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
1110 [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
1111 : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
1112 : "memory"
1113 );
1114 /* clang-format on */
1115 }
1116
1117 #define SUBPIX_VAR16XN(H) \
1118 uint32_t vpx_sub_pixel_variance16x##H##_mmi( \
1119 const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
1120 const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
1121 uint8_t temp2[16 * (H)]; \
1122 var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \
1123 ((H)-2) / 2); \
1124 \
1125 return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \
1126 }
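/* The ((H) - 2) / 2 loop count mirrors how var_filter_block2d_bil_16x is
 * structured: it writes two output rows before entering its loop and two more
 * per iteration, so H rows need (H - 2) / 2 iterations.  The 8x and 4x
 * helpers below follow the same pattern. */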
1127
1128 SUBPIX_VAR16XN(16)
1129 SUBPIX_VAR16XN(8)
1130
1131 static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
1132 int src_stride, int x_offset,
1133 int y_offset, uint8_t *temp2,
1134 int counter) {
1135 uint8_t *temp2_ptr = temp2;
1136 mips_reg l_counter = counter;
1137 double ftmp[15];
1138 mips_reg tmp[2];
1139 double ff_ph_40, mask;
1140 uint64_t x0, x1, y0, y1, all;
1141 double filter_x0, filter_x1, filter_y0, filter_y1;
1142 const uint8_t *filter_x = bilinear_filters[x_offset];
1143 const uint8_t *filter_y = bilinear_filters[y_offset];
1144 x0 = (uint64_t)filter_x[0];
1145 x1 = (uint64_t)filter_x[1];
1146 y0 = (uint64_t)filter_y[0];
1147 y1 = (uint64_t)filter_y[1];
1148 all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
1149
1150 /* clang-format off */
1151 __asm__ volatile (
1152 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1153 MMI_MTC1(%[all], %[ftmp14])
1154 "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
1155 "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
1156 MMI_LI(%[tmp0], 0x10)
1157 MMI_MTC1(%[tmp0], %[mask])
1158 "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
1159 "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
1160 "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
1161 "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
1162 "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
1163 "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
1164 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1165 MMI_LI(%[tmp0], 0x07)
1166 MMI_MTC1(%[tmp0], %[ftmp14])
1167 MMI_LI(%[tmp0], 0x0040004000400040)
1168 MMI_MTC1(%[tmp0], %[ff_ph_40])
1169 MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
1170 MMI_MTC1(%[tmp0], %[mask])
1171
1172 // fdata3: fdata3[0] ~ fdata3[7]
1173 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
1174
1175 // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7]
1176 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1177 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
1178 // temp2: temp2[0] ~ temp2[7]
1179 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
1180
1181 // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7]
1182 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1183 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
1184 // temp2+8*1: temp2[0] ~ temp2[7]
1185 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
1186 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
1187
1188 "1: \n\t"
1189 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1190 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
1191 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
1192 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
1193
1194 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1195 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
1196 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
1197 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
1198 "addiu %[counter], %[counter], -0x01 \n\t"
1199 "bnez %[counter], 1b \n\t"
1200 : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
1201 [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
1202 [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
1203 [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
1204 [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
1205 [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
1206 [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
1207 [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
1208 [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
1209 [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
1210 : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
1211 : "memory"
1212 );
1213 /* clang-format on */
1214 }
1215
1216 #define SUBPIX_VAR8XN(H) \
1217 uint32_t vpx_sub_pixel_variance8x##H##_mmi( \
1218 const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
1219 const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
1220 uint8_t temp2[8 * (H)]; \
1221 var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \
1222 ((H)-2) / 2); \
1223 \
1224 return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \
1225 }
1226
1227 SUBPIX_VAR8XN(16)
1228 SUBPIX_VAR8XN(8)
1229 SUBPIX_VAR8XN(4)
1230
1231 static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
1232 int src_stride, int x_offset,
1233 int y_offset, uint8_t *temp2,
1234 int counter) {
1235 uint8_t *temp2_ptr = temp2;
1236 mips_reg l_counter = counter;
1237 double ftmp[7];
1238 mips_reg tmp[2];
1239 double ff_ph_40, mask;
1240 uint64_t x0, x1, y0, y1, all;
1241 double filter_x0, filter_x1, filter_y0, filter_y1;
1242 const uint8_t *filter_x = bilinear_filters[x_offset];
1243 const uint8_t *filter_y = bilinear_filters[y_offset];
1244 x0 = (uint64_t)filter_x[0];
1245 x1 = (uint64_t)filter_x[1];
1246 y0 = (uint64_t)filter_y[0];
1247 y1 = (uint64_t)filter_y[1];
1248 all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
1249
1250 /* clang-format off */
1251 __asm__ volatile (
1252 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1253 MMI_MTC1(%[all], %[ftmp6])
1254 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1255 "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t"
1256 MMI_LI(%[tmp0], 0x10)
1257 MMI_MTC1(%[tmp0], %[mask])
1258 "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
1259 "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t"
1260 "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
1261 "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t"
1262 "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
1263 "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t"
1264 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1265 MMI_LI(%[tmp0], 0x07)
1266 MMI_MTC1(%[tmp0], %[ftmp6])
1267 MMI_LI(%[tmp0], 0x0040004000400040)
1268 MMI_MTC1(%[tmp0], %[ff_ph_40])
1269 MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
1270 MMI_MTC1(%[tmp0], %[mask])
1271 // fdata3: fdata3[0] ~ fdata3[3]
1272 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
1273
1274 // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3]
1275 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1276 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
1277         // temp2: temp2[0] ~ temp2[3]
1278 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
1279
1280 // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3]
1281 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1282 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
1283         // temp2+4*1: temp2[0] ~ temp2[3]
1284 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
1285 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
1286
1287 "1: \n\t"
1288 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1289 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
1290 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
1291 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
1292
1293 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
1294 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
1295 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
1296 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
1297 "addiu %[counter], %[counter], -0x01 \n\t"
1298 "bnez %[counter], 1b \n\t"
1299 : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
1300 [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
1301 [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
1302 [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
1303 [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
1304 [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
1305 [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
1306 : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
1307 : "memory"
1308 );
1309 /* clang-format on */
1310 }
1311
1312 #define SUBPIX_VAR4XN(H) \
1313 uint32_t vpx_sub_pixel_variance4x##H##_mmi( \
1314 const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
1315 const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
1316 uint8_t temp2[4 * (H)]; \
1317 var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \
1318 ((H)-2) / 2); \
1319 \
1320 return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \
1321 }
1322
1323 SUBPIX_VAR4XN(8)
1324 SUBPIX_VAR4XN(4)
1325
1326 #define SUBPIX_AVG_VAR(W, H) \
1327 uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \
1328 const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
1329 const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
1330 const uint8_t *second_pred) { \
1331 uint16_t fdata3[((H) + 1) * (W)]; \
1332 uint8_t temp2[(H) * (W)]; \
1333 DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]); \
1334 \
1335 var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
1336 W, bilinear_filters[x_offset]); \
1337 var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
1338 bilinear_filters[y_offset]); \
1339 \
1340 vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \
1341 \
1342 return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \
1343 }
1344
1345 SUBPIX_AVG_VAR(64, 64)
1346 SUBPIX_AVG_VAR(64, 32)
1347 SUBPIX_AVG_VAR(32, 64)
1348 SUBPIX_AVG_VAR(32, 32)
1349 SUBPIX_AVG_VAR(32, 16)
1350 SUBPIX_AVG_VAR(16, 32)
1351 SUBPIX_AVG_VAR(16, 16)
1352 SUBPIX_AVG_VAR(16, 8)
1353 SUBPIX_AVG_VAR(8, 16)
1354 SUBPIX_AVG_VAR(8, 8)
1355 SUBPIX_AVG_VAR(8, 4)
1356 SUBPIX_AVG_VAR(4, 8)
1357 SUBPIX_AVG_VAR(4, 4)
1358