1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include <cm/cm.h>
24
// Size of one per-macroblock record in the MB_VProc_Stats surface, expressed
// in DWORDs and in bytes (16 * 4 == 64). hme_frame_downscale uses these to
// compute the record offset of each macroblock.
#define NUM_OF_DWORDS_PER_MB (16)
#define NUM_OF_BYTES_PER_MB (64)

// DWORD element offsets used by the scattered writes in hme_frame_downscale.
// The kernel stages 11 statistic DWORDs per macroblock; they land in DWORDs
// 5..15 of that macroblock's 16-DWORD record.
//
// One macroblock (right-hand neighbour is outside the picture):
//   Stat_offsets   -> DWORDs 5..12  (staged elements 0..7)
//   Stat_offsets_  -> DWORDs 8..15  (staged elements 3..10, overlapping 8..12)
const uint Stat_offsets[8] = {5,6,7,8,9,10,11,12};
const uint Stat_offsets_[8] = {8,9,10,11,12,13,14,15};

// Two horizontally adjacent macroblocks (records at DWORD 0 and DWORD 16):
//   Stat_offsets_2  -> DWORDs 5..15 of the first record, then 21..25 of the second
//   Stat_offsets_2_ -> DWORDs 24..31 of the second record (overlapping 24..25)
const uint Stat_offsets_2[16] = {5,6,7,8,9,10,11,12,13,14,15,21,22,23,24,25};
const uint Stat_offsets_2_[8] = {24,25,26,27,28,29,30,31};
32
33 _GENX_ void
check_flatness(uint MBFlatnessThreshold,matrix_ref<uchar,16,32> field_in,matrix_ref<uint,1,2> field_flatness,matrix_ref<uint,1,2> VarianceLuma,matrix_ref<uint,1,2> PixAvg,matrix_ref<uint,2,4> VarianceLuma_8x8,matrix_ref<uint,2,4> PixAvg_8x8,unsigned short Enable_8x8)34 inline check_flatness(uint MBFlatnessThreshold,
35 matrix_ref<uchar,16,32> field_in,
36 matrix_ref<uint, 1, 2> field_flatness,
37 matrix_ref<uint, 1, 2> VarianceLuma,
38 matrix_ref <uint, 1, 2> PixAvg,
39 matrix_ref<uint, 2, 4> VarianceLuma_8x8,
40 matrix_ref <uint, 2, 4> PixAvg_8x8,
41 unsigned short Enable_8x8)
42 {
43 matrix <ushort, 16, 32> squared;
44 vector <uint, 32> temp_sum;
45 vector <uint, 32> temp_SumOfSquared;
46 matrix <uint, 1, 2> sum;
47 matrix <uint, 1, 2> SumOfSquared;
48 matrix <uint, 2, 4> sum_8x8;
49 matrix <uint, 2, 4> SumOfSquared_8x8;
50
51 squared = field_in * field_in;
52
53 if (!Enable_8x8)
54 {
55 sum(0,0) = cm_sum<uint>(field_in.select<16,1,16,1>(0,0), SAT);
56 SumOfSquared(0,0) = cm_sum<uint>(squared.select<16,1,16,1>(0,0), SAT);
57
58 sum(0,1) = cm_sum<uint>(field_in.select<16,1,16,1>(0,16), SAT);
59 SumOfSquared(0,1) = cm_sum<uint>(squared.select<16,1,16,1>(0,16), SAT);
60
61 PixAvg = sum >> 8;
62 sum *= sum;
63 sum >>= 8;
64
65 VarianceLuma = SumOfSquared - sum;
66 VarianceLuma >>= 8;
67
68 field_flatness = (VarianceLuma < MBFlatnessThreshold);
69 }
70 else
71 {
72 sum_8x8(0, 0) = cm_sum<uint>(field_in.select<8, 1, 8, 1>(0, 0) , SAT);
73 SumOfSquared_8x8(0, 0) = cm_sum<uint>(squared.select<8, 1, 8, 1>(0, 0) , SAT);
74 sum_8x8(0, 1) = cm_sum<uint>(field_in.select<8, 1, 8, 1>(0, 8) , SAT);
75 SumOfSquared_8x8(0, 1) = cm_sum<uint>(squared.select<8, 1, 8, 1>(0, 8) , SAT);
76 sum_8x8(0, 2) = cm_sum<uint>(field_in.select<8, 1, 8, 1>(8, 0) , SAT);
77 SumOfSquared_8x8(0, 2) = cm_sum<uint>(squared.select<8, 1, 8, 1>(8, 0) , SAT);
78 sum_8x8(0, 3) = cm_sum<uint>(field_in.select<8, 1, 8, 1>(8, 8) , SAT);
79 SumOfSquared_8x8(0, 3) = cm_sum<uint>(squared.select<8, 1, 8, 1>(8, 8) , SAT);
80
81 sum(0, 0) = cm_sum<uint>(sum_8x8.select<1, 1, 4, 1>(0, 0), SAT);
82 SumOfSquared(0, 0) = cm_sum<uint>(SumOfSquared_8x8.select<1, 1, 4, 1>(0, 0), SAT);
83 sum_8x8(1, 0) = cm_sum<uint>(field_in.select<8, 1, 8, 1>(0, 16), SAT);
84 SumOfSquared_8x8(1, 0)= cm_sum<uint>(squared.select<8, 1, 8, 1>(0, 16), SAT);
85 sum_8x8(1, 1) = cm_sum<uint>(field_in.select<8, 1, 8, 1>(0, 24), SAT);
86 SumOfSquared_8x8(1, 1)= cm_sum<uint>(squared.select<8, 1, 8, 1>(0, 24), SAT);
87 sum_8x8(1, 2) = cm_sum<uint>(field_in.select<8, 1, 8, 1>(8, 16), SAT);
88 SumOfSquared_8x8(1, 2)= cm_sum<uint>(squared.select<8, 1, 8, 1>(8, 16), SAT);
89 sum_8x8(1, 3) = cm_sum<uint>(field_in.select<8, 1, 8, 1>(8, 24), SAT);
90 SumOfSquared_8x8(1, 3)= cm_sum<uint>(squared.select<8, 1, 8, 1>(8, 24), SAT);
91
92 sum(0, 1) = cm_sum<uint>(sum_8x8.select<1, 1, 4, 1>(1, 0), SAT);
93 SumOfSquared(0, 1) = cm_sum<uint>(SumOfSquared_8x8.select<1, 1, 4, 1>(1, 0), SAT);
94
95 PixAvg = sum >> 8;
96 sum *= sum;
97 sum >>= 8;
98
99 VarianceLuma = SumOfSquared - sum;
100 VarianceLuma >>= 8;
101 field_flatness = (VarianceLuma < MBFlatnessThreshold);
102
103 PixAvg_8x8 = sum_8x8 >> 6;
104 sum_8x8 *= sum_8x8;
105 sum_8x8 >>= 6;
106
107 VarianceLuma_8x8 = SumOfSquared_8x8 - sum_8x8;
108 VarianceLuma_8x8 >>= 6;
109 }
110 }
111
hme_frame_downscale(unsigned short width,unsigned short height,SurfaceIndex InputBuf,SurfaceIndex DownscaleBuf,vector<uint,2> reserved1,unsigned int MBFlatnessThreshold,unsigned int Enablers,unsigned int reserved2,SurfaceIndex MB_VProc_Stats)112 extern "C" _GENX_MAIN_ void hme_frame_downscale(
113 unsigned short width,
114 unsigned short height,
115 SurfaceIndex InputBuf,
116 SurfaceIndex DownscaleBuf,
117 vector<uint, 2> reserved1,
118 unsigned int MBFlatnessThreshold,
119 unsigned int Enablers,
120 unsigned int reserved2,
121 SurfaceIndex MB_VProc_Stats)
122 {
123 matrix<uchar, 32, 32> in;
124 matrix<uchar, 8, 8> downscale_out;
125 matrix<uint, 2, 2> flatness_out;
126
127 matrix<ushort, 16,16> temp1;
128 matrix<ushort, 8, 8> temp2;
129 matrix<ushort, 4, 4> temp3;
130 vector<int, 8> mask;
131 vector<ushort, 1> replicate;
132
133 matrix<ushort,32,32> squared;
134 vector<uint, 32> temp_sum;
135 vector<uint, 32> temp_SumOfSquared;
136 matrix<uint, 2, 2> sum;
137 matrix<uint, 2, 2> pixAvg;
138 matrix<uint, 2, 2> SumOfSquared;
139 matrix_ref<uint,2, 2>VarianceLuma = SumOfSquared;
140
141 matrix<uint, 4, 4> sum_8x8;
142 matrix<uint, 4, 4> pixAvg_8x8;
143 matrix<uint, 4, 4> SumOfSquared_8x8;
144 matrix_ref<uint, 4, 4>VarianceLuma_8x8 = SumOfSquared_8x8;
145
146 replicate[0] = 0;
147
148 ushort h_pos = get_thread_origin_x();
149 ushort v_pos = get_thread_origin_y();
150
151 ushort h_pos_write = h_pos << 3;
152 ushort v_pos_write = v_pos << 3;
153
154 ushort h_pos_read = h_pos_write;
155 vector<ushort, 1> v_pos_read = v_pos_write;
156
157 vector<ushort, 1> max_dst_row = v_pos_write + 8;
158 vector<ushort, 1> max_real_dst_row = (height >> 2);
159
160 replicate.merge((max_dst_row - max_real_dst_row), (max_dst_row > max_real_dst_row));
161
162 v_pos_read.merge((max_real_dst_row -1), v_pos_write >= max_real_dst_row);
163 replicate.merge(7, v_pos_write >= max_real_dst_row);
164
165 v_pos_read[0] = v_pos_read[0] <<2;
166
167 read(InputBuf, h_pos_read << 2, v_pos_read[0], in.select<8,1,32,1>(0, 0));
168 read(InputBuf, h_pos_read << 2, v_pos_read[0] + 8, in.select<8,1,32,1>(8, 0));
169 read(InputBuf, h_pos_read << 2, v_pos_read[0] + 16, in.select<8,1,32,1>(16,0));
170 read(InputBuf, h_pos_read << 2, v_pos_read[0] + 24, in.select<8,1,32,1>(24,0));
171
172 #pragma unroll
173 for(uint i=0; i<2; i++) {
174 #pragma unroll
175 for(uint j=0; j<2; j++)
176 {
177 temp1.select<8,2,16,1>(0,0) = in.select<8,2,16,1>(i<<4,j<<4) + in.select<8,2,16,1>( (i<<4)+1, j<<4 );
178 temp2.select<8,1, 8,1>(0,0) = temp1.select<8,2,8,2>(0,0) + temp1.select<8,2,8,2>(0,1);
179
180 temp2.select<8,1,8,1>(0,0) += 2;
181 temp2.select<8,1,8,1>(0,0) = temp2.select<8,1,8,1>(0,0) >> 2;
182
183 temp2.select<4,2,8,1>(0,0) = temp2.select<4,2,8,1>(0,0) + temp2.select<4,2,8,1>(1,0);
184 temp3.select<4,1,4,1>(0,0) = temp2.select<4,2,4,2>(0,0) + temp2.select<4,2,4,2>(0,1);
185
186 temp3.select<4,1,4,1>(0,0) += 2;
187 temp3.select<4,1,4,1>(0,0) = temp3.select<4,1,4,1>(0,0) >> 2;
188
189 downscale_out.select<4,1,4,1>(i<<2,j<<2) = temp3.select<4,1,4,1>(0,0);
190 }
191 }
192
193 if ( ! replicate[0] )
194 {
195 write(DownscaleBuf, h_pos_write, v_pos_write, downscale_out);
196 }
197 else
198 {
199 matrix<uchar, 1, 8> last_line = downscale_out.select<1,1,8,1>(7 - replicate[0], 0);
200
201 #pragma unroll
202 for(uint i=1; i<8; i++)
203 {
204 mask = i > 7 - replicate[0];
205 downscale_out.select<1,1,8,1>(i, 0).merge(last_line, mask);
206 }
207
208 write(DownscaleBuf, h_pos_write, v_pos_write, downscale_out);
209 }
210
211 unsigned short EnableMBFlatnessCheck = Enablers & 0x1;
212 unsigned short EnableVarianceOut = Enablers & 0x2;
213 unsigned short EnablePixAvgOut = Enablers & 0x4;
214 unsigned short Enable_8x8_Stats = Enablers & 0x8;
215
216 if (Enablers != 0)
217 {
218 squared = in * in;
219
220 if (!Enable_8x8_Stats)
221 {
222 sum(0,0) = cm_sum<uint>(in.select<16,1,16,1>(0,0), SAT);
223 SumOfSquared(0,0) = cm_sum<uint>(squared.select<16,1,16,1>(0,0), SAT);
224
225 sum(0,1) = cm_sum<uint>(in.select<16, 1, 16, 1>(0, 16), SAT);
226 SumOfSquared(0,1) = cm_sum<uint>(squared.select<16, 1, 16, 1>(0, 16), SAT);
227
228 sum(1,0) = cm_sum<uint>(in.select<16,1,16,1>(16,0), SAT);
229 SumOfSquared(1,0) = cm_sum<uint>(squared.select<16,1,16,1>(16,0), SAT);
230
231 sum(1,1) = cm_sum<uint>(in.select<16,1,16,1>(16,16), SAT);
232 SumOfSquared(1,1) = cm_sum<uint>(squared.select<16,1,16,1>(16,16), SAT);
233
234 pixAvg = sum >> 8;
235 sum *= sum;
236 sum >>= 8;
237
238 VarianceLuma = SumOfSquared - sum;
239 VarianceLuma >>= 8;
240
241 flatness_out = (VarianceLuma < MBFlatnessThreshold);
242 }
243 else
244 {
245 sum_8x8(0,0) = cm_sum<uint>(in.select<8,1,8,1>(0,0) , SAT);
246 SumOfSquared_8x8(0,0) = cm_sum<uint>(squared.select<8,1,8,1>(0,0) , SAT);
247 sum_8x8(0,1) = cm_sum<uint>(in.select<8,1,8,1>(0,8) , SAT);
248 SumOfSquared_8x8(0,1) = cm_sum<uint>(squared.select<8,1,8,1>(0,8) , SAT);
249 sum_8x8(0,2) = cm_sum<uint>(in.select<8,1,8,1>(8,0) , SAT);
250 SumOfSquared_8x8(0,2) = cm_sum<uint>(squared.select<8,1,8,1>(8,0) , SAT);
251 sum_8x8(0,3) = cm_sum<uint>(in.select<8,1,8,1>(8,8) , SAT);
252 SumOfSquared_8x8(0,3) = cm_sum<uint>(squared.select<8,1,8,1>(8,8) , SAT);
253
254 sum(0,0) = cm_sum<uint>(sum_8x8.select<1,1,4,1>(0,0), SAT);
255 SumOfSquared(0,0) = cm_sum<uint>(SumOfSquared_8x8.select<1,1,4,1>(0,0), SAT);
256
257 sum_8x8(1,0) = cm_sum<uint>(in.select<8,1,8,1>(0,16) , SAT);
258 SumOfSquared_8x8(1,0) = cm_sum<uint>(squared.select<8,1,8,1>(0,16) , SAT);
259 sum_8x8(1,1) = cm_sum<uint>(in.select<8,1,8,1>(0,24) , SAT);
260 SumOfSquared_8x8(1,1) = cm_sum<uint>(squared.select<8,1,8,1>(0,24) , SAT);
261 sum_8x8(1,2) = cm_sum<uint>(in.select<8,1,8,1>(8,16) , SAT);
262 SumOfSquared_8x8(1,2) = cm_sum<uint>(squared.select<8,1,8,1>(8,16) , SAT);
263 sum_8x8(1,3) = cm_sum<uint>(in.select<8,1,8,1>(8,24) , SAT);
264 SumOfSquared_8x8(1,3) = cm_sum<uint>(squared.select<8,1,8,1>(8,24) , SAT);
265
266 sum(0,1) = cm_sum<uint>(sum_8x8.select<1,1,4,1>(1,0), SAT);
267 SumOfSquared(0,1) = cm_sum<uint>(SumOfSquared_8x8.select<1,1,4,1>(1,0), SAT);
268
269 sum_8x8(2,0) = cm_sum<uint>(in.select<8,1,8,1>(16,0) , SAT);
270 SumOfSquared_8x8(2,0) = cm_sum<uint>(squared.select<8,1,8,1>(16,0) , SAT);
271 sum_8x8(2,1) = cm_sum<uint>(in.select<8,1,8,1>(16,8) , SAT);
272 SumOfSquared_8x8(2,1) = cm_sum<uint>(squared.select<8,1,8,1>(16,8) , SAT);
273 sum_8x8(2,2) = cm_sum<uint>(in.select<8,1,8,1>(24,0) , SAT);
274 SumOfSquared_8x8(2,2) = cm_sum<uint>(squared.select<8,1,8,1>(24,0) , SAT);
275 sum_8x8(2,3) = cm_sum<uint>(in.select<8,1,8,1>(24,8) , SAT);
276 SumOfSquared_8x8(2,3) = cm_sum<uint>(squared.select<8,1,8,1>(24,8) , SAT);
277
278 sum(1,0) = cm_sum<uint>(sum_8x8.select<1,1,4,1>(2,0), SAT);
279 SumOfSquared(1,0) = cm_sum<uint>(SumOfSquared_8x8.select<1,1,4,1>(2,0), SAT);
280
281 sum_8x8(3,0) = cm_sum<uint>(in.select<8,1,8,1>(16,16) , SAT);
282 SumOfSquared_8x8(3,0) = cm_sum<uint>(squared.select<8,1,8,1>(16,16) , SAT);
283 sum_8x8(3,1) = cm_sum<uint>(in.select<8,1,8,1>(16,24) , SAT);
284 SumOfSquared_8x8(3,1) = cm_sum<uint>(squared.select<8,1,8,1>(16,24) , SAT);
285 sum_8x8(3,2) = cm_sum<uint>(in.select<8,1,8,1>(24,16) , SAT);
286 SumOfSquared_8x8(3,2) = cm_sum<uint>(squared.select<8,1,8,1>(24,16) , SAT);
287 sum_8x8(3,3) = cm_sum<uint>(in.select<8,1,8,1>(24,24) , SAT);
288 SumOfSquared_8x8(3,3) = cm_sum<uint>(squared.select<8,1,8,1>(24,24) , SAT);
289
290 sum(1,1) = cm_sum<uint>(sum_8x8.select<1,1,4,1>(3,0), SAT);
291 SumOfSquared(1,1) = cm_sum<uint>(SumOfSquared_8x8.select<1,1,4,1>(3,0), SAT);
292
293 pixAvg = sum >> 8;
294 sum *= sum;
295 sum >>= 8;
296
297 VarianceLuma = SumOfSquared - sum;
298 VarianceLuma >>= 8;
299 flatness_out = (VarianceLuma < MBFlatnessThreshold);
300
301 pixAvg_8x8 = sum_8x8 >> 6;
302 sum_8x8 *= sum_8x8;
303 sum_8x8 >>= 6;
304
305 VarianceLuma_8x8 = SumOfSquared_8x8 - sum_8x8;
306 VarianceLuma_8x8 >>= 6;
307 }
308
309 unsigned short NumMBperRow = width / 16;
310 NumMBperRow += ((width % 16)> 0);
311
312 unsigned int offset_vp = (2 * NUM_OF_DWORDS_PER_MB * 4 * (v_pos * NumMBperRow + h_pos));
313 unsigned int offset2_vp = offset_vp + NumMBperRow * NUM_OF_BYTES_PER_MB;
314
315 vector<uint, 44> writeVProc(0);
316 vector<uint, 8> Element_Offset(Stat_offsets);
317 vector<uint, 8> Element_Offset_(Stat_offsets_);
318 vector<uint, 16> Element_Offset_2(Stat_offsets_2);
319 vector<uint, 8> Element_Offset_2_(Stat_offsets_2_);
320
321 offset_vp >>= 2;
322 offset2_vp >>= 2;
323
324 if(EnableMBFlatnessCheck)
325 {
326 writeVProc.select<4,11>(0) = flatness_out.select_all();
327 }
328
329 if(EnableVarianceOut)
330 {
331 writeVProc.select<2,11>(1) = VarianceLuma.row(0);
332 writeVProc.select<2,11>(23) = VarianceLuma.row(1);
333
334 if(Enable_8x8_Stats)
335 {
336 writeVProc.select<4,1>(2) = VarianceLuma_8x8.row(0);
337 writeVProc.select<4,1>(13) = VarianceLuma_8x8.row(1);
338 writeVProc.select<4,1>(24) = VarianceLuma_8x8.row(2);
339 writeVProc.select<4,1>(35) = VarianceLuma_8x8.row(3);
340 }
341 }
342
343 if(EnablePixAvgOut)
344 {
345 writeVProc.select<2,11>(6) = pixAvg.row(0);
346 writeVProc.select<2,11>(28) = pixAvg.row(1);
347
348 if(Enable_8x8_Stats)
349 {
350 writeVProc.select<4,1>(7) = pixAvg_8x8.row(0);
351 writeVProc.select<4,1>(18) = pixAvg_8x8.row(1);
352 writeVProc.select<4,1>(29) = pixAvg_8x8.row(2);
353 writeVProc.select<4,1>(40) = pixAvg_8x8.row(3);
354 }
355 }
356
357 if(EnableVarianceOut || EnablePixAvgOut)
358 {
359 if((h_pos * 32 < width) && (v_pos * 32 < height))
360 {
361 if((h_pos * 32)+ 16 >= width)
362 {
363 write(MB_VProc_Stats, offset_vp, Element_Offset, writeVProc.select<8,1>(0));
364 write(MB_VProc_Stats, offset_vp, Element_Offset_, writeVProc.select<8,1>(3));
365 }
366 else
367 {
368 write(MB_VProc_Stats, offset_vp, Element_Offset_2, writeVProc.select<16,1>(0));
369 write(MB_VProc_Stats, offset_vp, Element_Offset_2_, writeVProc.select<8,1>(14));
370 }
371
372 if ((v_pos * 32) + 16 < height)
373 {
374 if ((h_pos * 32) + 16 >= width)
375 {
376 write(MB_VProc_Stats, offset2_vp, Element_Offset, writeVProc.select<8, 1>(22));
377 write(MB_VProc_Stats, offset2_vp, Element_Offset_, writeVProc.select<8, 1>(25));
378 }
379 else
380 {
381 write(MB_VProc_Stats, offset2_vp, Element_Offset_2, writeVProc.select<16, 1>(22));
382 write(MB_VProc_Stats, offset2_vp, Element_Offset_2_, writeVProc.select<8, 1>(36));
383 }
384 }
385 }
386 }
387 }
388 }
389
390 extern "C" _GENX_MAIN_ void
hme_frame_downscale2(unsigned short width,unsigned short height,vector<uint,7> reserved,SurfaceIndex ibuf,SurfaceIndex obuf)391 hme_frame_downscale2(unsigned short width,
392 unsigned short height,
393 vector<uint, 7> reserved,
394 SurfaceIndex ibuf,
395 SurfaceIndex obuf)
396 {
397 matrix<uchar, 32, 32> in;
398 matrix<ushort, 16, 16> temp1;
399 matrix<ushort, 8, 8> temp2;
400 matrix<ushort, 4, 4> temp3;
401 matrix<uchar, 16, 16> out;
402 vector<int, 16> mask;
403
404 vector<ushort, 1> replicate;
405 replicate[0] = 0;
406
407 ushort h_pos_write = get_thread_origin_x() << 4;
408 ushort v_pos_write = get_thread_origin_y() << 4;
409 ushort h_pos_read = h_pos_write;
410
411 vector<ushort, 1> v_pos_read = v_pos_write;
412 vector<ushort, 1> max_dst_row = v_pos_write + 16;
413 vector<ushort, 1> max_real_dst_row = (height >> 1);
414
415 replicate.merge((max_dst_row - max_real_dst_row), (max_dst_row > max_real_dst_row));
416
417 v_pos_read.merge(max_real_dst_row - 1, v_pos_write >= max_real_dst_row);
418 replicate.merge(15, v_pos_write >= max_real_dst_row);
419
420 read(ibuf, h_pos_read << 1, v_pos_read[0] << 1, in.select<8, 1, 32, 1>(0, 0));
421 read(ibuf, h_pos_read << 1, (v_pos_read[0] << 1) + 8, in.select<8, 1, 32, 1>(8, 0));
422 read(ibuf, h_pos_read << 1, (v_pos_read[0] << 1) + 16, in.select<8, 1, 32, 1>(16,0));
423 read(ibuf, h_pos_read << 1, (v_pos_read[0] << 1) + 24, in.select<8, 1, 32, 1>(24,0));
424
425 #pragma unroll
426 for (uint i = 0; i<2; i++)
427 {
428 #pragma unroll
429 for (uint j = 0; j<2; j++)
430 {
431 temp1.select<8, 2,16, 1>(0, 0) = in.select<8, 2,16, 1>(i << 4, j << 4) + in.select<8, 2, 16, 1>((i << 4) + 1, j << 4);
432 temp2.select<8, 1, 8, 1>(0, 0) = temp1.select<8, 2, 8, 2>(0, 0) + temp1.select<8, 2, 8, 2>(0, 1);
433
434 temp2.select<8, 1, 8, 1>(0, 0) += 2;
435 temp2.select<8, 1, 8, 1>(0, 0) = temp2.select<8, 1, 8, 1>(0, 0) >> 2;
436
437 out.select<8, 1, 8, 1>(i << 3, j << 3) = temp2.select<8, 1, 8, 1>(0, 0);
438 }
439 }
440
441 if (!replicate[0])
442 {
443 write(obuf, h_pos_write, v_pos_write, out);
444 }
445 else
446 {
447 matrix<uchar, 1, 16> last_line = out.select<1, 1, 16, 1>(15 - replicate[0], 0);
448
449 #pragma unroll
450 for (uint i = 1; i < 16; i++)
451 {
452 mask = i > 15 - replicate[0];
453 out.select<1, 1, 16, 1>(i, 0).merge(last_line, mask);
454 }
455
456 write(obuf, h_pos_write, v_pos_write, out);
457 }
458 }
459