/*
* Copyright (c) 2019, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

23 #include <cm/cm.h>
24 
25 #define NUM_OF_DWORDS_PER_MB (16)
26 #define NUM_OF_BYTES_PER_MB  (64)
27 
28 const uint Stat_offsets[8]         = {5,6,7,8,9,10,11,12};
29 const uint Stat_offsets_[8]       = {8,9,10,11,12,13,14,15};
30 const uint Stat_offsets_2[16] = {5,6,7,8,9,10,11,12,13,14,15,21,22,23,24,25};
31 const uint Stat_offsets_2_[8] = {24,25,26,27,28,29,30,31};
32 
33 _GENX_ void
check_flatness(uint MBFlatnessThreshold,matrix_ref<uchar,16,32> field_in,matrix_ref<uint,1,2> field_flatness,matrix_ref<uint,1,2> VarianceLuma,matrix_ref<uint,1,2> PixAvg,matrix_ref<uint,2,4> VarianceLuma_8x8,matrix_ref<uint,2,4> PixAvg_8x8,unsigned short Enable_8x8)34 inline check_flatness(uint MBFlatnessThreshold,
35                       matrix_ref<uchar,16,32> field_in,
36                       matrix_ref<uint,  1, 2> field_flatness,
37                       matrix_ref<uint,  1, 2> VarianceLuma,
38                       matrix_ref <uint, 1, 2> PixAvg,
39                       matrix_ref<uint,  2, 4> VarianceLuma_8x8,
40                       matrix_ref <uint, 2, 4> PixAvg_8x8,
41                       unsigned short Enable_8x8)
42 {
43     matrix <ushort, 16, 32> squared;
44     vector <uint, 32>       temp_sum;
45     vector <uint, 32>       temp_SumOfSquared;
46     matrix <uint, 1, 2>     sum;
47     matrix <uint, 1, 2>     SumOfSquared;
48     matrix <uint, 2, 4>     sum_8x8;
49     matrix <uint, 2, 4>     SumOfSquared_8x8;
50 
51     squared = field_in * field_in;
52 
53     if (!Enable_8x8)
54     {
55         sum(0,0)          = cm_sum<uint>(field_in.select<16,1,16,1>(0,0),  SAT);
56         SumOfSquared(0,0) = cm_sum<uint>(squared.select<16,1,16,1>(0,0),   SAT);
57 
58         sum(0,1)          = cm_sum<uint>(field_in.select<16,1,16,1>(0,16), SAT);
59         SumOfSquared(0,1) = cm_sum<uint>(squared.select<16,1,16,1>(0,16),  SAT);
60 
61         PixAvg = sum >> 8;
62         sum   *= sum;
63         sum  >>= 8;
64 
65         VarianceLuma = SumOfSquared - sum;
66         VarianceLuma >>= 8;
67 
68         field_flatness = (VarianceLuma < MBFlatnessThreshold);
69     }
70     else
71     {
72         sum_8x8(0, 0)          = cm_sum<uint>(field_in.select<8, 1, 8, 1>(0, 0) , SAT);
73         SumOfSquared_8x8(0, 0) = cm_sum<uint>(squared.select<8,  1, 8, 1>(0, 0) , SAT);
74         sum_8x8(0, 1)          = cm_sum<uint>(field_in.select<8, 1, 8, 1>(0, 8) , SAT);
75         SumOfSquared_8x8(0, 1) = cm_sum<uint>(squared.select<8,  1, 8, 1>(0, 8) , SAT);
76         sum_8x8(0, 2)          = cm_sum<uint>(field_in.select<8, 1, 8, 1>(8, 0) , SAT);
77         SumOfSquared_8x8(0, 2) = cm_sum<uint>(squared.select<8,  1, 8, 1>(8, 0) , SAT);
78         sum_8x8(0, 3)          = cm_sum<uint>(field_in.select<8, 1, 8, 1>(8, 8) , SAT);
79         SumOfSquared_8x8(0, 3) = cm_sum<uint>(squared.select<8,  1, 8, 1>(8, 8) , SAT);
80 
81         sum(0, 0)             = cm_sum<uint>(sum_8x8.select<1, 1, 4, 1>(0, 0), SAT);
82         SumOfSquared(0, 0)    = cm_sum<uint>(SumOfSquared_8x8.select<1, 1, 4, 1>(0, 0), SAT);
83         sum_8x8(1, 0)         = cm_sum<uint>(field_in.select<8, 1, 8, 1>(0, 16), SAT);
84         SumOfSquared_8x8(1, 0)= cm_sum<uint>(squared.select<8,  1, 8, 1>(0, 16), SAT);
85         sum_8x8(1, 1)         = cm_sum<uint>(field_in.select<8, 1, 8, 1>(0, 24), SAT);
86         SumOfSquared_8x8(1, 1)= cm_sum<uint>(squared.select<8,  1, 8, 1>(0, 24), SAT);
87         sum_8x8(1, 2)         = cm_sum<uint>(field_in.select<8, 1, 8, 1>(8, 16), SAT);
88         SumOfSquared_8x8(1, 2)= cm_sum<uint>(squared.select<8,  1, 8, 1>(8, 16), SAT);
89         sum_8x8(1, 3)         = cm_sum<uint>(field_in.select<8, 1, 8, 1>(8, 24), SAT);
90         SumOfSquared_8x8(1, 3)= cm_sum<uint>(squared.select<8,  1, 8, 1>(8, 24), SAT);
91 
92         sum(0, 1) = cm_sum<uint>(sum_8x8.select<1, 1, 4, 1>(1, 0), SAT);
93         SumOfSquared(0, 1) = cm_sum<uint>(SumOfSquared_8x8.select<1, 1, 4, 1>(1, 0), SAT);
94 
95         PixAvg = sum >> 8;
96         sum   *= sum;
97         sum  >>= 8;
98 
99         VarianceLuma   = SumOfSquared - sum;
100         VarianceLuma >>= 8;
101         field_flatness = (VarianceLuma < MBFlatnessThreshold);
102 
103         PixAvg_8x8 = sum_8x8 >> 6;
104         sum_8x8   *= sum_8x8;
105         sum_8x8  >>= 6;
106 
107         VarianceLuma_8x8   = SumOfSquared_8x8 - sum_8x8;
108         VarianceLuma_8x8 >>= 6;
109     }
110 }
111 
hme_frame_downscale(unsigned short width,unsigned short height,SurfaceIndex InputBuf,SurfaceIndex DownscaleBuf,vector<uint,2> reserved1,unsigned int MBFlatnessThreshold,unsigned int Enablers,unsigned int reserved2,SurfaceIndex MB_VProc_Stats)112 extern "C" _GENX_MAIN_ void hme_frame_downscale(
113     unsigned short  width,
114     unsigned short  height,
115     SurfaceIndex    InputBuf,
116     SurfaceIndex    DownscaleBuf,
117     vector<uint, 2> reserved1,
118     unsigned int    MBFlatnessThreshold,
119     unsigned int    Enablers,
120     unsigned int    reserved2,
121     SurfaceIndex    MB_VProc_Stats)
122 {
123     matrix<uchar, 32, 32> in;
124     matrix<uchar,  8,  8> downscale_out;
125     matrix<uint,   2,  2> flatness_out;
126 
127     matrix<ushort, 16,16> temp1;
128     matrix<ushort,  8, 8> temp2;
129     matrix<ushort,  4, 4> temp3;
130     vector<int,       8>  mask;
131     vector<ushort,    1>  replicate;
132 
133     matrix<ushort,32,32> squared;
134     vector<uint,     32> temp_sum;
135     vector<uint,     32> temp_SumOfSquared;
136     matrix<uint,   2, 2> sum;
137     matrix<uint,   2, 2> pixAvg;
138     matrix<uint,   2, 2> SumOfSquared;
139     matrix_ref<uint,2, 2>VarianceLuma = SumOfSquared;
140 
141     matrix<uint, 4, 4> sum_8x8;
142     matrix<uint, 4, 4> pixAvg_8x8;
143     matrix<uint, 4, 4> SumOfSquared_8x8;
144     matrix_ref<uint, 4, 4>VarianceLuma_8x8 = SumOfSquared_8x8;
145 
146     replicate[0] = 0;
147 
148     ushort h_pos = get_thread_origin_x();
149     ushort v_pos = get_thread_origin_y();
150 
151     ushort h_pos_write = h_pos << 3;
152     ushort v_pos_write = v_pos << 3;
153 
154     ushort h_pos_read            = h_pos_write;
155     vector<ushort, 1> v_pos_read = v_pos_write;
156 
157     vector<ushort, 1> max_dst_row      = v_pos_write + 8;
158     vector<ushort, 1> max_real_dst_row = (height >> 2);
159 
160     replicate.merge((max_dst_row - max_real_dst_row), (max_dst_row > max_real_dst_row));
161 
162     v_pos_read.merge((max_real_dst_row -1), v_pos_write >= max_real_dst_row);
163     replicate.merge(7, v_pos_write >= max_real_dst_row);
164 
165     v_pos_read[0] =  v_pos_read[0] <<2;
166 
167     read(InputBuf, h_pos_read << 2, v_pos_read[0],      in.select<8,1,32,1>(0, 0));
168     read(InputBuf, h_pos_read << 2, v_pos_read[0] + 8,  in.select<8,1,32,1>(8, 0));
169     read(InputBuf, h_pos_read << 2, v_pos_read[0] + 16, in.select<8,1,32,1>(16,0));
170     read(InputBuf, h_pos_read << 2, v_pos_read[0] + 24, in.select<8,1,32,1>(24,0));
171 
172     #pragma unroll
173     for(uint i=0; i<2; i++) {
174     #pragma unroll
175         for(uint j=0; j<2; j++)
176         {
177             temp1.select<8,2,16,1>(0,0) = in.select<8,2,16,1>(i<<4,j<<4) + in.select<8,2,16,1>( (i<<4)+1, j<<4 );
178             temp2.select<8,1, 8,1>(0,0) = temp1.select<8,2,8,2>(0,0)     + temp1.select<8,2,8,2>(0,1);
179 
180             temp2.select<8,1,8,1>(0,0) += 2;
181             temp2.select<8,1,8,1>(0,0)  = temp2.select<8,1,8,1>(0,0) >> 2;
182 
183             temp2.select<4,2,8,1>(0,0)  = temp2.select<4,2,8,1>(0,0) + temp2.select<4,2,8,1>(1,0);
184             temp3.select<4,1,4,1>(0,0)  = temp2.select<4,2,4,2>(0,0) + temp2.select<4,2,4,2>(0,1);
185 
186             temp3.select<4,1,4,1>(0,0) += 2;
187             temp3.select<4,1,4,1>(0,0)  = temp3.select<4,1,4,1>(0,0) >> 2;
188 
189             downscale_out.select<4,1,4,1>(i<<2,j<<2) = temp3.select<4,1,4,1>(0,0);
190         }
191     }
192 
193     if ( ! replicate[0] )
194     {
195         write(DownscaleBuf, h_pos_write, v_pos_write, downscale_out);
196     }
197     else
198     {
199         matrix<uchar, 1, 8> last_line = downscale_out.select<1,1,8,1>(7 - replicate[0], 0);
200 
201         #pragma unroll
202         for(uint i=1; i<8; i++)
203         {
204             mask = i > 7 - replicate[0];
205             downscale_out.select<1,1,8,1>(i, 0).merge(last_line, mask);
206         }
207 
208         write(DownscaleBuf, h_pos_write, v_pos_write, downscale_out);
209     }
210 
211     unsigned short EnableMBFlatnessCheck = Enablers & 0x1;
212     unsigned short EnableVarianceOut     = Enablers & 0x2;
213     unsigned short EnablePixAvgOut       = Enablers & 0x4;
214     unsigned short Enable_8x8_Stats      = Enablers & 0x8;
215 
216     if (Enablers != 0)
217     {
218         squared = in * in;
219 
220         if (!Enable_8x8_Stats)
221         {
222             sum(0,0)          = cm_sum<uint>(in.select<16,1,16,1>(0,0), SAT);
223             SumOfSquared(0,0) = cm_sum<uint>(squared.select<16,1,16,1>(0,0), SAT);
224 
225             sum(0,1)          = cm_sum<uint>(in.select<16, 1, 16, 1>(0, 16), SAT);
226             SumOfSquared(0,1) = cm_sum<uint>(squared.select<16, 1, 16, 1>(0, 16), SAT);
227 
228             sum(1,0)          = cm_sum<uint>(in.select<16,1,16,1>(16,0), SAT);
229             SumOfSquared(1,0) = cm_sum<uint>(squared.select<16,1,16,1>(16,0), SAT);
230 
231             sum(1,1)          = cm_sum<uint>(in.select<16,1,16,1>(16,16), SAT);
232             SumOfSquared(1,1) = cm_sum<uint>(squared.select<16,1,16,1>(16,16), SAT);
233 
234             pixAvg = sum >> 8;
235             sum   *= sum;
236             sum  >>= 8;
237 
238             VarianceLuma   = SumOfSquared - sum;
239             VarianceLuma >>= 8;
240 
241             flatness_out   = (VarianceLuma < MBFlatnessThreshold);
242         }
243         else
244         {
245             sum_8x8(0,0)          = cm_sum<uint>(in.select<8,1,8,1>(0,0) , SAT);
246             SumOfSquared_8x8(0,0) = cm_sum<uint>(squared.select<8,1,8,1>(0,0) , SAT);
247             sum_8x8(0,1)          = cm_sum<uint>(in.select<8,1,8,1>(0,8) , SAT);
248             SumOfSquared_8x8(0,1) = cm_sum<uint>(squared.select<8,1,8,1>(0,8) , SAT);
249             sum_8x8(0,2)          = cm_sum<uint>(in.select<8,1,8,1>(8,0) , SAT);
250             SumOfSquared_8x8(0,2) = cm_sum<uint>(squared.select<8,1,8,1>(8,0) , SAT);
251             sum_8x8(0,3)          = cm_sum<uint>(in.select<8,1,8,1>(8,8) , SAT);
252             SumOfSquared_8x8(0,3) = cm_sum<uint>(squared.select<8,1,8,1>(8,8) , SAT);
253 
254             sum(0,0)              = cm_sum<uint>(sum_8x8.select<1,1,4,1>(0,0), SAT);
255             SumOfSquared(0,0)     = cm_sum<uint>(SumOfSquared_8x8.select<1,1,4,1>(0,0), SAT);
256 
257             sum_8x8(1,0)          = cm_sum<uint>(in.select<8,1,8,1>(0,16) , SAT);
258             SumOfSquared_8x8(1,0) = cm_sum<uint>(squared.select<8,1,8,1>(0,16) , SAT);
259             sum_8x8(1,1)          = cm_sum<uint>(in.select<8,1,8,1>(0,24) , SAT);
260             SumOfSquared_8x8(1,1) = cm_sum<uint>(squared.select<8,1,8,1>(0,24) , SAT);
261             sum_8x8(1,2)          = cm_sum<uint>(in.select<8,1,8,1>(8,16) , SAT);
262             SumOfSquared_8x8(1,2) = cm_sum<uint>(squared.select<8,1,8,1>(8,16) , SAT);
263             sum_8x8(1,3)          = cm_sum<uint>(in.select<8,1,8,1>(8,24) , SAT);
264             SumOfSquared_8x8(1,3) = cm_sum<uint>(squared.select<8,1,8,1>(8,24) , SAT);
265 
266             sum(0,1)              = cm_sum<uint>(sum_8x8.select<1,1,4,1>(1,0), SAT);
267             SumOfSquared(0,1)     = cm_sum<uint>(SumOfSquared_8x8.select<1,1,4,1>(1,0), SAT);
268 
269             sum_8x8(2,0)          = cm_sum<uint>(in.select<8,1,8,1>(16,0) , SAT);
270             SumOfSquared_8x8(2,0) = cm_sum<uint>(squared.select<8,1,8,1>(16,0) , SAT);
271             sum_8x8(2,1)          = cm_sum<uint>(in.select<8,1,8,1>(16,8) , SAT);
272             SumOfSquared_8x8(2,1) = cm_sum<uint>(squared.select<8,1,8,1>(16,8) , SAT);
273             sum_8x8(2,2)          = cm_sum<uint>(in.select<8,1,8,1>(24,0) , SAT);
274             SumOfSquared_8x8(2,2) = cm_sum<uint>(squared.select<8,1,8,1>(24,0) , SAT);
275             sum_8x8(2,3)          = cm_sum<uint>(in.select<8,1,8,1>(24,8) , SAT);
276             SumOfSquared_8x8(2,3) = cm_sum<uint>(squared.select<8,1,8,1>(24,8) , SAT);
277 
278             sum(1,0)              = cm_sum<uint>(sum_8x8.select<1,1,4,1>(2,0), SAT);
279             SumOfSquared(1,0)     = cm_sum<uint>(SumOfSquared_8x8.select<1,1,4,1>(2,0), SAT);
280 
281             sum_8x8(3,0)          = cm_sum<uint>(in.select<8,1,8,1>(16,16) , SAT);
282             SumOfSquared_8x8(3,0) = cm_sum<uint>(squared.select<8,1,8,1>(16,16) , SAT);
283             sum_8x8(3,1)          = cm_sum<uint>(in.select<8,1,8,1>(16,24) , SAT);
284             SumOfSquared_8x8(3,1) = cm_sum<uint>(squared.select<8,1,8,1>(16,24) , SAT);
285             sum_8x8(3,2)          = cm_sum<uint>(in.select<8,1,8,1>(24,16) , SAT);
286             SumOfSquared_8x8(3,2) = cm_sum<uint>(squared.select<8,1,8,1>(24,16) , SAT);
287             sum_8x8(3,3)          = cm_sum<uint>(in.select<8,1,8,1>(24,24) , SAT);
288             SumOfSquared_8x8(3,3) = cm_sum<uint>(squared.select<8,1,8,1>(24,24) , SAT);
289 
290             sum(1,1)              = cm_sum<uint>(sum_8x8.select<1,1,4,1>(3,0), SAT);
291             SumOfSquared(1,1)     = cm_sum<uint>(SumOfSquared_8x8.select<1,1,4,1>(3,0), SAT);
292 
293             pixAvg = sum >> 8;
294             sum   *= sum;
295             sum  >>= 8;
296 
297             VarianceLuma   = SumOfSquared - sum;
298             VarianceLuma >>= 8;
299             flatness_out   = (VarianceLuma < MBFlatnessThreshold);
300 
301             pixAvg_8x8 = sum_8x8 >> 6;
302             sum_8x8   *= sum_8x8;
303             sum_8x8  >>= 6;
304 
305             VarianceLuma_8x8   = SumOfSquared_8x8 - sum_8x8;
306             VarianceLuma_8x8 >>= 6;
307         }
308 
309         unsigned short NumMBperRow = width / 16;
310         NumMBperRow               += ((width % 16)> 0);
311 
312         unsigned int offset_vp  = (2 * NUM_OF_DWORDS_PER_MB * 4 * (v_pos * NumMBperRow + h_pos));
313         unsigned int offset2_vp = offset_vp + NumMBperRow * NUM_OF_BYTES_PER_MB;
314 
315         vector<uint, 44> writeVProc(0);
316         vector<uint,  8> Element_Offset(Stat_offsets);
317         vector<uint,  8> Element_Offset_(Stat_offsets_);
318         vector<uint, 16> Element_Offset_2(Stat_offsets_2);
319         vector<uint,  8> Element_Offset_2_(Stat_offsets_2_);
320 
321         offset_vp  >>= 2;
322         offset2_vp >>= 2;
323 
324         if(EnableMBFlatnessCheck)
325         {
326             writeVProc.select<4,11>(0) = flatness_out.select_all();
327         }
328 
329         if(EnableVarianceOut)
330         {
331             writeVProc.select<2,11>(1)  = VarianceLuma.row(0);
332             writeVProc.select<2,11>(23) = VarianceLuma.row(1);
333 
334             if(Enable_8x8_Stats)
335             {
336                 writeVProc.select<4,1>(2)  = VarianceLuma_8x8.row(0);
337                 writeVProc.select<4,1>(13) = VarianceLuma_8x8.row(1);
338                 writeVProc.select<4,1>(24) = VarianceLuma_8x8.row(2);
339                 writeVProc.select<4,1>(35) = VarianceLuma_8x8.row(3);
340             }
341         }
342 
343         if(EnablePixAvgOut)
344         {
345             writeVProc.select<2,11>(6)  = pixAvg.row(0);
346             writeVProc.select<2,11>(28) = pixAvg.row(1);
347 
348             if(Enable_8x8_Stats)
349             {
350                 writeVProc.select<4,1>(7)  = pixAvg_8x8.row(0);
351                 writeVProc.select<4,1>(18) = pixAvg_8x8.row(1);
352                 writeVProc.select<4,1>(29) = pixAvg_8x8.row(2);
353                 writeVProc.select<4,1>(40) = pixAvg_8x8.row(3);
354             }
355         }
356 
357         if(EnableVarianceOut || EnablePixAvgOut)
358         {
359             if((h_pos * 32 < width) && (v_pos * 32 < height))
360             {
361                 if((h_pos * 32)+ 16 >= width)
362                 {
363                     write(MB_VProc_Stats, offset_vp, Element_Offset,  writeVProc.select<8,1>(0));
364                     write(MB_VProc_Stats, offset_vp, Element_Offset_, writeVProc.select<8,1>(3));
365                 }
366                 else
367                 {
368                     write(MB_VProc_Stats, offset_vp, Element_Offset_2,  writeVProc.select<16,1>(0));
369                     write(MB_VProc_Stats, offset_vp, Element_Offset_2_, writeVProc.select<8,1>(14));
370                 }
371 
372                 if ((v_pos * 32) + 16 < height)
373                 {
374                     if ((h_pos * 32) + 16 >= width)
375                     {
376                         write(MB_VProc_Stats, offset2_vp, Element_Offset,  writeVProc.select<8, 1>(22));
377                         write(MB_VProc_Stats, offset2_vp, Element_Offset_, writeVProc.select<8, 1>(25));
378                     }
379                     else
380                     {
381                         write(MB_VProc_Stats, offset2_vp, Element_Offset_2,  writeVProc.select<16, 1>(22));
382                         write(MB_VProc_Stats, offset2_vp, Element_Offset_2_, writeVProc.select<8, 1>(36));
383                     }
384                 }
385             }
386         }
387     }
388 }
389 
390 extern "C" _GENX_MAIN_ void
hme_frame_downscale2(unsigned short width,unsigned short height,vector<uint,7> reserved,SurfaceIndex ibuf,SurfaceIndex obuf)391 hme_frame_downscale2(unsigned short  width,
392                      unsigned short  height,
393                      vector<uint, 7> reserved,
394                      SurfaceIndex    ibuf,
395                      SurfaceIndex    obuf)
396 {
397     matrix<uchar,  32, 32> in;
398     matrix<ushort, 16, 16> temp1;
399     matrix<ushort,  8,  8> temp2;
400     matrix<ushort,  4,  4> temp3;
401     matrix<uchar,  16, 16> out;
402     vector<int,        16> mask;
403 
404     vector<ushort, 1> replicate;
405     replicate[0] = 0;
406 
407     ushort h_pos_write = get_thread_origin_x() << 4;
408     ushort v_pos_write = get_thread_origin_y() << 4;
409     ushort h_pos_read  = h_pos_write;
410 
411     vector<ushort, 1> v_pos_read       = v_pos_write;
412     vector<ushort, 1> max_dst_row      = v_pos_write + 16;
413     vector<ushort, 1> max_real_dst_row = (height >> 1);
414 
415     replicate.merge((max_dst_row - max_real_dst_row), (max_dst_row > max_real_dst_row));
416 
417     v_pos_read.merge(max_real_dst_row - 1, v_pos_write >= max_real_dst_row);
418     replicate.merge(15, v_pos_write >= max_real_dst_row);
419 
420     read(ibuf, h_pos_read << 1, v_pos_read[0]  << 1,       in.select<8, 1, 32, 1>(0, 0));
421     read(ibuf, h_pos_read << 1, (v_pos_read[0] << 1) + 8,  in.select<8, 1, 32, 1>(8, 0));
422     read(ibuf, h_pos_read << 1, (v_pos_read[0] << 1) + 16, in.select<8, 1, 32, 1>(16,0));
423     read(ibuf, h_pos_read << 1, (v_pos_read[0] << 1) + 24, in.select<8, 1, 32, 1>(24,0));
424 
425     #pragma unroll
426     for (uint i = 0; i<2; i++)
427     {
428         #pragma unroll
429         for (uint j = 0; j<2; j++)
430         {
431             temp1.select<8, 2,16, 1>(0, 0)  = in.select<8, 2,16, 1>(i << 4, j << 4) + in.select<8, 2, 16, 1>((i << 4) + 1, j << 4);
432             temp2.select<8, 1, 8, 1>(0, 0)  = temp1.select<8, 2, 8, 2>(0, 0) + temp1.select<8, 2, 8, 2>(0, 1);
433 
434             temp2.select<8, 1, 8, 1>(0, 0) += 2;
435             temp2.select<8, 1, 8, 1>(0, 0)  = temp2.select<8, 1, 8, 1>(0, 0) >> 2;
436 
437             out.select<8, 1, 8, 1>(i << 3, j << 3) = temp2.select<8, 1, 8, 1>(0, 0);
438         }
439     }
440 
441     if (!replicate[0])
442     {
443         write(obuf, h_pos_write, v_pos_write, out);
444     }
445     else
446     {
447         matrix<uchar, 1, 16> last_line = out.select<1, 1, 16, 1>(15 - replicate[0], 0);
448 
449         #pragma unroll
450         for (uint i = 1; i < 16; i++)
451         {
452             mask = i > 15 - replicate[0];
453             out.select<1, 1, 16, 1>(i, 0).merge(last_line, mask);
454         }
455 
456         write(obuf, h_pos_write, v_pos_write, out);
457     }
458 }
459