1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 
23 #include <cm/cm.h>
24 
// Stream-in record sizes. LCU64_STREAMIN_SIZE is defined as four
// STREAMIN_SIZE records. NOTE(review): units are presumably bytes - confirm.
#define STREAMIN_SIZE       64
#define LCU64_STREAMIN_SIZE (STREAMIN_SIZE * 4)
#define TILEINFO_SIZE       16

// LCU size selectors (32 vs. 64).
#define LCUSIZE32 0
#define LCUSIZE64 1

// Shift amounts applied in HME() when converting macroblock counts
// (SHIFT_MB_TO_SUB_MB) or pixel coordinates (SHIFT_PIXEL_TO_SUB_MB)
// to sub-macroblock rows of the MV surface.
#define SHIFT_MB_TO_SUB_MB    (2)
#define SHIFT_PIXEL_TO_SUB_MB (2)

// Number of bytes in Curbe data
#define CURBEDATA_SIZE 160

// Single-bit masks. HME() tests them against CURBEData(24):
// BIT5 -> EnableMVSum, BIT4 -> useMVPrevStep, BIT3 -> writeDistortions.
#define BIT5  0x20
#define BIT4  0x10
#define BIT3  0x08

//---------------------------------------------------------------------------
// Binding table indexes
//---------------------------------------------------------------------------
#define NUM_SURFACE_IDX 14
// Row count of the universal VME input matrices (UNIInput / UNIInput_tmp).
#define UNI              4
47 
//---------------------------------------------------------------------------
// Const vector for Streamin
//---------------------------------------------------------------------------
// For a flat index i in [0, 16): MBX_Indx[i] == i % 4 and
// MBaddressIndx[i] == i / 4, i.e. the column and row of entry i in a 4x4 grid.
const char MBX_Indx[16]      = {0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3};
const char MBaddressIndx[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3};

// HEVC variants: only the first four entries are given explicitly (column and
// row of entry i in a 2x2 grid); the remaining twelve elements are
// zero-initialized by aggregate initialization.
const char MBX_Indx_hevc[16]      = {0,1,0,1};
const char MBaddressIndx_hevc[16] = {0,0,1,1};
56 
// Source of the ROI map, going by the enumerator names.
// NOTE(review): HME() compares ROIMapEnable against the literal 3, which
// matches ROIMAP_FROM_HME_STATICREGION - confirm that this is the intent.
enum ROISOURCE
{
   ROIMAP_FROM_UNKNOWN          = 0,
   ROIMAP_FROM_APP              = 1,
   ROIMAP_FROM_APP_DIRTYRECT    = 2,
   ROIMAP_FROM_HME_STATICREGION = 3,
   ROIMAP_FROM_PIXELVAR         = 4
};
65 
66 
67 _GENX_  void
HME_SET_REF(vector_ref<uchar,CURBEDATA_SIZE> CURBEData,vector_ref<short,2> input_refine,vector_ref<short,2> output_refine,vector_ref<short,2> pos)68 inline HME_SET_REF(vector_ref<uchar, CURBEDATA_SIZE> CURBEData, vector_ref<short, 2> input_refine, vector_ref<short, 2> output_refine, vector_ref<short, 2> pos)
69 {
70     vector<short, 2> maxLens;
71 
72     maxLens(0) = 512;
73     maxLens(1) = CURBEData.format<short>()[13] >> 2;
74 
75     vector<short,  2> in_refine = input_refine >> 2;
76     vector<ushort, 2> search    = CURBEData.select<2,1>(22);
77     vector<ushort, 2> tmp       = search - 16;
78     vector<ushort, 2> widths    = tmp >> 1;
79 
80     vector<short,  2> pictureWidths;
81     pictureWidths(0) = CURBEData(18) << 4;
82     pictureWidths(1) = CURBEData(17) << 4;
83     pictureWidths(1) = pictureWidths(1) + 16;
84 
85     vector<short, 2> VME_params = in_refine;
86     VME_params                  = VME_params - widths;
87 
88     output_refine = VME_params + pos;
89     output_refine.merge(pos + maxLens - tmp, in_refine + tmp >   maxLens);
90     output_refine.merge(pos - maxLens,       in_refine - tmp < (-maxLens));
91 
92     output_refine.merge((pictureWidths - 1) & 0xFFFC, output_refine > (pictureWidths - 1));
93 
94     vector<short, 2> tmp_search = -search;
95     output_refine.merge((5 - search) & 0xFFFC,  output_refine <= tmp_search);
96     output_refine = output_refine - pos;
97 }
98 
99 // Main logic is in this function. The logic is shared by both P and B versions of the kernel.
100 _GENX_  void
HME(vector<uchar,CURBEDATA_SIZE> CURBEData,SurfaceIndex HME_MV_Data_Surface_index,SurfaceIndex HME_MV_Input_Data_Surface_index,SurfaceIndex DISTORTION_Surface,SurfaceIndex BRC_DISTORTION_Surface,SurfaceIndex Pred_Surface_L0,SurfaceIndex Pred_Surface_L1,SurfaceIndex StreamINSurface,SurfaceIndex StreamINSurface_input,SurfaceIndex SUM_Surface,SurfaceIndex TileInfo_Buffer,bool b,bool vdenc_enable,bool is_hevc_vp9_vdenc)101 inline HME( vector<uchar, CURBEDATA_SIZE> CURBEData,
102             SurfaceIndex HME_MV_Data_Surface_index,
103             SurfaceIndex HME_MV_Input_Data_Surface_index,
104             SurfaceIndex DISTORTION_Surface,
105             SurfaceIndex BRC_DISTORTION_Surface,
106             SurfaceIndex Pred_Surface_L0,
107             SurfaceIndex Pred_Surface_L1,
108             SurfaceIndex StreamINSurface,
109             SurfaceIndex StreamINSurface_input,
110             SurfaceIndex SUM_Surface,
111             SurfaceIndex TileInfo_Buffer,
112             bool b,
113             bool vdenc_enable,
114             bool is_hevc_vp9_vdenc)
115 {
116     ushort mb_x_pos = get_thread_origin_x();
117     ushort mb_y_pos = get_thread_origin_y();
118 
119     vector<short, 2> pos;
120     ushort x_pos = pos(0) = mb_x_pos << 4;
121     ushort y_pos = pos(1) = mb_y_pos << 4;
122 
123     bool  useMVPrevStep       = (CURBEData(24) & BIT4) != 0;
124     bool  EnableMVSum         = (CURBEData(24) & BIT5) != 0;
125     bool  writeDistortions    = (CURBEData(24) & BIT3) != 0;
126     uchar prevMVReadPosFactor = CURBEData(60);
127 
128     uchar MVShiftFactor       = CURBEData(61);
129     uchar picture_heightMB    = CURBEData(17) + 1;
130     uchar NumRefIdxL0         = CURBEData(52);
131     uchar NumRefIdxL1         = CURBEData(53);
132     uchar BRCMVThreshold      = CURBEData(20);
133 
134     vector<ushort, 2> ActualMBDim = 0;
135     U8 HMEStreaminRefCost;
136     U8 ROIMapEnable;
137 
138     if (!is_hevc_vp9_vdenc && vdenc_enable)
139     {
140         ActualMBDim = CURBEData.format<ushort>().select<2,1>(60);
141         HMEStreaminRefCost = CURBEData(54);
142         ROIMapEnable       = CURBEData(55) >> 5;
143     }
144 
145     vector<U32, 8> BD_MV_Sum = 0;
146 
    // Used for determining whether to calculate the refined MV on L0 and L1.
    // If the absolute value of the input (higher level in the hierarchy) refined MV data is below this threshold,
    // we skip the refine calculation.
150     uchar SUPER_COMBINEDIST = CURBEData(25);
151 
152     // calculations on the reference region width and height
153     vector<short,  2> ref_region_size    = CURBEData.select<2,1>(22) - 16;
154     vector<ushort, 2> search_coordinates = ref_region_size    >> 1;
155     vector<short,  2> rxy                = search_coordinates >> 2;
156 
157     // data structures used for VME commands:
158     matrix<uchar, UNI, 32> UNIInput_tmp;
159     matrix<uchar, UNI, 32> UNIInput;
160     matrix<uchar,   4, 32> IMEInput;
161     matrix<uchar,   4, 32> FBRInput;
162     matrix<uchar,   9, 32> IME_output_MV_refine_L0;
163     matrix<uchar,   7, 32> VME_ME_REFINE_L0;
164     matrix<uchar,   9, 32> IME_output_MV_refine_L1;
165     matrix<uchar,   7, 32> VME_ME_REFINE_L1;
166     matrix<uchar,   7, 32> IME_output_MV_L0;
167     matrix<uchar,   7, 32> VME_ME_L0;
168     matrix<uchar,   7, 32> IME_output_MV_L1;
169     matrix<uchar,   7, 32> VME_ME_L1;
170     vector<short,       2> ref0;
171     vector<ushort,     16> costCenter = 0;
172 
173     uchar FBRMbMode, FBRSubMbShape, FBRSubPredMode;
174 
175     // VME search control
176     VMESearchCtrl searchControl = VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START;
177 
178     // initialize universal data
179     UNIInput_tmp.row(0)                 = 0;
180     UNIInput_tmp.format<ushort>()[0, 4] = x_pos;
181     UNIInput_tmp.format<ushort>()[0, 5] = y_pos;
182     UNIInput_tmp.row(0).format<uint>().select<2, 2>(3) = CURBEData.format<uint>().select<2, 2>(3);
183     UNIInput_tmp(0, 20)                                = 0;
184     UNIInput_tmp.row(1)                                = 0;
185     UNIInput_tmp.row(1).format<uint>().select<2, 1>(0) = CURBEData.format<uint>().select<2, 1>(0);
186     UNIInput_tmp.row(1).format<uint>().select<1, 1>(2) = CURBEData.format<uint>().select<1, 1>(2);
187     UNIInput_tmp.select<1,1,2,1>(1, 10)                = rxy(0) + (rxy(1) << 4);
188     UNIInput_tmp.row(1).format<uint>().select<1,1>(7)  = CURBEData.format<uint>().select<1, 1>(7);
189     UNIInput_tmp.row(2).format<uint>()                 = CURBEData.format<uint>().select<8, 1>(8);
190 
191     // initialize IMEInput
192     IMEInput.row(0).format<uint>()                 = CURBEData.format<uint>().select<8, 1>(16);
193     IMEInput.row(1).format<uint>().select<6, 1>(0) = CURBEData.format<uint>().select<6, 1>(24);
194     IMEInput.row(1).format<uint>().select<2, 1>(6) = 0;
195 
196     uchar current_iteration = 0;
197     uint  picture_offset    = 0;
198 
199     // indicate whether we need to check L0 and L1 reference frames respectively
200     vector<uchar, 1> checkL0 = 0;
201     vector<uchar, 1> checkL1 = 0;
202 
203     // field_support
204     char refFieldPolarityL0 = CURBEData(56);
205     char refFieldPolarityL1 = CURBEData(57);
206 
207     uchar isField = (CURBEData(12) >> 7);
208 
209     vector<char, 8> vec1;
210     vec1 = 0xff;
211 
212     vector<char, 8>ref0Polarity;
213     ref0Polarity = 0;
214     ref0Polarity.merge(vec1,refFieldPolarityL0);
215 
216     vector<char, 8>ref1Polarity;
217     ref1Polarity = 0;
218 
219     if (b)
220     {
221         ref1Polarity.merge(vec1,refFieldPolarityL1);
222     }
223 
224     /*some local declarations that were moved to the top*/
225     matrix<uchar, 4, 32> MV;
226 
227     matrix<ushort, 4, 4> best_DistL0;
228     matrix<uint,   4, 4> best_refId;
229     matrix<uint,   4, 4> best_MV;
230     matrix<uchar,  4, 4> ROIMask(0);
231 
232     //save values for streamIn
233     matrix<uint, 2, 8> streamIn_MV = 0;
234     uchar  PredictorSelect         = 0;
235     ushort PredictorRefIdx         = 0;
236 
237     if (!is_hevc_vp9_vdenc && vdenc_enable)
238     {
239         char Streamin_MaxRef;
240 
241         best_DistL0     = 0xffff;
242         best_refId      = 0;
243         Streamin_MaxRef = (NumRefIdxL0 > 2 ? 2 : NumRefIdxL0);
244         NumRefIdxL0     = Streamin_MaxRef;
245     }
246 
247     if (is_hevc_vp9_vdenc && vdenc_enable)
248     {
249         NumRefIdxL0 = NumRefIdxL0 > 2 ? 2 : NumRefIdxL0;
250         ActualMBDim = CURBEData.format<ushort>().select<2, 1>(60);
251     }
252 
253     while (current_iteration <= NumRefIdxL0)
254     {
255         // initialize ME refinement cost to maximum
256         VME_ME_REFINE_L0.row(0).format<ushort>()[4] = 0xFFFF;
257         VME_ME_REFINE_L1.row(0).format<ushort>()[4] = 0xFFFF;
258 
259         UNIInput_tmp.row(1).format<char>().select<1,1>(5).merge(ref0Polarity(current_iteration), 0, isField);
260 
        // SuperHME or UltraHME
262         if ( useMVPrevStep )
263         {
264             vector<uchar,     8> coarse_refine;
265             vector_ref<short, 4> coarse_mv_refine = coarse_refine.format<short>();
266             vector<short,     2> out_refine;
267 
268             // ===========================================
269             //  Get higher level HME MV refinement data
270             // ===========================================
271             /*
272             * The position of the MV of the last MB (in the vertical direction) should be
273             * at read_y_pos = (picture_heightMB << prevMVReadPosFactor)
274             * which means our input surface must have at least (picture_heightMB << prevMVReadPosFactor)
275             * motion vectors in each column. Each thread write 4 MV in a column so we have to round this number
276             * to the next multiplication of 4
277             */
278             uint subMB_offset_dscaled_pic = (( (picture_heightMB << prevMVReadPosFactor) + 3 ) & ~0x3);
279             subMB_offset_dscaled_pic     *= current_iteration;
280 
281             /*
282             * Read the MV of the current MB from the previous HME level.
283             *   The desired MV located in the higher level in corresponding SubMB which has (pos << prevMVReadPosFactor)
284             *   We look for the index in the array of subMB therfore we need to map between our MB index position
285             *   (In the current resolution) to the SubMB index in the previous level.
286             *
287             *   The index of the x pos is the horizontal index of the current MB shifted by the
288             *   read position factor given by the user. we add <<3 because each MV is 8 bytes
289             *   The index for the y pos is the vertical index of the current MB shifted by the
290             *   read position factor given by the user. we add subMB_offset_dscaled_pic so we get
291             *   the MV from the correct reference (the input surface arranged as a list).
292             */
293             uint read_x_pos = mb_x_pos << prevMVReadPosFactor;
294             uint read_y_pos = mb_y_pos << prevMVReadPosFactor;
295             read(HME_MV_Input_Data_Surface_index, read_x_pos << 3, read_y_pos + subMB_offset_dscaled_pic, coarse_refine);
296 
297             // Compare input refine MV values to threshold
298             vector<ushort, 4> v_mask = (cm_abs<short>(coarse_mv_refine) >> 2) < SUPER_COMBINEDIST;
299 
300             checkL0 = 1;
301             checkL0.merge(0, v_mask(0) & v_mask(1));
302 
303             if (b)
304             {
305                 checkL1 = 1;
306                 checkL1.merge(0, v_mask(2) & v_mask(3));
307             }
308 
309             if (checkL0(0))
310             {
311                 UNIInput.row(0) = UNIInput_tmp.row(0);
312                 UNIInput.row(1) = UNIInput_tmp.row(1);
313                 UNIInput.row(2) = UNIInput_tmp.row(2);
314                 UNIInput.row(3) = 0;
315                 UNIInput.row(1).select<4, 1>(24) = current_iteration;
316 
317                 HME_SET_REF( CURBEData, coarse_mv_refine.select<2, 1>(0), out_refine, pos );
318                 UNIInput.format<short, 4, 16>().select<1, 1, 2, 1>(0, 0) = out_refine;
319 
320                 ref0 = out_refine;
321                 run_vme_ime(UNIInput,
322                            IMEInput.select<2, 1, 32, 1>(0,0),
323                            VME_STREAM_OUT,
324                            searchControl,
325                            Pred_Surface_L0,
326                            ref0,
327                            0,
328                            costCenter,
329                            IME_output_MV_refine_L0.format<uchar, 9, 32>());
330 
331                 UNIInput.format<int, 4, 8>()[2][5] =  IME_output_MV_refine_L0.format<int, 9, 8>()[0][6] & 0xFFFF00;
332                 UNIInput[2][20] = IME_output_MV_refine_L0[0][0] & 0x03;
333                 FBRInput.row(0) = IME_output_MV_refine_L0.row(1);
334                 FBRInput.row(1) = IME_output_MV_refine_L0.row(2);
335                 FBRInput.row(2) = IME_output_MV_refine_L0.row(3);
336                 FBRInput.row(3) = IME_output_MV_refine_L0.row(4);
337 
338                 FBRMbMode      = VME_GET_UNIInput_FBRMbModeInput(UNIInput);
339                 FBRSubMbShape  = VME_GET_UNIInput_FBRSubMBShapeInput(UNIInput);
340                 FBRSubPredMode = VME_GET_UNIInput_FBRSubPredModeInput(UNIInput);
341 
342                 run_vme_fbr(UNIInput,
343                             FBRInput,
344                             Pred_Surface_L0,
345                             FBRMbMode,
346                             FBRSubMbShape,
347                             FBRSubPredMode,
348                             VME_ME_REFINE_L0);
349             }
350 
351             if (b)
352             {
353                 if((current_iteration <= NumRefIdxL1) && checkL1(0))
354                 {
355                     UNIInput_tmp.row(1).format<char>().select<1, 1>(5).merge(ref1Polarity(current_iteration), 0, isField);
356 
357                     UNIInput.row(0) = UNIInput_tmp.row(0);
358                     UNIInput.row(1) = UNIInput_tmp.row(1);
359                     UNIInput.row(2) = UNIInput_tmp.row(2);
360                     UNIInput.row(3) = 0;
361                     UNIInput.row(1).select<4,1>(24) = current_iteration;
362 
363                     HME_SET_REF(CURBEData, coarse_mv_refine.select<2, 1>(2), out_refine, pos);
364 
365                     UNIInput.format<short, 4, 16>().select<1, 1, 2, 1>(0, 0) = out_refine;
366                     ref0 = out_refine;
367 
368                     run_vme_ime(UNIInput,
369                                 IMEInput.select<2, 1, 32, 1>(0, 0),
370                                 VME_STREAM_OUT,
371                                 searchControl,
372                                 Pred_Surface_L1,
373                                 ref0,
374                                 0,
375                                 costCenter,
376                                 IME_output_MV_refine_L1.format<uchar, 9, 32>());
377 
378                     UNIInput.format<int, 4, 8>()[2][5] =  IME_output_MV_refine_L1.format<int, 9, 8>()[0][6] & 0xFFFF00;
379                     UNIInput[2][20] = IME_output_MV_refine_L1[0][0] & 0x03;
380                     FBRInput.row(0) = IME_output_MV_refine_L1.row(1);
381                     FBRInput.row(1) = IME_output_MV_refine_L1.row(2);
382                     FBRInput.row(2) = IME_output_MV_refine_L1.row(3);
383                     FBRInput.row(3) = IME_output_MV_refine_L1.row(4);
384 
385                     FBRMbMode       = VME_GET_UNIInput_FBRMbModeInput(UNIInput);
386                     FBRSubMbShape   = VME_GET_UNIInput_FBRSubMBShapeInput(UNIInput);
387                     FBRSubPredMode  = VME_GET_UNIInput_FBRSubPredModeInput(UNIInput);
388 
389                     run_vme_fbr(UNIInput,
390                                 FBRInput,
391                                 Pred_Surface_L1,
392                                 FBRMbMode,
393                                 FBRSubMbShape,
394                                 FBRSubPredMode,
395                                 VME_ME_REFINE_L1);
396 
397                     VME_ME_REFINE_L0.format<int, 7, 8>().select<4,1,4,2>(1,1) = VME_ME_REFINE_L1.format<int, 7, 8>().select<4, 1, 4, 2>(1, 0);
398                 }
399             }
400 
401             VME_ME_REFINE_L0.format<short, 7, 16>().select<4, 1, 16, 1>(1, 0) <<=  MVShiftFactor;
402         }
403 
404         UNIInput.row(0) = UNIInput_tmp.row(0);
405         UNIInput.row(1) = UNIInput_tmp.row(1);
406         UNIInput.row(2) = UNIInput_tmp.row(2);
407         UNIInput.row(3) = 0;
408 
409         UNIInput.row(1).select<4, 1>(24)                         =  current_iteration;
410         UNIInput.format<short, 4, 16>().select<1, 1, 2, 1>(0, 0) = -search_coordinates;
411         ref0                                                     = -search_coordinates;
412 
413         if (checkL0(0))
414         {
415             IMEInput.row(2) = IME_output_MV_refine_L0.row(7);
416             IMEInput.row(3) = IME_output_MV_refine_L0.row(8);
417             run_vme_ime(UNIInput,
418                         IMEInput,
419                         VME_STREAM_IN,
420                         searchControl,
421                         Pred_Surface_L0,
422                         ref0,
423                         NULL,
424                         costCenter,
425                         IME_output_MV_L0);
426         }
427         else
428         {
429             run_vme_ime( UNIInput,
430             IMEInput.select<2, 1, 32, 1>(0,0),
431                 VME_STREAM_DISABLE,
432                 searchControl,
433                 Pred_Surface_L0,
434                 ref0,
435                 NULL,
436                 costCenter,
437                 IME_output_MV_L0.format<uchar, 7, 32>());
438         }
439 
440         UNIInput.format<int, 4, 8>()[2][5] = IME_output_MV_L0.format<int, 7, 8>()[0][6] & 0xFFFF00;
441         UNIInput[2][20]                    = IME_output_MV_L0[0][0] & 0x03;
442         FBRInput.row(0)                    = IME_output_MV_L0.row(1);
443         FBRInput.row(1)                    = IME_output_MV_L0.row(2);
444         FBRInput.row(2)                    = IME_output_MV_L0.row(3);
445         FBRInput.row(3)                    = IME_output_MV_L0.row(4);
446 
447         FBRMbMode      = VME_GET_UNIInput_FBRMbModeInput(UNIInput);
448         FBRSubMbShape  = VME_GET_UNIInput_FBRSubMBShapeInput(UNIInput);
449         FBRSubPredMode = VME_GET_UNIInput_FBRSubPredModeInput(UNIInput);
450 
451         run_vme_fbr(UNIInput,
452                     FBRInput,
453                     Pred_Surface_L0,
454                     FBRMbMode,
455                     FBRSubMbShape,
456                     FBRSubPredMode,
457                     VME_ME_L0);
458 
459         if (b)
460         {
461             if(current_iteration <= NumRefIdxL1)
462             {
463                 UNIInput_tmp.row(1).format<char>().select<1, 1>(5).merge(ref1Polarity(current_iteration), 0, isField);
464 
465                 UNIInput.row(0) = UNIInput_tmp.row(0);
466                 UNIInput.row(1) = UNIInput_tmp.row(1);
467                 UNIInput.row(2) = UNIInput_tmp.row(2);
468                 UNIInput.row(3) = 0;
469 
470                 UNIInput.row(1).select<4, 1>(24)                        =  current_iteration;
471                 UNIInput.format<short, 4, 16>().select<1, 1, 2, 1>(0,0) = -search_coordinates;
472                 ref0                                                    = -search_coordinates;
473 
474                 if (checkL1(0))
475                 {
476                     IMEInput.row(2) = IME_output_MV_refine_L1.row(7);
477                     IMEInput.row(3) = IME_output_MV_refine_L1.row(8);
478 
479                     run_vme_ime(UNIInput,
480                                 IMEInput,
481                                 VME_STREAM_IN,
482                                 searchControl,
483                                 Pred_Surface_L1,
484                                 ref0,
485                                 NULL,
486                                 costCenter,
487                                 IME_output_MV_L1);
488                 }
489                 else
490                 {
491                     run_vme_ime(UNIInput,
492                                 IMEInput.select<2, 1, 32, 1>(0, 0),
493                                 VME_STREAM_DISABLE,
494                                 searchControl,
495                                 Pred_Surface_L1,
496                                 ref0,
497                                 NULL,
498                                 costCenter,
499                                 IME_output_MV_L1.format<uchar, 7, 32>());
500                 }
501 
502                 UNIInput.format<int, 4, 8>()[2][5] = IME_output_MV_L1.format<int, 7, 8>()[0][6] & 0xFFFF00;
503                 UNIInput[2][20] = IME_output_MV_L1[0][0] & 0x03;
504                 FBRInput.row(0) = IME_output_MV_L1.row(1);
505                 FBRInput.row(1) = IME_output_MV_L1.row(2);
506                 FBRInput.row(2) = IME_output_MV_L1.row(3);
507                 FBRInput.row(3) = IME_output_MV_L1.row(4);
508 
509                 FBRMbMode       = VME_GET_UNIInput_FBRMbModeInput(UNIInput);
510                 FBRSubMbShape   = VME_GET_UNIInput_FBRSubMBShapeInput(UNIInput);
511                 FBRSubPredMode  = VME_GET_UNIInput_FBRSubPredModeInput(UNIInput);
512 
513                 run_vme_fbr(UNIInput,
514                             FBRInput,
515                             Pred_Surface_L1,
516                             FBRMbMode,
517                             FBRSubMbShape,
518                             FBRSubPredMode,
519                             VME_ME_L1);
520 
521                 VME_ME_L0.format<int, 7, 8>().select<4, 1, 4, 2>(1, 1) = VME_ME_L1.format<int, 7, 8>().select<4, 1, 4, 2>(1, 0);
522             }
523         }
524 
525         VME_ME_L0.format<short, 7, 16>().select<4, 1, 16, 1>(1, 0) <<= MVShiftFactor;
526 
527         matrix<ushort, 4, 8> vmeMask = VME_ME_REFINE_L0.row(0).format<ushort>()[4] < VME_ME_L0.row(0).format<ushort>()[4];
528 
529         MV.format<short, 4, 16>().select<4, 1, 8, 1>(0, 0).merge(
530             VME_ME_REFINE_L0.format<short, 7, 16>().select<2, 2, 16, 1>(1, 0),
531             VME_ME_L0.format<short, 7, 16>().select<2, 2, 16, 1>(1, 0), vmeMask);
532 
533         MV.format<short, 4, 16>().select<4, 1, 8, 1>(0, 8).merge(
534             VME_ME_REFINE_L0.format<short, 7, 16>().select<2, 2, 16, 1>(2, 0),
535             VME_ME_L0.format<short, 7, 16>().select<2, 2, 16, 1>(2, 0), vmeMask);
536 
537         if (b)
538         {
539             if(current_iteration <= NumRefIdxL1)
540             {
541                 matrix<ushort, 4, 2> vmeMask1 = VME_ME_REFINE_L1.row(0).format<ushort>()[4] < VME_ME_L1.row(0).format<ushort>()[4];
542 
543                 MV.format<int, 4, 8>().select<4, 1, 2, 2>(0, 1).merge(
544                                 VME_ME_REFINE_L0.format<int, 7, 8>().select<2, 2, 4, 2>(1, 1),
545                                 VME_ME_L0.format<int, 7, 8>().select<2, 2, 4, 2>(1, 1), vmeMask1);
546 
547                 MV.format<int, 4, 8>().select<4, 1, 2, 2>(0, 5).merge(
548                                 VME_ME_REFINE_L0.format<int, 7, 8>().select<2, 2, 4, 2>(2, 1),
549                                 VME_ME_L0.format<int, 7, 8>().select<2, 2, 4, 2>(2, 1), vmeMask1);
550             }
551         }
552 
553         /*
554         *  Each MV (there are 1 per SubMB, 16 total) will be used by a corresponding MB in the next level.
555         *  Each MB in the next level would use the MV of the corresponding Sub MB in the current level
556         */
557         write( HME_MV_Data_Surface_index, x_pos << 1, (y_pos >> SHIFT_PIXEL_TO_SUB_MB) + picture_offset, MV);
558 
559         if (current_iteration==0)
560         {
561             matrix_ref<short, 4, 16>FinalMV = MV.format<short, 4, 16>();
562             matrix<U16, 2, 2> mx            = cm_abs<U16, short>(FinalMV.select<2, 2, 2, 8>(0, 0));
563             matrix<U16, 2, 2> my            = cm_abs<U16, short>(FinalMV.select<2, 2, 2, 8>(0, 1));
564 
565             matrix<U16, 2, 2> mvsum = 0;
566             mvsum.merge(1, 0, (mx + my) < BRCMVThreshold);
567             BD_MV_Sum(0) = cm_sum<U32, U16, 4>(mvsum.format<U16>());
568         }
569 
570         if (is_hevc_vp9_vdenc && vdenc_enable)
571         {
572             streamIn_MV.select<2, 1, 2, 4>(0, current_iteration) = MV.format<uint, 4, 8>().select<2, 2, 2, 4>(0, 0);
573             PredictorSelect = PredictorSelect | (1 << (2 * current_iteration));
574             PredictorRefIdx = PredictorRefIdx | (current_iteration << (current_iteration << 2));
575 
576             if (b & (current_iteration == 0))
577             {
578                 streamIn_MV.select<2, 1, 2, 4>(0, NumRefIdxL0 + 1) = MV.format<uint, 4, 8>().select<2, 2, 2, 4>(0, 1);
579 
580                 PredictorSelect = PredictorSelect | (2 << (2 * (NumRefIdxL0 + 1)));
581                 PredictorRefIdx = PredictorRefIdx | (current_iteration << ((NumRefIdxL0 + 1) << 2));
582             }
583                       }
584 
585         if (b)
586         {
587             if(current_iteration <= NumRefIdxL1)
588             {
589                 MV.format<int, 4, 8 >().select< 4, 1, 4, 2>(0,0) = MV.format<int, 4, 8 >().select< 4, 1, 4, 2>(0,1);
590                 MV.format<int, 4, 8 >().select< 4, 1, 4, 2>(0,1) = 0;
591                 write(HME_MV_Data_Surface_index, x_pos << 1, (y_pos >> SHIFT_PIXEL_TO_SUB_MB) + picture_offset + (picture_heightMB << 5) , MV);
592             }
593         }
594 
595         if (writeDistortions)
596         {
597             matrix<uchar, 4, 8> DistL0;
598             matrix<uchar, 4, 8> DistL1;
599 
600             matrix<ushort, 4, 8> distMask = VME_ME_REFINE_L0.row(0).format<ushort>()[4] < VME_ME_L0.row(0).format<ushort>()[4];
601             DistL0.merge(VME_ME_REFINE_L0.row(5).format<uchar, 4, 8>(), VME_ME_L0.row(5).format<uchar, 4, 8>(), distMask);
602 
603             if (b)
604             {
605                 if(current_iteration <= NumRefIdxL1)
606                 {
607                     distMask = VME_ME_REFINE_L1.row(0).format<ushort>()[4] < VME_ME_L1.row(0).format<ushort>()[4];
608                     DistL1.merge(VME_ME_REFINE_L1.row(5).format<uchar, 4, 8>(), VME_ME_L1.row(5).format<uchar, 4, 8>(), distMask);
609                 }
610             }
611 
612             if (current_iteration == 0)
613             {
614                 matrix<uchar, 4, 8> BRCdist = DistL0;
615 
616                 if (b)
617                 {
618                     if (current_iteration <= NumRefIdxL1)
619                     {
620                         BRCdist.format<ushort>() = cm_min<ushort>(DistL0.format<ushort>(), DistL1.format<ushort>());
621                     }
622                 }
623 
624                 if (!vdenc_enable)
625                 {
626                     write(BRC_DISTORTION_Surface, x_pos >> 1, y_pos >> 2, BRCdist);
627                 }
628 
629                 BD_MV_Sum(1) = cm_sum<U32, U16, 16>(BRCdist.format<U16>());
630 
631                 vector<uint, 8> local_offset = 7;
632                 vector<uint, 8> ret;
633 
634                 local_offset[0] = 0;
635                 local_offset[1] = 1;
636 
637                 if (EnableMVSum)
638                 {
639                     write<uint, 8>(SUM_Surface, ATOMIC_ADD, 0, local_offset, BD_MV_Sum, ret);
640                 }
641             }
642 
643             matrix<uchar, 4, 8> dist;
644             dist = DistL0;
645             dist.format<ushort, 4, 4>().select<4, 1, 1, 1>(0, 1) = dist.format<ushort, 4, 4>().select<4, 1, 1, 1>(0, 0);
646             dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 2) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 0);
647             dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 0) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(0, 0);
648             dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(0, 2) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 2);
649 
650             write(DISTORTION_Surface, x_pos >> 1, (y_pos >> 2) + picture_offset, dist);
651 
652             if (!is_hevc_vp9_vdenc && vdenc_enable)
653             {
654                 vector<uchar,1>factor = 0;
655                 factor.merge(HMEStreaminRefCost, current_iteration != 0);
656 
657                 matrix<ushort, 4, 4>sumDist    = dist.format<ushort>() + factor(0);
658                 matrix<uint,   4, 4>curr_refID = current_iteration;
659                 matrix<uchar,  4, 4>dist_mask  = (best_DistL0 >= sumDist);
660 
661                 best_DistL0.merge(sumDist,dist_mask);
662                 best_refId.merge(curr_refID,dist_mask);
663                 best_MV.merge(MV.format<uint, 4, 8>().select<4, 1, 4, 2>(0, 0), dist_mask);
664             }
665 
666             if (b)
667             {
668                 if (current_iteration <= NumRefIdxL1)
669                 {
670                     dist = DistL1;
671                     dist.format<ushort, 4, 4>().select<4, 1, 1, 1>(0, 1) = dist.format<ushort, 4, 4>().select<4, 1, 1, 1>(0, 0);
672                     dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 2) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 0);
673                     dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 0) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(0, 0);
674                     dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(0, 2) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 2);
675                     write(DISTORTION_Surface, x_pos >> 1, (y_pos >> 2) + picture_offset + (picture_heightMB << 5), dist);
676                 }
677             }
678         }
679 
680         /*
681         * All the MV of the references are arranged in the input surface as a list.
682         * Assume that each reference has X MB in a column so the MV of the top left block of the
683         * first reference is located at (0,0) and the top left MV of the next reference is located
684         * at ( X << SHIFT_MB_TO_SUB_MB, 0)
685         */
686         picture_offset += (picture_heightMB << SHIFT_MB_TO_SUB_MB);
687 
688         if (0 == current_iteration && (!is_hevc_vp9_vdenc && vdenc_enable) && ROIMapEnable == 3)
689         {
690             ROIMask = !((best_DistL0 < 10) & (best_MV.format<ushort,4, 8>().select<4, 1, 4, 2>(0, 0) == 0) & (best_MV.format<ushort, 4, 8>().select<4, 1, 4, 2>(0, 1) == 0));
691         }
692 
693         current_iteration++;
694     }
695 
696     if (is_hevc_vp9_vdenc && vdenc_enable)
697     {
698         vector<ushort, 2> Sizein32x32RoundDown = (ActualMBDim) >> 5;
699         vector<ushort, 2> Sizein32x32RoundUp   = (ActualMBDim + 31) >> 5;
700         vector<ushort, 2> mask                 = ((CURBEData[24] & 0x04) != 0);
701         Sizein32x32RoundUp.merge(((ActualMBDim + 63) >> 6) << 1, mask);
702 
703         uint mb_xpos_32 = mb_x_pos << 1;
704         uint mb_ypos_32 = mb_y_pos << 1;
705 
706         bool tiling = (CURBEData(24) & 0x40) != 0;
707 
708         vector<uint, 32> Streamin_read_row1 = 0;
709         vector<uint, 32> Streamin_read_row2 = 0;
710 
711         vector_ref<uint, 16> Streamin_read_row1_blk1 = Streamin_read_row1.select<16, 1>(0);
712         vector_ref<uint, 16> Streamin_read_row2_blk1 = Streamin_read_row2.select<16, 1>(0);
713 
714         vector<uint,   4> streamin_offset;
715         vector<ushort, 8> tileinfo;
716 
717         if (tiling)
718         {
719             vector<uint,   1> tile_info_offset;
720             vector<uint,   1> Offset_tile_streamin;
721             vector<ushort, 2> ActualMBDimLCU = Sizein32x32RoundUp;
722 
723             if ((CURBEData[24] & 0x04))
724             {
725                 ActualMBDimLCU = ActualMBDimLCU >> 1;
726 
727                 tile_info_offset = (mb_x_pos + (mb_y_pos * ActualMBDimLCU(0))) * TILEINFO_SIZE;
728                 read(TileInfo_Buffer, tile_info_offset(0), tileinfo);
729 
730                 Offset_tile_streamin(0) = tileinfo(4) * tileinfo(5) + (tileinfo(4) * (tileinfo(7) - tileinfo(5)))
731                                            + (tileinfo(5) * (ActualMBDimLCU(0) - tileinfo(4)));
732 
733                 Offset_tile_streamin(0) = Offset_tile_streamin(0)  * LCU64_STREAMIN_SIZE;
734                 streamin_offset(0)      = (mb_y_pos - tileinfo(5)) * (tileinfo(6) - tileinfo(4)) + mb_x_pos - tileinfo(4);
735 
736                 streamin_offset(0) = Offset_tile_streamin(0) + (streamin_offset(0) * LCU64_STREAMIN_SIZE);
737                 streamin_offset(1) = streamin_offset(0) + STREAMIN_SIZE;
738                 streamin_offset(2) = streamin_offset(0) + (2 * STREAMIN_SIZE);
739                 streamin_offset(3) = streamin_offset(0) + (3 * STREAMIN_SIZE);
740             }
741             else
742             {
743                 #pragma unroll
744                 for (ushort i = 0; i < 4; i++)
745                 {
746                     ushort offset_x = i & 0x01;
747                     ushort offset_y = (i & 0x02) >> 1;
748                     tile_info_offset = ((mb_xpos_32 + offset_x) + ((mb_ypos_32 + offset_y) * Sizein32x32RoundUp(0))) * TILEINFO_SIZE;
749 
750                         read(TileInfo_Buffer, tile_info_offset(0), tileinfo);
751 
752                     Offset_tile_streamin(0) = tileinfo(4) * tileinfo(5) + (tileinfo(4) * (tileinfo(7) - tileinfo(5)))
753                     + (tileinfo(5) * (Sizein32x32RoundUp(0) - tileinfo(4)));
754 
755                     Offset_tile_streamin(0) = Offset_tile_streamin(0) * STREAMIN_SIZE;
756 
757                     streamin_offset(i) = ((mb_ypos_32 + offset_y) - tileinfo(5)) * (tileinfo(6) - tileinfo(4)) + (mb_xpos_32 + offset_x) - tileinfo(4);
758                     streamin_offset(i) = Offset_tile_streamin(0) + (streamin_offset(i) * STREAMIN_SIZE);
759                 }
760             }
761         }
762         else
763         {
764             streamin_offset(0) = (mb_xpos_32 + (mb_ypos_32 * Sizein32x32RoundUp(0))) * STREAMIN_SIZE;
765             streamin_offset(2) = streamin_offset(0) + (Sizein32x32RoundUp(0) * STREAMIN_SIZE);
766 
767             streamin_offset.select<1, 1>(0).merge((((mb_x_pos << 2) + ((mb_y_pos << 1) * Sizein32x32RoundUp(0))) * STREAMIN_SIZE), ((CURBEData[24] & 0x04) != 0));
768             streamin_offset.select<1, 1>(2).merge((streamin_offset.select<1, 1>(0) + (2 * STREAMIN_SIZE)), ((CURBEData[24] & 0x04) != 0));
769 
770             streamin_offset(1) = streamin_offset(0) + STREAMIN_SIZE;
771             streamin_offset(3) = streamin_offset(2) + STREAMIN_SIZE;
772         }
773 
774         if (!(CURBEData[24] & 0x02))
775         {
776             Streamin_read_row1.select<2, 16>(0)     = CURBEData.format<uint>()(31);
777             Streamin_read_row2.select<2, 16>(0)     = CURBEData.format<uint>()(31);
778             Streamin_read_row1.select<4, 1>(1)      = CURBEData.format<uint>().select<4, 1>(32);
779             Streamin_read_row1.select<4, 1>(17)     = CURBEData.format<uint>().select<4, 1>(32);
780             Streamin_read_row2.select<4, 1>(1)      = CURBEData.format<uint>().select<4, 1>(32);
781             Streamin_read_row2.select<4, 1>(17)     = CURBEData.format<uint>().select<4, 1>(32);
782             Streamin_read_row1.select<2, 16>(6)     = CURBEData.format<uint>()(36);
783             Streamin_read_row2.select<2, 16>(6)     = CURBEData.format<uint>()(36);
784             Streamin_read_row1.select<2, 16>(7)     = CURBEData.format<uint>()(37);
785             Streamin_read_row2.select<2, 16>(7)     = CURBEData.format<uint>()(37);
786             Streamin_read_row1.select<2, 16>(14)    = CURBEData.format<uint>()(38);
787             Streamin_read_row2.select<2, 16>(14)    = CURBEData.format<uint>()(38);
788 
789             if (((mb_xpos_32 + 1) < Sizein32x32RoundDown(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundDown(1)))
790             {
791                 Streamin_read_row1.select<4, 1>(8)  = streamIn_MV.select<1, 1, 4, 1>(0, 0);
792                 Streamin_read_row1.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(0, 4);
793                 Streamin_read_row2.select<4, 1>(8)  = streamIn_MV.select<1, 1, 4, 1>(1, 0);
794                 Streamin_read_row2.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(1, 4);
795 
796                 Streamin_read_row1.format<uchar>().select<2, 64>(31)  = PredictorSelect;
797                 Streamin_read_row2.format<uchar>().select<2, 64>(31)  = PredictorSelect;
798 
799                 Streamin_read_row1.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
800                 Streamin_read_row2.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
801             }
802             else if (((mb_xpos_32 + 1) < Sizein32x32RoundDown(0)) & ((mb_ypos_32) < Sizein32x32RoundDown(1)))
803             {
804                 Streamin_read_row1.select<4, 1>(8)  = streamIn_MV.select<1, 1, 4, 1>(0, 0);
805                 Streamin_read_row1.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(0, 4);
806 
807                 Streamin_read_row1.format<uchar>().select<2, 64>(31)  = PredictorSelect;
808                 Streamin_read_row1.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
809             }
810             else if (((mb_xpos_32) < Sizein32x32RoundDown(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundDown(1)))
811             {
812                 Streamin_read_row1_blk1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(0, 0);
813                 Streamin_read_row2_blk1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(1, 0);
814 
815                 Streamin_read_row1_blk1.format<uchar>()(31)  = PredictorSelect;
816                 Streamin_read_row2_blk1.format<uchar>()(31)  = PredictorSelect;
817 
818                 Streamin_read_row1_blk1.format<ushort>()(24) = PredictorRefIdx;
819                 Streamin_read_row2_blk1.format<ushort>()(24) = PredictorRefIdx;
820 
821             }
822             else if (mb_xpos_32 < Sizein32x32RoundDown(0) && mb_ypos_32  < Sizein32x32RoundDown(1))
823             {
824                 Streamin_read_row1_blk1.select<4, 1>(8)      = streamIn_MV.select<1, 1, 4, 1>(0, 0);
825                 Streamin_read_row1_blk1.format<uchar>()(31)  = PredictorSelect;
826                 Streamin_read_row1_blk1.format<ushort>()(24) = PredictorRefIdx;
827             }
828 
829             // Note that in the if else blocks Sizein32x32RoundUp is being used. This is because when stream-in input is disabled, kernel has to populate other stream in fields(Num merge candidates etc) from CURBE
830             // For HME predictors we need to round down the frame size. For other fields we need to round up
831             if (((mb_xpos_32 + 1) < Sizein32x32RoundUp(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundUp(1)))
832             {
833                 write(StreamINSurface, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
834                 write(StreamINSurface, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
835                 write(StreamINSurface, streamin_offset(2), Streamin_read_row2.select<16, 1>(0));
836                 write(StreamINSurface, streamin_offset(3), Streamin_read_row2.select<16, 1>(16));
837             }
838             else if (((mb_xpos_32 + 1) < Sizein32x32RoundUp(0)) & ((mb_ypos_32) < Sizein32x32RoundUp(1)))
839             {
840                 write(StreamINSurface, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
841                 write(StreamINSurface, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
842             }
843             else if (((mb_xpos_32) < Sizein32x32RoundUp(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundUp(1)))
844             {
845                 write(StreamINSurface, streamin_offset(0), Streamin_read_row1_blk1);
846                 write(StreamINSurface, streamin_offset(2), Streamin_read_row2_blk1);
847             }
848             else
849             {
850                 write(StreamINSurface, streamin_offset(0), Streamin_read_row1_blk1);
851             }
852         }
853         else
854         {
855             if (((mb_xpos_32 + 1) < Sizein32x32RoundDown(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundDown(1)))
856             {
857                 read(StreamINSurface_input, streamin_offset(0), Streamin_read_row1.select<16,1>(0));
858                 read(StreamINSurface_input, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
859                 read(StreamINSurface_input, streamin_offset(2), Streamin_read_row2.select<16, 1>(0));
860                 read(StreamINSurface_input, streamin_offset(3), Streamin_read_row2.select<16, 1>(16));
861 
862                 Streamin_read_row1.select<4, 1>(8)  = streamIn_MV.select<1, 1, 4, 1>(0, 0);
863                 Streamin_read_row1.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(0, 4);
864                 Streamin_read_row2.select<4, 1>(8)  = streamIn_MV.select<1, 1, 4, 1>(1, 0);
865                 Streamin_read_row2.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(1, 4);
866 
867 
868                 Streamin_read_row1.format<uchar>().select<2, 64>(31)  = PredictorSelect;
869                 Streamin_read_row2.format<uchar>().select<2, 64>(31)  = PredictorSelect;
870 
871                 Streamin_read_row1.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
872                 Streamin_read_row2.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
873 
874 
875                 write(StreamINSurface, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
876                 write(StreamINSurface, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
877                 write(StreamINSurface, streamin_offset(2), Streamin_read_row2.select<16, 1>(0));
878                 write(StreamINSurface, streamin_offset(3), Streamin_read_row2.select<16, 1>(16));
879             }
880             else if (((mb_xpos_32 + 1) < Sizein32x32RoundDown(0)) & ((mb_ypos_32) < Sizein32x32RoundDown(1)))
881             {
882                 read(StreamINSurface_input, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
883                 read(StreamINSurface_input, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
884 
885                 Streamin_read_row1.select<4, 1>(8)  = streamIn_MV.select<1, 1, 4, 1>(0, 0);
886                 Streamin_read_row1.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(0, 4);
887 
888                 Streamin_read_row1.format<uchar>().select<2, 64>(31)  = PredictorSelect;
889                 Streamin_read_row1.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
890 
891                 write(StreamINSurface, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
892                 write(StreamINSurface, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
893             }
894             else if (((mb_xpos_32) < Sizein32x32RoundDown(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundDown(1)))
895             {
896                 read(StreamINSurface_input, streamin_offset(0), Streamin_read_row1_blk1);
897                 read(StreamINSurface_input, streamin_offset(2), Streamin_read_row2_blk1);
898 
899                 Streamin_read_row1_blk1.select<4, 1>(8)      = streamIn_MV.select<1, 1, 4, 1>(0, 0);
900                 Streamin_read_row2_blk1.select<4, 1>(8)      = streamIn_MV.select<1, 1, 4, 1>(1, 0);
901 
902                 Streamin_read_row1_blk1.format<uchar>()(31)  = PredictorSelect;
903                 Streamin_read_row2_blk1.format<uchar>()(31)  = PredictorSelect;
904 
905                 Streamin_read_row1_blk1.format<ushort>()(24) = PredictorRefIdx;
906                 Streamin_read_row2_blk1.format<ushort>()(24) = PredictorRefIdx;
907 
908                 write(StreamINSurface, streamin_offset(0), Streamin_read_row1_blk1);
909                 write(StreamINSurface, streamin_offset(2), Streamin_read_row2_blk1);
910             }
911             else if (mb_xpos_32 < Sizein32x32RoundDown(0) && mb_ypos_32  < Sizein32x32RoundDown(1))
912             {
913                 read(StreamINSurface_input, streamin_offset(0), Streamin_read_row1_blk1);
914 
915                 Streamin_read_row1_blk1.select<4, 1>(8)      = streamIn_MV.select<1, 1, 4, 1>(0, 0);
916                 Streamin_read_row1_blk1.format<uchar>()(31)  = PredictorSelect;
917                 Streamin_read_row1_blk1.format<ushort>()(24) = PredictorRefIdx;
918 
919                 write(StreamINSurface, streamin_offset(0), Streamin_read_row1_blk1);
920             }
921         }
922     }
923 
924     if (!is_hevc_vp9_vdenc && vdenc_enable)
925     {
926         uint initVal, streamin_offset;
927 
928         vector<uint, 16> finalOffset;
929         vector<uint, 16> finalOffset_RefID;
930         vector<uint, 16> finalOffset_ROI;
931         vector<short,16> v_MBaddressIndx(MBaddressIndx );
932         vector<short,16> v_MB_X_Indx( MBX_Indx );
933         vector<uint, 16> lastMBaddr, lastMBmask;
934 
935         vector<uint,  16> offset_debug = 0xffff0000;
936         vector<uint, 256> input_streamInsurface;
937 
938         uint globOffset = ((x_pos + (y_pos * ActualMBDim(0))) << 2);
939 
940         streamin_offset = ActualMBDim(0) * 16;
941 
942         finalOffset       = (v_MBaddressIndx * streamin_offset) + (v_MB_X_Indx * 16) + 2;
943         finalOffset_RefID = finalOffset + 2;
944         finalOffset_ROI   = finalOffset - 2;
945 
946         lastMBaddr = ((y_pos>>2) + v_MBaddressIndx + 1) * streamin_offset;
947         lastMBmask = ((finalOffset + globOffset) > lastMBaddr);
948 
949         uint block1 = globOffset * 4;
950         read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(0));
951 
952         block1 = (globOffset + 32) * 4;
953         read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(32));
954 
955         block1 = (globOffset + streamin_offset) * 4;
956         read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(64));
957 
958         block1 = (globOffset + 32 + streamin_offset) * 4;
959         read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(96));
960 
961         block1 = (globOffset + streamin_offset * 2) * 4;
962         read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(128));
963 
964         block1 = (globOffset + 32 + streamin_offset * 2) * 4;
965         read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(160));
966 
967         block1 = (globOffset + streamin_offset * 3) * 4;
968         read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(192));
969 
970         block1 = (globOffset + 32 + streamin_offset * 3) * 4;
971         read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(224));
972 
973         input_streamInsurface.select<16, 16>(2) = best_MV.format<uint>();
974         input_streamInsurface.select<16, 16>(4) = best_refId.format<uint>();
975 
976         if (ROIMapEnable == 3)
977         {
978             input_streamInsurface.select<16, 16>(0) = (input_streamInsurface.select<16, 16>(0) & 0xFFFFFF00) | (ROIMask.format<uchar>() & 0x000000FF);
979         }
980 
981         uchar boundary_result = lastMBmask.any();
982         if ( boundary_result == 0)
983         {
984             block1 = globOffset * 4;
985             write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(0));
986 
987             block1 = (globOffset + 32) * 4;
988             write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(32));
989 
990             block1 = (globOffset + streamin_offset) * 4;
991             write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(64));
992 
993             block1 = (globOffset + 32 + streamin_offset) * 4;
994             write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(96));
995 
996             block1 = (globOffset + streamin_offset * 2) * 4;
997             write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(128));
998 
999             block1 = (globOffset + 32 + streamin_offset * 2) * 4;
1000             write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(160));
1001 
1002             block1 = (globOffset + streamin_offset * 3) * 4;
1003             write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(192));
1004 
1005             block1 = (globOffset + 32 + streamin_offset * 3) * 4;
1006             write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(224));
1007         }
1008         else
1009         {
1010             block1 = globOffset * 4;
1011             if ((lastMBmask(0) == 0) && (lastMBmask(1) == 0))
1012             {
1013                 write((StreamINSurface),block1, input_streamInsurface.select<32, 1>(0));
1014             }
1015             else if (lastMBmask(0) == 0)
1016             {
1017                 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(0));
1018             }
1019 
1020             block1 = (globOffset + 32) * 4;
1021             if ((lastMBmask(2) == 0) && (lastMBmask(3) == 0))
1022             {
1023                 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(32));
1024             }
1025             else if (lastMBmask(2) == 0)
1026             {
1027                 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(32));
1028             }
1029 
1030             block1 = (globOffset + streamin_offset) * 4;
1031             if ((lastMBmask(4) == 0) && (lastMBmask(5) == 0))
1032             {
1033                 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(64));
1034             }
1035             else if (lastMBmask(4) == 0)
1036             {
1037                 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(64));
1038             }
1039 
1040             block1 = (globOffset + 32 + streamin_offset) * 4;
1041             if ((lastMBmask(6) == 0) && (lastMBmask(7) == 0))
1042             {
1043                 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(96));
1044             }
1045             else if (lastMBmask(6) == 0)
1046             {
1047                 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(96));
1048             }
1049 
1050             block1 = (globOffset + streamin_offset * 2) * 4;
1051             if ((lastMBmask(8) == 0) && (lastMBmask(9) == 0))
1052             {
1053                 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(128));
1054             }
1055             else if (lastMBmask(8) == 0)
1056             {
1057                 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(128));
1058             }
1059 
1060             block1 = (globOffset + 32 + streamin_offset * 2) * 4;
1061             if ((lastMBmask(10) == 0) && (lastMBmask(11) == 0))
1062             {
1063                 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(160));
1064             }
1065             else if (lastMBmask(10) == 0)
1066             {
1067                 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(160));
1068             }
1069 
1070             block1 = (globOffset + streamin_offset * 3) * 4;
1071             if ((lastMBmask(12) == 0) && (lastMBmask(13) == 0))
1072             {
1073                 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(192));
1074             }
1075             else if (lastMBmask(12) == 0)
1076             {
1077                 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(192));
1078             }
1079 
1080             block1 = (globOffset + 32 + streamin_offset * 3) * 4;
1081             if ((lastMBmask(14) == 0 ) && (lastMBmask(15) == 0))
1082             {
1083                 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(224));
1084             }
1085             else if (lastMBmask(14) == 0)
1086             {
1087                 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(224));
1088             }
1089         }
1090     }
1091 }
1092 
1093 extern "C" _GENX_MAIN_ void
HME_P(vector<uchar,CURBEDATA_SIZE> CURBEData,SurfaceIndex HME_MV_Data_Surface_index,SurfaceIndex HME_MV_Input_Data_Surface_index,SurfaceIndex DISTORTION_Surface,SurfaceIndex BRC_DISTORTION_Surface,SurfaceIndex Pred_Surface_L0,SurfaceIndex Pred_Surface_L1,SurfaceIndex StreamINSurface,SurfaceIndex StreamINSurface_input,SurfaceIndex SUM_Surface,SurfaceIndex TileInfo_Buffer)1094 HME_P( vector<uchar, CURBEDATA_SIZE> CURBEData,
1095        SurfaceIndex HME_MV_Data_Surface_index,
1096        SurfaceIndex HME_MV_Input_Data_Surface_index,
1097        SurfaceIndex DISTORTION_Surface,
1098        SurfaceIndex BRC_DISTORTION_Surface,
1099        SurfaceIndex Pred_Surface_L0,
1100        SurfaceIndex Pred_Surface_L1,
1101        SurfaceIndex StreamINSurface,
1102        SurfaceIndex StreamINSurface_input,
1103        SurfaceIndex SUM_Surface,
1104        SurfaceIndex TileInfo_Buffer
1105     )
1106 {
1107     HME(CURBEData,
1108         HME_MV_Data_Surface_index,
1109         HME_MV_Input_Data_Surface_index,
1110         DISTORTION_Surface,
1111         BRC_DISTORTION_Surface,
1112         Pred_Surface_L0,
1113         Pred_Surface_L1,
1114         StreamINSurface,
1115         StreamINSurface_input,
1116         SUM_Surface,
1117         TileInfo_Buffer,
1118         0,
1119         0,
1120         0);
1121 }
1122 
1123 extern "C" _GENX_MAIN_ void
HME_B(vector<uchar,CURBEDATA_SIZE> CURBEData,SurfaceIndex HME_MV_Data_Surface_index,SurfaceIndex HME_MV_Input_Data_Surface_index,SurfaceIndex DISTORTION_Surface,SurfaceIndex BRC_DISTORTION_Surface,SurfaceIndex Pred_Surface_L0,SurfaceIndex Pred_Surface_L1,SurfaceIndex StreamINSurface,SurfaceIndex StreamINSurface_input,SurfaceIndex SUM_Surface,SurfaceIndex TileInfo_Buffer)1124 HME_B( vector<uchar, CURBEDATA_SIZE> CURBEData,
1125       SurfaceIndex HME_MV_Data_Surface_index,
1126       SurfaceIndex HME_MV_Input_Data_Surface_index,
1127       SurfaceIndex DISTORTION_Surface,
1128       SurfaceIndex BRC_DISTORTION_Surface,
1129       SurfaceIndex Pred_Surface_L0,
1130       SurfaceIndex Pred_Surface_L1,
1131       SurfaceIndex StreamINSurface,
1132       SurfaceIndex StreamINSurface_input,
1133       SurfaceIndex SUM_Surface,
1134       SurfaceIndex TileInfo_Buffer
1135       )
1136 {
1137     HME(CURBEData ,
1138         HME_MV_Data_Surface_index,
1139         HME_MV_Input_Data_Surface_index,
1140         DISTORTION_Surface,
1141         BRC_DISTORTION_Surface,
1142         Pred_Surface_L0,
1143         Pred_Surface_L1,
1144         StreamINSurface,
1145         StreamINSurface_input,
1146         SUM_Surface,
1147         TileInfo_Buffer,
1148         1,
1149         0,
1150         0);
1151 }
1152 
1153 extern "C" _GENX_MAIN_ void
HME_VDENC_STREAMIN(vector<uchar,CURBEDATA_SIZE> CURBEData,SurfaceIndex HME_MV_Data_Surface_index,SurfaceIndex HME_MV_Input_Data_Surface_index,SurfaceIndex DISTORTION_Surface,SurfaceIndex BRC_DISTORTION_Surface,SurfaceIndex Pred_Surface_L0,SurfaceIndex Pred_Surface_L1,SurfaceIndex StreamINSurface,SurfaceIndex StreamINSurface_input,SurfaceIndex SUM_Surface,SurfaceIndex TileInfo_Buffer)1154 HME_VDENC_STREAMIN( vector<uchar, CURBEDATA_SIZE> CURBEData,
1155                     SurfaceIndex HME_MV_Data_Surface_index,
1156                     SurfaceIndex HME_MV_Input_Data_Surface_index,
1157                     SurfaceIndex DISTORTION_Surface,
1158                     SurfaceIndex BRC_DISTORTION_Surface,
1159                     SurfaceIndex Pred_Surface_L0,
1160                     SurfaceIndex Pred_Surface_L1,
1161                     SurfaceIndex StreamINSurface,
1162                     SurfaceIndex StreamINSurface_input,
1163                     SurfaceIndex SUM_Surface,
1164                     SurfaceIndex TileInfo_Buffer)
1165 {
1166     HME(CURBEData ,
1167         HME_MV_Data_Surface_index,
1168         HME_MV_Input_Data_Surface_index,
1169         DISTORTION_Surface,
1170         BRC_DISTORTION_Surface,
1171         Pred_Surface_L0,
1172         Pred_Surface_L1,
1173         StreamINSurface,
1174         StreamINSurface_input,
1175         SUM_Surface,
1176         TileInfo_Buffer,
1177         0,
1178         1,
1179         0);
1180 }
1181