1 /*
2 * Copyright (c) 2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include <cm/cm.h>
24
// Byte strides used when addressing the stream-in surface and the tile-info buffer.
// LCU64_STREAMIN_SIZE covers four consecutive stream-in records.
#define STREAMIN_SIZE       64
#define LCU64_STREAMIN_SIZE (STREAMIN_SIZE * 4)
#define TILEINFO_SIZE       16

// LCU size selectors (not referenced in the visible part of this file).
#define LCUSIZE32 0
#define LCUSIZE64 1

// Shift amounts used to convert MB units / pixel units to sub-MB units.
#define SHIFT_MB_TO_SUB_MB    (2)
#define SHIFT_PIXEL_TO_SUB_MB (2)

// Number of bytes in Curbe data
#define CURBEDATA_SIZE 160

// Single-bit masks (tested against CURBE byte 24 in HME()).
#define BIT3 0x08
#define BIT4 0x10
#define BIT5 0x20

//---------------------------------------------------------------------------
// Binding table indexes
//---------------------------------------------------------------------------
#define NUM_SURFACE_IDX 14
// Row count of the UNIInput matrices declared in HME().
#define UNI 4
47
//---------------------------------------------------------------------------
// Const vector for Streamin
//---------------------------------------------------------------------------
// 4x4 layout: MBX_Indx repeats the column index 0..3 on every row,
// MBaddressIndx holds the row index 0..3 repeated across each row.
const char MBX_Indx[16] = {
    0, 1, 2, 3,
    0, 1, 2, 3,
    0, 1, 2, 3,
    0, 1, 2, 3
};
const char MBaddressIndx[16] = {
    0, 0, 0, 0,
    1, 1, 1, 1,
    2, 2, 2, 2,
    3, 3, 3, 3
};

// 2x2 variants. Only the first four entries are listed explicitly; the
// remaining twelve elements of each 16-entry array are zero-initialized.
const char MBX_Indx_hevc[16] = {
    0, 1,
    0, 1
};
const char MBaddressIndx_hevc[16] = {
    0, 0,
    1, 1
};
56
// Identifies the origin of an ROI map.
// NOTE(review): the meaning of each value is inferred from the enumerator
// names only; the enum is not referenced in the visible part of this file.
// Confirm the numeric values against the host-side definition before changing them.
enum ROISOURCE
{
    ROIMAP_FROM_UNKNOWN          = 0,
    ROIMAP_FROM_APP              = 1,
    ROIMAP_FROM_APP_DIRTYRECT    = 2,
    ROIMAP_FROM_HME_STATICREGION = 3,
    ROIMAP_FROM_PIXELVAR         = 4
};
65
66
67 _GENX_ void
HME_SET_REF(vector_ref<uchar,CURBEDATA_SIZE> CURBEData,vector_ref<short,2> input_refine,vector_ref<short,2> output_refine,vector_ref<short,2> pos)68 inline HME_SET_REF(vector_ref<uchar, CURBEDATA_SIZE> CURBEData, vector_ref<short, 2> input_refine, vector_ref<short, 2> output_refine, vector_ref<short, 2> pos)
69 {
70 vector<short, 2> maxLens;
71
72 maxLens(0) = 512;
73 maxLens(1) = CURBEData.format<short>()[13] >> 2;
74
75 vector<short, 2> in_refine = input_refine >> 2;
76 vector<ushort, 2> search = CURBEData.select<2,1>(22);
77 vector<ushort, 2> tmp = search - 16;
78 vector<ushort, 2> widths = tmp >> 1;
79
80 vector<short, 2> pictureWidths;
81 pictureWidths(0) = CURBEData(18) << 4;
82 pictureWidths(1) = CURBEData(17) << 4;
83 pictureWidths(1) = pictureWidths(1) + 16;
84
85 vector<short, 2> VME_params = in_refine;
86 VME_params = VME_params - widths;
87
88 output_refine = VME_params + pos;
89 output_refine.merge(pos + maxLens - tmp, in_refine + tmp > maxLens);
90 output_refine.merge(pos - maxLens, in_refine - tmp < (-maxLens));
91
92 output_refine.merge((pictureWidths - 1) & 0xFFFC, output_refine > (pictureWidths - 1));
93
94 vector<short, 2> tmp_search = -search;
95 output_refine.merge((5 - search) & 0xFFFC, output_refine <= tmp_search);
96 output_refine = output_refine - pos;
97 }
98
99 // Main logic is in this function. The logic is shared by both P and B versions of the kernel.
100 _GENX_ void
HME(vector<uchar,CURBEDATA_SIZE> CURBEData,SurfaceIndex HME_MV_Data_Surface_index,SurfaceIndex HME_MV_Input_Data_Surface_index,SurfaceIndex DISTORTION_Surface,SurfaceIndex BRC_DISTORTION_Surface,SurfaceIndex Pred_Surface_L0,SurfaceIndex Pred_Surface_L1,SurfaceIndex StreamINSurface,SurfaceIndex StreamINSurface_input,SurfaceIndex SUM_Surface,SurfaceIndex TileInfo_Buffer,bool b,bool vdenc_enable,bool is_hevc_vp9_vdenc)101 inline HME( vector<uchar, CURBEDATA_SIZE> CURBEData,
102 SurfaceIndex HME_MV_Data_Surface_index,
103 SurfaceIndex HME_MV_Input_Data_Surface_index,
104 SurfaceIndex DISTORTION_Surface,
105 SurfaceIndex BRC_DISTORTION_Surface,
106 SurfaceIndex Pred_Surface_L0,
107 SurfaceIndex Pred_Surface_L1,
108 SurfaceIndex StreamINSurface,
109 SurfaceIndex StreamINSurface_input,
110 SurfaceIndex SUM_Surface,
111 SurfaceIndex TileInfo_Buffer,
112 bool b,
113 bool vdenc_enable,
114 bool is_hevc_vp9_vdenc)
115 {
116 ushort mb_x_pos = get_thread_origin_x();
117 ushort mb_y_pos = get_thread_origin_y();
118
119 vector<short, 2> pos;
120 ushort x_pos = pos(0) = mb_x_pos << 4;
121 ushort y_pos = pos(1) = mb_y_pos << 4;
122
123 bool useMVPrevStep = (CURBEData(24) & BIT4) != 0;
124 bool EnableMVSum = (CURBEData(24) & BIT5) != 0;
125 bool writeDistortions = (CURBEData(24) & BIT3) != 0;
126 uchar prevMVReadPosFactor = CURBEData(60);
127
128 uchar MVShiftFactor = CURBEData(61);
129 uchar picture_heightMB = CURBEData(17) + 1;
130 uchar NumRefIdxL0 = CURBEData(52);
131 uchar NumRefIdxL1 = CURBEData(53);
132 uchar BRCMVThreshold = CURBEData(20);
133
134 vector<ushort, 2> ActualMBDim = 0;
135 U8 HMEStreaminRefCost;
136 U8 ROIMapEnable;
137
138 if (!is_hevc_vp9_vdenc && vdenc_enable)
139 {
140 ActualMBDim = CURBEData.format<ushort>().select<2,1>(60);
141 HMEStreaminRefCost = CURBEData(54);
142 ROIMapEnable = CURBEData(55) >> 5;
143 }
144
145 vector<U32, 8> BD_MV_Sum = 0;
146
147 // used for determining whether to calculate refined MV on L0 and L1.
// If the absolute value of the input (higher level in the hierarchy) refined MV data is below this threshold,
// we skip the refine calculation
150 uchar SUPER_COMBINEDIST = CURBEData(25);
151
152 // calculations on the reference region width and height
153 vector<short, 2> ref_region_size = CURBEData.select<2,1>(22) - 16;
154 vector<ushort, 2> search_coordinates = ref_region_size >> 1;
155 vector<short, 2> rxy = search_coordinates >> 2;
156
157 // data structures used for VME commands:
158 matrix<uchar, UNI, 32> UNIInput_tmp;
159 matrix<uchar, UNI, 32> UNIInput;
160 matrix<uchar, 4, 32> IMEInput;
161 matrix<uchar, 4, 32> FBRInput;
162 matrix<uchar, 9, 32> IME_output_MV_refine_L0;
163 matrix<uchar, 7, 32> VME_ME_REFINE_L0;
164 matrix<uchar, 9, 32> IME_output_MV_refine_L1;
165 matrix<uchar, 7, 32> VME_ME_REFINE_L1;
166 matrix<uchar, 7, 32> IME_output_MV_L0;
167 matrix<uchar, 7, 32> VME_ME_L0;
168 matrix<uchar, 7, 32> IME_output_MV_L1;
169 matrix<uchar, 7, 32> VME_ME_L1;
170 vector<short, 2> ref0;
171 vector<ushort, 16> costCenter = 0;
172
173 uchar FBRMbMode, FBRSubMbShape, FBRSubPredMode;
174
175 // VME search control
176 VMESearchCtrl searchControl = VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START;
177
178 // initialize universal data
179 UNIInput_tmp.row(0) = 0;
180 UNIInput_tmp.format<ushort>()[0, 4] = x_pos;
181 UNIInput_tmp.format<ushort>()[0, 5] = y_pos;
182 UNIInput_tmp.row(0).format<uint>().select<2, 2>(3) = CURBEData.format<uint>().select<2, 2>(3);
183 UNIInput_tmp(0, 20) = 0;
184 UNIInput_tmp.row(1) = 0;
185 UNIInput_tmp.row(1).format<uint>().select<2, 1>(0) = CURBEData.format<uint>().select<2, 1>(0);
186 UNIInput_tmp.row(1).format<uint>().select<1, 1>(2) = CURBEData.format<uint>().select<1, 1>(2);
187 UNIInput_tmp.select<1,1,2,1>(1, 10) = rxy(0) + (rxy(1) << 4);
188 UNIInput_tmp.row(1).format<uint>().select<1,1>(7) = CURBEData.format<uint>().select<1, 1>(7);
189 UNIInput_tmp.row(2).format<uint>() = CURBEData.format<uint>().select<8, 1>(8);
190
191 // initialize IMEInput
192 IMEInput.row(0).format<uint>() = CURBEData.format<uint>().select<8, 1>(16);
193 IMEInput.row(1).format<uint>().select<6, 1>(0) = CURBEData.format<uint>().select<6, 1>(24);
194 IMEInput.row(1).format<uint>().select<2, 1>(6) = 0;
195
196 uchar current_iteration = 0;
197 uint picture_offset = 0;
198
199 // indicate whether we need to check L0 and L1 reference frames respectively
200 vector<uchar, 1> checkL0 = 0;
201 vector<uchar, 1> checkL1 = 0;
202
203 // field_support
204 char refFieldPolarityL0 = CURBEData(56);
205 char refFieldPolarityL1 = CURBEData(57);
206
207 uchar isField = (CURBEData(12) >> 7);
208
209 vector<char, 8> vec1;
210 vec1 = 0xff;
211
212 vector<char, 8>ref0Polarity;
213 ref0Polarity = 0;
214 ref0Polarity.merge(vec1,refFieldPolarityL0);
215
216 vector<char, 8>ref1Polarity;
217 ref1Polarity = 0;
218
219 if (b)
220 {
221 ref1Polarity.merge(vec1,refFieldPolarityL1);
222 }
223
224 /*some local declarations that were moved to the top*/
225 matrix<uchar, 4, 32> MV;
226
227 matrix<ushort, 4, 4> best_DistL0;
228 matrix<uint, 4, 4> best_refId;
229 matrix<uint, 4, 4> best_MV;
230 matrix<uchar, 4, 4> ROIMask(0);
231
232 //save values for streamIn
233 matrix<uint, 2, 8> streamIn_MV = 0;
234 uchar PredictorSelect = 0;
235 ushort PredictorRefIdx = 0;
236
237 if (!is_hevc_vp9_vdenc && vdenc_enable)
238 {
239 char Streamin_MaxRef;
240
241 best_DistL0 = 0xffff;
242 best_refId = 0;
243 Streamin_MaxRef = (NumRefIdxL0 > 2 ? 2 : NumRefIdxL0);
244 NumRefIdxL0 = Streamin_MaxRef;
245 }
246
247 if (is_hevc_vp9_vdenc && vdenc_enable)
248 {
249 NumRefIdxL0 = NumRefIdxL0 > 2 ? 2 : NumRefIdxL0;
250 ActualMBDim = CURBEData.format<ushort>().select<2, 1>(60);
251 }
252
253 while (current_iteration <= NumRefIdxL0)
254 {
255 // initialize ME refinement cost to maximum
256 VME_ME_REFINE_L0.row(0).format<ushort>()[4] = 0xFFFF;
257 VME_ME_REFINE_L1.row(0).format<ushort>()[4] = 0xFFFF;
258
259 UNIInput_tmp.row(1).format<char>().select<1,1>(5).merge(ref0Polarity(current_iteration), 0, isField);
260
// SuperHME or UltraHME
262 if ( useMVPrevStep )
263 {
264 vector<uchar, 8> coarse_refine;
265 vector_ref<short, 4> coarse_mv_refine = coarse_refine.format<short>();
266 vector<short, 2> out_refine;
267
268 // ===========================================
269 // Get higher level HME MV refinement data
270 // ===========================================
271 /*
272 * The position of the MV of the last MB (in the vertical direction) should be
273 * at read_y_pos = (picture_heightMB << prevMVReadPosFactor)
274 * which means our input surface must have at least (picture_heightMB << prevMVReadPosFactor)
* motion vectors in each column. Each thread writes 4 MVs in a column, so we have to round this number
* up to the next multiple of 4
277 */
278 uint subMB_offset_dscaled_pic = (( (picture_heightMB << prevMVReadPosFactor) + 3 ) & ~0x3);
279 subMB_offset_dscaled_pic *= current_iteration;
280
281 /*
282 * Read the MV of the current MB from the previous HME level.
283 * The desired MV located in the higher level in corresponding SubMB which has (pos << prevMVReadPosFactor)
* We look for the index in the array of subMB, therefore we need to map between our MB index position
285 * (In the current resolution) to the SubMB index in the previous level.
286 *
287 * The index of the x pos is the horizontal index of the current MB shifted by the
288 * read position factor given by the user. we add <<3 because each MV is 8 bytes
289 * The index for the y pos is the vertical index of the current MB shifted by the
290 * read position factor given by the user. we add subMB_offset_dscaled_pic so we get
291 * the MV from the correct reference (the input surface arranged as a list).
292 */
293 uint read_x_pos = mb_x_pos << prevMVReadPosFactor;
294 uint read_y_pos = mb_y_pos << prevMVReadPosFactor;
295 read(HME_MV_Input_Data_Surface_index, read_x_pos << 3, read_y_pos + subMB_offset_dscaled_pic, coarse_refine);
296
297 // Compare input refine MV values to threshold
298 vector<ushort, 4> v_mask = (cm_abs<short>(coarse_mv_refine) >> 2) < SUPER_COMBINEDIST;
299
300 checkL0 = 1;
301 checkL0.merge(0, v_mask(0) & v_mask(1));
302
303 if (b)
304 {
305 checkL1 = 1;
306 checkL1.merge(0, v_mask(2) & v_mask(3));
307 }
308
309 if (checkL0(0))
310 {
311 UNIInput.row(0) = UNIInput_tmp.row(0);
312 UNIInput.row(1) = UNIInput_tmp.row(1);
313 UNIInput.row(2) = UNIInput_tmp.row(2);
314 UNIInput.row(3) = 0;
315 UNIInput.row(1).select<4, 1>(24) = current_iteration;
316
317 HME_SET_REF( CURBEData, coarse_mv_refine.select<2, 1>(0), out_refine, pos );
318 UNIInput.format<short, 4, 16>().select<1, 1, 2, 1>(0, 0) = out_refine;
319
320 ref0 = out_refine;
321 run_vme_ime(UNIInput,
322 IMEInput.select<2, 1, 32, 1>(0,0),
323 VME_STREAM_OUT,
324 searchControl,
325 Pred_Surface_L0,
326 ref0,
327 0,
328 costCenter,
329 IME_output_MV_refine_L0.format<uchar, 9, 32>());
330
331 UNIInput.format<int, 4, 8>()[2][5] = IME_output_MV_refine_L0.format<int, 9, 8>()[0][6] & 0xFFFF00;
332 UNIInput[2][20] = IME_output_MV_refine_L0[0][0] & 0x03;
333 FBRInput.row(0) = IME_output_MV_refine_L0.row(1);
334 FBRInput.row(1) = IME_output_MV_refine_L0.row(2);
335 FBRInput.row(2) = IME_output_MV_refine_L0.row(3);
336 FBRInput.row(3) = IME_output_MV_refine_L0.row(4);
337
338 FBRMbMode = VME_GET_UNIInput_FBRMbModeInput(UNIInput);
339 FBRSubMbShape = VME_GET_UNIInput_FBRSubMBShapeInput(UNIInput);
340 FBRSubPredMode = VME_GET_UNIInput_FBRSubPredModeInput(UNIInput);
341
342 run_vme_fbr(UNIInput,
343 FBRInput,
344 Pred_Surface_L0,
345 FBRMbMode,
346 FBRSubMbShape,
347 FBRSubPredMode,
348 VME_ME_REFINE_L0);
349 }
350
351 if (b)
352 {
353 if((current_iteration <= NumRefIdxL1) && checkL1(0))
354 {
355 UNIInput_tmp.row(1).format<char>().select<1, 1>(5).merge(ref1Polarity(current_iteration), 0, isField);
356
357 UNIInput.row(0) = UNIInput_tmp.row(0);
358 UNIInput.row(1) = UNIInput_tmp.row(1);
359 UNIInput.row(2) = UNIInput_tmp.row(2);
360 UNIInput.row(3) = 0;
361 UNIInput.row(1).select<4,1>(24) = current_iteration;
362
363 HME_SET_REF(CURBEData, coarse_mv_refine.select<2, 1>(2), out_refine, pos);
364
365 UNIInput.format<short, 4, 16>().select<1, 1, 2, 1>(0, 0) = out_refine;
366 ref0 = out_refine;
367
368 run_vme_ime(UNIInput,
369 IMEInput.select<2, 1, 32, 1>(0, 0),
370 VME_STREAM_OUT,
371 searchControl,
372 Pred_Surface_L1,
373 ref0,
374 0,
375 costCenter,
376 IME_output_MV_refine_L1.format<uchar, 9, 32>());
377
378 UNIInput.format<int, 4, 8>()[2][5] = IME_output_MV_refine_L1.format<int, 9, 8>()[0][6] & 0xFFFF00;
379 UNIInput[2][20] = IME_output_MV_refine_L1[0][0] & 0x03;
380 FBRInput.row(0) = IME_output_MV_refine_L1.row(1);
381 FBRInput.row(1) = IME_output_MV_refine_L1.row(2);
382 FBRInput.row(2) = IME_output_MV_refine_L1.row(3);
383 FBRInput.row(3) = IME_output_MV_refine_L1.row(4);
384
385 FBRMbMode = VME_GET_UNIInput_FBRMbModeInput(UNIInput);
386 FBRSubMbShape = VME_GET_UNIInput_FBRSubMBShapeInput(UNIInput);
387 FBRSubPredMode = VME_GET_UNIInput_FBRSubPredModeInput(UNIInput);
388
389 run_vme_fbr(UNIInput,
390 FBRInput,
391 Pred_Surface_L1,
392 FBRMbMode,
393 FBRSubMbShape,
394 FBRSubPredMode,
395 VME_ME_REFINE_L1);
396
397 VME_ME_REFINE_L0.format<int, 7, 8>().select<4,1,4,2>(1,1) = VME_ME_REFINE_L1.format<int, 7, 8>().select<4, 1, 4, 2>(1, 0);
398 }
399 }
400
401 VME_ME_REFINE_L0.format<short, 7, 16>().select<4, 1, 16, 1>(1, 0) <<= MVShiftFactor;
402 }
403
404 UNIInput.row(0) = UNIInput_tmp.row(0);
405 UNIInput.row(1) = UNIInput_tmp.row(1);
406 UNIInput.row(2) = UNIInput_tmp.row(2);
407 UNIInput.row(3) = 0;
408
409 UNIInput.row(1).select<4, 1>(24) = current_iteration;
410 UNIInput.format<short, 4, 16>().select<1, 1, 2, 1>(0, 0) = -search_coordinates;
411 ref0 = -search_coordinates;
412
413 if (checkL0(0))
414 {
415 IMEInput.row(2) = IME_output_MV_refine_L0.row(7);
416 IMEInput.row(3) = IME_output_MV_refine_L0.row(8);
417 run_vme_ime(UNIInput,
418 IMEInput,
419 VME_STREAM_IN,
420 searchControl,
421 Pred_Surface_L0,
422 ref0,
423 NULL,
424 costCenter,
425 IME_output_MV_L0);
426 }
427 else
428 {
429 run_vme_ime( UNIInput,
430 IMEInput.select<2, 1, 32, 1>(0,0),
431 VME_STREAM_DISABLE,
432 searchControl,
433 Pred_Surface_L0,
434 ref0,
435 NULL,
436 costCenter,
437 IME_output_MV_L0.format<uchar, 7, 32>());
438 }
439
440 UNIInput.format<int, 4, 8>()[2][5] = IME_output_MV_L0.format<int, 7, 8>()[0][6] & 0xFFFF00;
441 UNIInput[2][20] = IME_output_MV_L0[0][0] & 0x03;
442 FBRInput.row(0) = IME_output_MV_L0.row(1);
443 FBRInput.row(1) = IME_output_MV_L0.row(2);
444 FBRInput.row(2) = IME_output_MV_L0.row(3);
445 FBRInput.row(3) = IME_output_MV_L0.row(4);
446
447 FBRMbMode = VME_GET_UNIInput_FBRMbModeInput(UNIInput);
448 FBRSubMbShape = VME_GET_UNIInput_FBRSubMBShapeInput(UNIInput);
449 FBRSubPredMode = VME_GET_UNIInput_FBRSubPredModeInput(UNIInput);
450
451 run_vme_fbr(UNIInput,
452 FBRInput,
453 Pred_Surface_L0,
454 FBRMbMode,
455 FBRSubMbShape,
456 FBRSubPredMode,
457 VME_ME_L0);
458
459 if (b)
460 {
461 if(current_iteration <= NumRefIdxL1)
462 {
463 UNIInput_tmp.row(1).format<char>().select<1, 1>(5).merge(ref1Polarity(current_iteration), 0, isField);
464
465 UNIInput.row(0) = UNIInput_tmp.row(0);
466 UNIInput.row(1) = UNIInput_tmp.row(1);
467 UNIInput.row(2) = UNIInput_tmp.row(2);
468 UNIInput.row(3) = 0;
469
470 UNIInput.row(1).select<4, 1>(24) = current_iteration;
471 UNIInput.format<short, 4, 16>().select<1, 1, 2, 1>(0,0) = -search_coordinates;
472 ref0 = -search_coordinates;
473
474 if (checkL1(0))
475 {
476 IMEInput.row(2) = IME_output_MV_refine_L1.row(7);
477 IMEInput.row(3) = IME_output_MV_refine_L1.row(8);
478
479 run_vme_ime(UNIInput,
480 IMEInput,
481 VME_STREAM_IN,
482 searchControl,
483 Pred_Surface_L1,
484 ref0,
485 NULL,
486 costCenter,
487 IME_output_MV_L1);
488 }
489 else
490 {
491 run_vme_ime(UNIInput,
492 IMEInput.select<2, 1, 32, 1>(0, 0),
493 VME_STREAM_DISABLE,
494 searchControl,
495 Pred_Surface_L1,
496 ref0,
497 NULL,
498 costCenter,
499 IME_output_MV_L1.format<uchar, 7, 32>());
500 }
501
502 UNIInput.format<int, 4, 8>()[2][5] = IME_output_MV_L1.format<int, 7, 8>()[0][6] & 0xFFFF00;
503 UNIInput[2][20] = IME_output_MV_L1[0][0] & 0x03;
504 FBRInput.row(0) = IME_output_MV_L1.row(1);
505 FBRInput.row(1) = IME_output_MV_L1.row(2);
506 FBRInput.row(2) = IME_output_MV_L1.row(3);
507 FBRInput.row(3) = IME_output_MV_L1.row(4);
508
509 FBRMbMode = VME_GET_UNIInput_FBRMbModeInput(UNIInput);
510 FBRSubMbShape = VME_GET_UNIInput_FBRSubMBShapeInput(UNIInput);
511 FBRSubPredMode = VME_GET_UNIInput_FBRSubPredModeInput(UNIInput);
512
513 run_vme_fbr(UNIInput,
514 FBRInput,
515 Pred_Surface_L1,
516 FBRMbMode,
517 FBRSubMbShape,
518 FBRSubPredMode,
519 VME_ME_L1);
520
521 VME_ME_L0.format<int, 7, 8>().select<4, 1, 4, 2>(1, 1) = VME_ME_L1.format<int, 7, 8>().select<4, 1, 4, 2>(1, 0);
522 }
523 }
524
525 VME_ME_L0.format<short, 7, 16>().select<4, 1, 16, 1>(1, 0) <<= MVShiftFactor;
526
527 matrix<ushort, 4, 8> vmeMask = VME_ME_REFINE_L0.row(0).format<ushort>()[4] < VME_ME_L0.row(0).format<ushort>()[4];
528
529 MV.format<short, 4, 16>().select<4, 1, 8, 1>(0, 0).merge(
530 VME_ME_REFINE_L0.format<short, 7, 16>().select<2, 2, 16, 1>(1, 0),
531 VME_ME_L0.format<short, 7, 16>().select<2, 2, 16, 1>(1, 0), vmeMask);
532
533 MV.format<short, 4, 16>().select<4, 1, 8, 1>(0, 8).merge(
534 VME_ME_REFINE_L0.format<short, 7, 16>().select<2, 2, 16, 1>(2, 0),
535 VME_ME_L0.format<short, 7, 16>().select<2, 2, 16, 1>(2, 0), vmeMask);
536
537 if (b)
538 {
539 if(current_iteration <= NumRefIdxL1)
540 {
541 matrix<ushort, 4, 2> vmeMask1 = VME_ME_REFINE_L1.row(0).format<ushort>()[4] < VME_ME_L1.row(0).format<ushort>()[4];
542
543 MV.format<int, 4, 8>().select<4, 1, 2, 2>(0, 1).merge(
544 VME_ME_REFINE_L0.format<int, 7, 8>().select<2, 2, 4, 2>(1, 1),
545 VME_ME_L0.format<int, 7, 8>().select<2, 2, 4, 2>(1, 1), vmeMask1);
546
547 MV.format<int, 4, 8>().select<4, 1, 2, 2>(0, 5).merge(
548 VME_ME_REFINE_L0.format<int, 7, 8>().select<2, 2, 4, 2>(2, 1),
549 VME_ME_L0.format<int, 7, 8>().select<2, 2, 4, 2>(2, 1), vmeMask1);
550 }
551 }
552
553 /*
554 * Each MV (there are 1 per SubMB, 16 total) will be used by a corresponding MB in the next level.
555 * Each MB in the next level would use the MV of the corresponding Sub MB in the current level
556 */
557 write( HME_MV_Data_Surface_index, x_pos << 1, (y_pos >> SHIFT_PIXEL_TO_SUB_MB) + picture_offset, MV);
558
559 if (current_iteration==0)
560 {
561 matrix_ref<short, 4, 16>FinalMV = MV.format<short, 4, 16>();
562 matrix<U16, 2, 2> mx = cm_abs<U16, short>(FinalMV.select<2, 2, 2, 8>(0, 0));
563 matrix<U16, 2, 2> my = cm_abs<U16, short>(FinalMV.select<2, 2, 2, 8>(0, 1));
564
565 matrix<U16, 2, 2> mvsum = 0;
566 mvsum.merge(1, 0, (mx + my) < BRCMVThreshold);
567 BD_MV_Sum(0) = cm_sum<U32, U16, 4>(mvsum.format<U16>());
568 }
569
570 if (is_hevc_vp9_vdenc && vdenc_enable)
571 {
572 streamIn_MV.select<2, 1, 2, 4>(0, current_iteration) = MV.format<uint, 4, 8>().select<2, 2, 2, 4>(0, 0);
573 PredictorSelect = PredictorSelect | (1 << (2 * current_iteration));
574 PredictorRefIdx = PredictorRefIdx | (current_iteration << (current_iteration << 2));
575
576 if (b & (current_iteration == 0))
577 {
578 streamIn_MV.select<2, 1, 2, 4>(0, NumRefIdxL0 + 1) = MV.format<uint, 4, 8>().select<2, 2, 2, 4>(0, 1);
579
580 PredictorSelect = PredictorSelect | (2 << (2 * (NumRefIdxL0 + 1)));
581 PredictorRefIdx = PredictorRefIdx | (current_iteration << ((NumRefIdxL0 + 1) << 2));
582 }
583 }
584
585 if (b)
586 {
587 if(current_iteration <= NumRefIdxL1)
588 {
589 MV.format<int, 4, 8 >().select< 4, 1, 4, 2>(0,0) = MV.format<int, 4, 8 >().select< 4, 1, 4, 2>(0,1);
590 MV.format<int, 4, 8 >().select< 4, 1, 4, 2>(0,1) = 0;
591 write(HME_MV_Data_Surface_index, x_pos << 1, (y_pos >> SHIFT_PIXEL_TO_SUB_MB) + picture_offset + (picture_heightMB << 5) , MV);
592 }
593 }
594
595 if (writeDistortions)
596 {
597 matrix<uchar, 4, 8> DistL0;
598 matrix<uchar, 4, 8> DistL1;
599
600 matrix<ushort, 4, 8> distMask = VME_ME_REFINE_L0.row(0).format<ushort>()[4] < VME_ME_L0.row(0).format<ushort>()[4];
601 DistL0.merge(VME_ME_REFINE_L0.row(5).format<uchar, 4, 8>(), VME_ME_L0.row(5).format<uchar, 4, 8>(), distMask);
602
603 if (b)
604 {
605 if(current_iteration <= NumRefIdxL1)
606 {
607 distMask = VME_ME_REFINE_L1.row(0).format<ushort>()[4] < VME_ME_L1.row(0).format<ushort>()[4];
608 DistL1.merge(VME_ME_REFINE_L1.row(5).format<uchar, 4, 8>(), VME_ME_L1.row(5).format<uchar, 4, 8>(), distMask);
609 }
610 }
611
612 if (current_iteration == 0)
613 {
614 matrix<uchar, 4, 8> BRCdist = DistL0;
615
616 if (b)
617 {
618 if (current_iteration <= NumRefIdxL1)
619 {
620 BRCdist.format<ushort>() = cm_min<ushort>(DistL0.format<ushort>(), DistL1.format<ushort>());
621 }
622 }
623
624 if (!vdenc_enable)
625 {
626 write(BRC_DISTORTION_Surface, x_pos >> 1, y_pos >> 2, BRCdist);
627 }
628
629 BD_MV_Sum(1) = cm_sum<U32, U16, 16>(BRCdist.format<U16>());
630
631 vector<uint, 8> local_offset = 7;
632 vector<uint, 8> ret;
633
634 local_offset[0] = 0;
635 local_offset[1] = 1;
636
637 if (EnableMVSum)
638 {
639 write<uint, 8>(SUM_Surface, ATOMIC_ADD, 0, local_offset, BD_MV_Sum, ret);
640 }
641 }
642
643 matrix<uchar, 4, 8> dist;
644 dist = DistL0;
645 dist.format<ushort, 4, 4>().select<4, 1, 1, 1>(0, 1) = dist.format<ushort, 4, 4>().select<4, 1, 1, 1>(0, 0);
646 dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 2) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 0);
647 dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 0) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(0, 0);
648 dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(0, 2) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 2);
649
650 write(DISTORTION_Surface, x_pos >> 1, (y_pos >> 2) + picture_offset, dist);
651
652 if (!is_hevc_vp9_vdenc && vdenc_enable)
653 {
654 vector<uchar,1>factor = 0;
655 factor.merge(HMEStreaminRefCost, current_iteration != 0);
656
657 matrix<ushort, 4, 4>sumDist = dist.format<ushort>() + factor(0);
658 matrix<uint, 4, 4>curr_refID = current_iteration;
659 matrix<uchar, 4, 4>dist_mask = (best_DistL0 >= sumDist);
660
661 best_DistL0.merge(sumDist,dist_mask);
662 best_refId.merge(curr_refID,dist_mask);
663 best_MV.merge(MV.format<uint, 4, 8>().select<4, 1, 4, 2>(0, 0), dist_mask);
664 }
665
666 if (b)
667 {
668 if (current_iteration <= NumRefIdxL1)
669 {
670 dist = DistL1;
671 dist.format<ushort, 4, 4>().select<4, 1, 1, 1>(0, 1) = dist.format<ushort, 4, 4>().select<4, 1, 1, 1>(0, 0);
672 dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 2) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 0);
673 dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 0) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(0, 0);
674 dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(0, 2) = dist.format<ushort, 4, 4>().select<2, 2, 2, 1>(1, 2);
675 write(DISTORTION_Surface, x_pos >> 1, (y_pos >> 2) + picture_offset + (picture_heightMB << 5), dist);
676 }
677 }
678 }
679
680 /*
681 * All the MV of the references are arranged in the input surface as a list.
682 * Assume that each reference has X MB in a column so the MV of the top left block of the
683 * first reference is located at (0,0) and the top left MV of the next reference is located
684 * at ( X << SHIFT_MB_TO_SUB_MB, 0)
685 */
686 picture_offset += (picture_heightMB << SHIFT_MB_TO_SUB_MB);
687
688 if (0 == current_iteration && (!is_hevc_vp9_vdenc && vdenc_enable) && ROIMapEnable == 3)
689 {
690 ROIMask = !((best_DistL0 < 10) & (best_MV.format<ushort,4, 8>().select<4, 1, 4, 2>(0, 0) == 0) & (best_MV.format<ushort, 4, 8>().select<4, 1, 4, 2>(0, 1) == 0));
691 }
692
693 current_iteration++;
694 }
695
696 if (is_hevc_vp9_vdenc && vdenc_enable)
697 {
698 vector<ushort, 2> Sizein32x32RoundDown = (ActualMBDim) >> 5;
699 vector<ushort, 2> Sizein32x32RoundUp = (ActualMBDim + 31) >> 5;
700 vector<ushort, 2> mask = ((CURBEData[24] & 0x04) != 0);
701 Sizein32x32RoundUp.merge(((ActualMBDim + 63) >> 6) << 1, mask);
702
703 uint mb_xpos_32 = mb_x_pos << 1;
704 uint mb_ypos_32 = mb_y_pos << 1;
705
706 bool tiling = (CURBEData(24) & 0x40) != 0;
707
708 vector<uint, 32> Streamin_read_row1 = 0;
709 vector<uint, 32> Streamin_read_row2 = 0;
710
711 vector_ref<uint, 16> Streamin_read_row1_blk1 = Streamin_read_row1.select<16, 1>(0);
712 vector_ref<uint, 16> Streamin_read_row2_blk1 = Streamin_read_row2.select<16, 1>(0);
713
714 vector<uint, 4> streamin_offset;
715 vector<ushort, 8> tileinfo;
716
717 if (tiling)
718 {
719 vector<uint, 1> tile_info_offset;
720 vector<uint, 1> Offset_tile_streamin;
721 vector<ushort, 2> ActualMBDimLCU = Sizein32x32RoundUp;
722
723 if ((CURBEData[24] & 0x04))
724 {
725 ActualMBDimLCU = ActualMBDimLCU >> 1;
726
727 tile_info_offset = (mb_x_pos + (mb_y_pos * ActualMBDimLCU(0))) * TILEINFO_SIZE;
728 read(TileInfo_Buffer, tile_info_offset(0), tileinfo);
729
730 Offset_tile_streamin(0) = tileinfo(4) * tileinfo(5) + (tileinfo(4) * (tileinfo(7) - tileinfo(5)))
731 + (tileinfo(5) * (ActualMBDimLCU(0) - tileinfo(4)));
732
733 Offset_tile_streamin(0) = Offset_tile_streamin(0) * LCU64_STREAMIN_SIZE;
734 streamin_offset(0) = (mb_y_pos - tileinfo(5)) * (tileinfo(6) - tileinfo(4)) + mb_x_pos - tileinfo(4);
735
736 streamin_offset(0) = Offset_tile_streamin(0) + (streamin_offset(0) * LCU64_STREAMIN_SIZE);
737 streamin_offset(1) = streamin_offset(0) + STREAMIN_SIZE;
738 streamin_offset(2) = streamin_offset(0) + (2 * STREAMIN_SIZE);
739 streamin_offset(3) = streamin_offset(0) + (3 * STREAMIN_SIZE);
740 }
741 else
742 {
743 #pragma unroll
744 for (ushort i = 0; i < 4; i++)
745 {
746 ushort offset_x = i & 0x01;
747 ushort offset_y = (i & 0x02) >> 1;
748 tile_info_offset = ((mb_xpos_32 + offset_x) + ((mb_ypos_32 + offset_y) * Sizein32x32RoundUp(0))) * TILEINFO_SIZE;
749
750 read(TileInfo_Buffer, tile_info_offset(0), tileinfo);
751
752 Offset_tile_streamin(0) = tileinfo(4) * tileinfo(5) + (tileinfo(4) * (tileinfo(7) - tileinfo(5)))
753 + (tileinfo(5) * (Sizein32x32RoundUp(0) - tileinfo(4)));
754
755 Offset_tile_streamin(0) = Offset_tile_streamin(0) * STREAMIN_SIZE;
756
757 streamin_offset(i) = ((mb_ypos_32 + offset_y) - tileinfo(5)) * (tileinfo(6) - tileinfo(4)) + (mb_xpos_32 + offset_x) - tileinfo(4);
758 streamin_offset(i) = Offset_tile_streamin(0) + (streamin_offset(i) * STREAMIN_SIZE);
759 }
760 }
761 }
762 else
763 {
764 streamin_offset(0) = (mb_xpos_32 + (mb_ypos_32 * Sizein32x32RoundUp(0))) * STREAMIN_SIZE;
765 streamin_offset(2) = streamin_offset(0) + (Sizein32x32RoundUp(0) * STREAMIN_SIZE);
766
767 streamin_offset.select<1, 1>(0).merge((((mb_x_pos << 2) + ((mb_y_pos << 1) * Sizein32x32RoundUp(0))) * STREAMIN_SIZE), ((CURBEData[24] & 0x04) != 0));
768 streamin_offset.select<1, 1>(2).merge((streamin_offset.select<1, 1>(0) + (2 * STREAMIN_SIZE)), ((CURBEData[24] & 0x04) != 0));
769
770 streamin_offset(1) = streamin_offset(0) + STREAMIN_SIZE;
771 streamin_offset(3) = streamin_offset(2) + STREAMIN_SIZE;
772 }
773
774 if (!(CURBEData[24] & 0x02))
775 {
776 Streamin_read_row1.select<2, 16>(0) = CURBEData.format<uint>()(31);
777 Streamin_read_row2.select<2, 16>(0) = CURBEData.format<uint>()(31);
778 Streamin_read_row1.select<4, 1>(1) = CURBEData.format<uint>().select<4, 1>(32);
779 Streamin_read_row1.select<4, 1>(17) = CURBEData.format<uint>().select<4, 1>(32);
780 Streamin_read_row2.select<4, 1>(1) = CURBEData.format<uint>().select<4, 1>(32);
781 Streamin_read_row2.select<4, 1>(17) = CURBEData.format<uint>().select<4, 1>(32);
782 Streamin_read_row1.select<2, 16>(6) = CURBEData.format<uint>()(36);
783 Streamin_read_row2.select<2, 16>(6) = CURBEData.format<uint>()(36);
784 Streamin_read_row1.select<2, 16>(7) = CURBEData.format<uint>()(37);
785 Streamin_read_row2.select<2, 16>(7) = CURBEData.format<uint>()(37);
786 Streamin_read_row1.select<2, 16>(14) = CURBEData.format<uint>()(38);
787 Streamin_read_row2.select<2, 16>(14) = CURBEData.format<uint>()(38);
788
789 if (((mb_xpos_32 + 1) < Sizein32x32RoundDown(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundDown(1)))
790 {
791 Streamin_read_row1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(0, 0);
792 Streamin_read_row1.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(0, 4);
793 Streamin_read_row2.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(1, 0);
794 Streamin_read_row2.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(1, 4);
795
796 Streamin_read_row1.format<uchar>().select<2, 64>(31) = PredictorSelect;
797 Streamin_read_row2.format<uchar>().select<2, 64>(31) = PredictorSelect;
798
799 Streamin_read_row1.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
800 Streamin_read_row2.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
801 }
802 else if (((mb_xpos_32 + 1) < Sizein32x32RoundDown(0)) & ((mb_ypos_32) < Sizein32x32RoundDown(1)))
803 {
804 Streamin_read_row1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(0, 0);
805 Streamin_read_row1.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(0, 4);
806
807 Streamin_read_row1.format<uchar>().select<2, 64>(31) = PredictorSelect;
808 Streamin_read_row1.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
809 }
810 else if (((mb_xpos_32) < Sizein32x32RoundDown(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundDown(1)))
811 {
812 Streamin_read_row1_blk1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(0, 0);
813 Streamin_read_row2_blk1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(1, 0);
814
815 Streamin_read_row1_blk1.format<uchar>()(31) = PredictorSelect;
816 Streamin_read_row2_blk1.format<uchar>()(31) = PredictorSelect;
817
818 Streamin_read_row1_blk1.format<ushort>()(24) = PredictorRefIdx;
819 Streamin_read_row2_blk1.format<ushort>()(24) = PredictorRefIdx;
820
821 }
822 else if (mb_xpos_32 < Sizein32x32RoundDown(0) && mb_ypos_32 < Sizein32x32RoundDown(1))
823 {
824 Streamin_read_row1_blk1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(0, 0);
825 Streamin_read_row1_blk1.format<uchar>()(31) = PredictorSelect;
826 Streamin_read_row1_blk1.format<ushort>()(24) = PredictorRefIdx;
827 }
828
829 // Note that in the if else blocks Sizein32x32RoundUp is being used. This is because when stream-in input is disabled, kernel has to populate other stream in fields(Num merge candidates etc) from CURBE
830 // For HME predictors we need to round down the frame size. For other fields we need to round up
831 if (((mb_xpos_32 + 1) < Sizein32x32RoundUp(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundUp(1)))
832 {
833 write(StreamINSurface, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
834 write(StreamINSurface, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
835 write(StreamINSurface, streamin_offset(2), Streamin_read_row2.select<16, 1>(0));
836 write(StreamINSurface, streamin_offset(3), Streamin_read_row2.select<16, 1>(16));
837 }
838 else if (((mb_xpos_32 + 1) < Sizein32x32RoundUp(0)) & ((mb_ypos_32) < Sizein32x32RoundUp(1)))
839 {
840 write(StreamINSurface, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
841 write(StreamINSurface, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
842 }
843 else if (((mb_xpos_32) < Sizein32x32RoundUp(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundUp(1)))
844 {
845 write(StreamINSurface, streamin_offset(0), Streamin_read_row1_blk1);
846 write(StreamINSurface, streamin_offset(2), Streamin_read_row2_blk1);
847 }
848 else
849 {
850 write(StreamINSurface, streamin_offset(0), Streamin_read_row1_blk1);
851 }
852 }
853 else
854 {
855 if (((mb_xpos_32 + 1) < Sizein32x32RoundDown(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundDown(1)))
856 {
857 read(StreamINSurface_input, streamin_offset(0), Streamin_read_row1.select<16,1>(0));
858 read(StreamINSurface_input, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
859 read(StreamINSurface_input, streamin_offset(2), Streamin_read_row2.select<16, 1>(0));
860 read(StreamINSurface_input, streamin_offset(3), Streamin_read_row2.select<16, 1>(16));
861
862 Streamin_read_row1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(0, 0);
863 Streamin_read_row1.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(0, 4);
864 Streamin_read_row2.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(1, 0);
865 Streamin_read_row2.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(1, 4);
866
867
868 Streamin_read_row1.format<uchar>().select<2, 64>(31) = PredictorSelect;
869 Streamin_read_row2.format<uchar>().select<2, 64>(31) = PredictorSelect;
870
871 Streamin_read_row1.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
872 Streamin_read_row2.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
873
874
875 write(StreamINSurface, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
876 write(StreamINSurface, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
877 write(StreamINSurface, streamin_offset(2), Streamin_read_row2.select<16, 1>(0));
878 write(StreamINSurface, streamin_offset(3), Streamin_read_row2.select<16, 1>(16));
879 }
880 else if (((mb_xpos_32 + 1) < Sizein32x32RoundDown(0)) & ((mb_ypos_32) < Sizein32x32RoundDown(1)))
881 {
882 read(StreamINSurface_input, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
883 read(StreamINSurface_input, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
884
885 Streamin_read_row1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(0, 0);
886 Streamin_read_row1.select<4, 1>(24) = streamIn_MV.select<1, 1, 4, 1>(0, 4);
887
888 Streamin_read_row1.format<uchar>().select<2, 64>(31) = PredictorSelect;
889 Streamin_read_row1.format<ushort>().select<2, 32>(24) = PredictorRefIdx;
890
891 write(StreamINSurface, streamin_offset(0), Streamin_read_row1.select<16, 1>(0));
892 write(StreamINSurface, streamin_offset(1), Streamin_read_row1.select<16, 1>(16));
893 }
894 else if (((mb_xpos_32) < Sizein32x32RoundDown(0)) & ((mb_ypos_32 + 1) < Sizein32x32RoundDown(1)))
895 {
896 read(StreamINSurface_input, streamin_offset(0), Streamin_read_row1_blk1);
897 read(StreamINSurface_input, streamin_offset(2), Streamin_read_row2_blk1);
898
899 Streamin_read_row1_blk1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(0, 0);
900 Streamin_read_row2_blk1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(1, 0);
901
902 Streamin_read_row1_blk1.format<uchar>()(31) = PredictorSelect;
903 Streamin_read_row2_blk1.format<uchar>()(31) = PredictorSelect;
904
905 Streamin_read_row1_blk1.format<ushort>()(24) = PredictorRefIdx;
906 Streamin_read_row2_blk1.format<ushort>()(24) = PredictorRefIdx;
907
908 write(StreamINSurface, streamin_offset(0), Streamin_read_row1_blk1);
909 write(StreamINSurface, streamin_offset(2), Streamin_read_row2_blk1);
910 }
911 else if (mb_xpos_32 < Sizein32x32RoundDown(0) && mb_ypos_32 < Sizein32x32RoundDown(1))
912 {
913 read(StreamINSurface_input, streamin_offset(0), Streamin_read_row1_blk1);
914
915 Streamin_read_row1_blk1.select<4, 1>(8) = streamIn_MV.select<1, 1, 4, 1>(0, 0);
916 Streamin_read_row1_blk1.format<uchar>()(31) = PredictorSelect;
917 Streamin_read_row1_blk1.format<ushort>()(24) = PredictorRefIdx;
918
919 write(StreamINSurface, streamin_offset(0), Streamin_read_row1_blk1);
920 }
921 }
922 }
923
924 if (!is_hevc_vp9_vdenc && vdenc_enable)
925 {
926 uint initVal, streamin_offset;
927
928 vector<uint, 16> finalOffset;
929 vector<uint, 16> finalOffset_RefID;
930 vector<uint, 16> finalOffset_ROI;
931 vector<short,16> v_MBaddressIndx(MBaddressIndx );
932 vector<short,16> v_MB_X_Indx( MBX_Indx );
933 vector<uint, 16> lastMBaddr, lastMBmask;
934
935 vector<uint, 16> offset_debug = 0xffff0000;
936 vector<uint, 256> input_streamInsurface;
937
938 uint globOffset = ((x_pos + (y_pos * ActualMBDim(0))) << 2);
939
940 streamin_offset = ActualMBDim(0) * 16;
941
942 finalOffset = (v_MBaddressIndx * streamin_offset) + (v_MB_X_Indx * 16) + 2;
943 finalOffset_RefID = finalOffset + 2;
944 finalOffset_ROI = finalOffset - 2;
945
946 lastMBaddr = ((y_pos>>2) + v_MBaddressIndx + 1) * streamin_offset;
947 lastMBmask = ((finalOffset + globOffset) > lastMBaddr);
948
949 uint block1 = globOffset * 4;
950 read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(0));
951
952 block1 = (globOffset + 32) * 4;
953 read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(32));
954
955 block1 = (globOffset + streamin_offset) * 4;
956 read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(64));
957
958 block1 = (globOffset + 32 + streamin_offset) * 4;
959 read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(96));
960
961 block1 = (globOffset + streamin_offset * 2) * 4;
962 read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(128));
963
964 block1 = (globOffset + 32 + streamin_offset * 2) * 4;
965 read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(160));
966
967 block1 = (globOffset + streamin_offset * 3) * 4;
968 read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(192));
969
970 block1 = (globOffset + 32 + streamin_offset * 3) * 4;
971 read((StreamINSurface_input), block1, input_streamInsurface.select<32, 1>(224));
972
973 input_streamInsurface.select<16, 16>(2) = best_MV.format<uint>();
974 input_streamInsurface.select<16, 16>(4) = best_refId.format<uint>();
975
976 if (ROIMapEnable == 3)
977 {
978 input_streamInsurface.select<16, 16>(0) = (input_streamInsurface.select<16, 16>(0) & 0xFFFFFF00) | (ROIMask.format<uchar>() & 0x000000FF);
979 }
980
981 uchar boundary_result = lastMBmask.any();
982 if ( boundary_result == 0)
983 {
984 block1 = globOffset * 4;
985 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(0));
986
987 block1 = (globOffset + 32) * 4;
988 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(32));
989
990 block1 = (globOffset + streamin_offset) * 4;
991 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(64));
992
993 block1 = (globOffset + 32 + streamin_offset) * 4;
994 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(96));
995
996 block1 = (globOffset + streamin_offset * 2) * 4;
997 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(128));
998
999 block1 = (globOffset + 32 + streamin_offset * 2) * 4;
1000 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(160));
1001
1002 block1 = (globOffset + streamin_offset * 3) * 4;
1003 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(192));
1004
1005 block1 = (globOffset + 32 + streamin_offset * 3) * 4;
1006 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(224));
1007 }
1008 else
1009 {
1010 block1 = globOffset * 4;
1011 if ((lastMBmask(0) == 0) && (lastMBmask(1) == 0))
1012 {
1013 write((StreamINSurface),block1, input_streamInsurface.select<32, 1>(0));
1014 }
1015 else if (lastMBmask(0) == 0)
1016 {
1017 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(0));
1018 }
1019
1020 block1 = (globOffset + 32) * 4;
1021 if ((lastMBmask(2) == 0) && (lastMBmask(3) == 0))
1022 {
1023 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(32));
1024 }
1025 else if (lastMBmask(2) == 0)
1026 {
1027 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(32));
1028 }
1029
1030 block1 = (globOffset + streamin_offset) * 4;
1031 if ((lastMBmask(4) == 0) && (lastMBmask(5) == 0))
1032 {
1033 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(64));
1034 }
1035 else if (lastMBmask(4) == 0)
1036 {
1037 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(64));
1038 }
1039
1040 block1 = (globOffset + 32 + streamin_offset) * 4;
1041 if ((lastMBmask(6) == 0) && (lastMBmask(7) == 0))
1042 {
1043 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(96));
1044 }
1045 else if (lastMBmask(6) == 0)
1046 {
1047 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(96));
1048 }
1049
1050 block1 = (globOffset + streamin_offset * 2) * 4;
1051 if ((lastMBmask(8) == 0) && (lastMBmask(9) == 0))
1052 {
1053 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(128));
1054 }
1055 else if (lastMBmask(8) == 0)
1056 {
1057 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(128));
1058 }
1059
1060 block1 = (globOffset + 32 + streamin_offset * 2) * 4;
1061 if ((lastMBmask(10) == 0) && (lastMBmask(11) == 0))
1062 {
1063 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(160));
1064 }
1065 else if (lastMBmask(10) == 0)
1066 {
1067 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(160));
1068 }
1069
1070 block1 = (globOffset + streamin_offset * 3) * 4;
1071 if ((lastMBmask(12) == 0) && (lastMBmask(13) == 0))
1072 {
1073 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(192));
1074 }
1075 else if (lastMBmask(12) == 0)
1076 {
1077 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(192));
1078 }
1079
1080 block1 = (globOffset + 32 + streamin_offset * 3) * 4;
1081 if ((lastMBmask(14) == 0 ) && (lastMBmask(15) == 0))
1082 {
1083 write((StreamINSurface), block1, input_streamInsurface.select<32, 1>(224));
1084 }
1085 else if (lastMBmask(14) == 0)
1086 {
1087 write((StreamINSurface), block1, input_streamInsurface.select<16, 1>(224));
1088 }
1089 }
1090 }
1091 }
1092
1093 extern "C" _GENX_MAIN_ void
HME_P(vector<uchar,CURBEDATA_SIZE> CURBEData,SurfaceIndex HME_MV_Data_Surface_index,SurfaceIndex HME_MV_Input_Data_Surface_index,SurfaceIndex DISTORTION_Surface,SurfaceIndex BRC_DISTORTION_Surface,SurfaceIndex Pred_Surface_L0,SurfaceIndex Pred_Surface_L1,SurfaceIndex StreamINSurface,SurfaceIndex StreamINSurface_input,SurfaceIndex SUM_Surface,SurfaceIndex TileInfo_Buffer)1094 HME_P( vector<uchar, CURBEDATA_SIZE> CURBEData,
1095 SurfaceIndex HME_MV_Data_Surface_index,
1096 SurfaceIndex HME_MV_Input_Data_Surface_index,
1097 SurfaceIndex DISTORTION_Surface,
1098 SurfaceIndex BRC_DISTORTION_Surface,
1099 SurfaceIndex Pred_Surface_L0,
1100 SurfaceIndex Pred_Surface_L1,
1101 SurfaceIndex StreamINSurface,
1102 SurfaceIndex StreamINSurface_input,
1103 SurfaceIndex SUM_Surface,
1104 SurfaceIndex TileInfo_Buffer
1105 )
1106 {
1107 HME(CURBEData,
1108 HME_MV_Data_Surface_index,
1109 HME_MV_Input_Data_Surface_index,
1110 DISTORTION_Surface,
1111 BRC_DISTORTION_Surface,
1112 Pred_Surface_L0,
1113 Pred_Surface_L1,
1114 StreamINSurface,
1115 StreamINSurface_input,
1116 SUM_Surface,
1117 TileInfo_Buffer,
1118 0,
1119 0,
1120 0);
1121 }
1122
1123 extern "C" _GENX_MAIN_ void
HME_B(vector<uchar,CURBEDATA_SIZE> CURBEData,SurfaceIndex HME_MV_Data_Surface_index,SurfaceIndex HME_MV_Input_Data_Surface_index,SurfaceIndex DISTORTION_Surface,SurfaceIndex BRC_DISTORTION_Surface,SurfaceIndex Pred_Surface_L0,SurfaceIndex Pred_Surface_L1,SurfaceIndex StreamINSurface,SurfaceIndex StreamINSurface_input,SurfaceIndex SUM_Surface,SurfaceIndex TileInfo_Buffer)1124 HME_B( vector<uchar, CURBEDATA_SIZE> CURBEData,
1125 SurfaceIndex HME_MV_Data_Surface_index,
1126 SurfaceIndex HME_MV_Input_Data_Surface_index,
1127 SurfaceIndex DISTORTION_Surface,
1128 SurfaceIndex BRC_DISTORTION_Surface,
1129 SurfaceIndex Pred_Surface_L0,
1130 SurfaceIndex Pred_Surface_L1,
1131 SurfaceIndex StreamINSurface,
1132 SurfaceIndex StreamINSurface_input,
1133 SurfaceIndex SUM_Surface,
1134 SurfaceIndex TileInfo_Buffer
1135 )
1136 {
1137 HME(CURBEData ,
1138 HME_MV_Data_Surface_index,
1139 HME_MV_Input_Data_Surface_index,
1140 DISTORTION_Surface,
1141 BRC_DISTORTION_Surface,
1142 Pred_Surface_L0,
1143 Pred_Surface_L1,
1144 StreamINSurface,
1145 StreamINSurface_input,
1146 SUM_Surface,
1147 TileInfo_Buffer,
1148 1,
1149 0,
1150 0);
1151 }
1152
1153 extern "C" _GENX_MAIN_ void
HME_VDENC_STREAMIN(vector<uchar,CURBEDATA_SIZE> CURBEData,SurfaceIndex HME_MV_Data_Surface_index,SurfaceIndex HME_MV_Input_Data_Surface_index,SurfaceIndex DISTORTION_Surface,SurfaceIndex BRC_DISTORTION_Surface,SurfaceIndex Pred_Surface_L0,SurfaceIndex Pred_Surface_L1,SurfaceIndex StreamINSurface,SurfaceIndex StreamINSurface_input,SurfaceIndex SUM_Surface,SurfaceIndex TileInfo_Buffer)1154 HME_VDENC_STREAMIN( vector<uchar, CURBEDATA_SIZE> CURBEData,
1155 SurfaceIndex HME_MV_Data_Surface_index,
1156 SurfaceIndex HME_MV_Input_Data_Surface_index,
1157 SurfaceIndex DISTORTION_Surface,
1158 SurfaceIndex BRC_DISTORTION_Surface,
1159 SurfaceIndex Pred_Surface_L0,
1160 SurfaceIndex Pred_Surface_L1,
1161 SurfaceIndex StreamINSurface,
1162 SurfaceIndex StreamINSurface_input,
1163 SurfaceIndex SUM_Surface,
1164 SurfaceIndex TileInfo_Buffer)
1165 {
1166 HME(CURBEData ,
1167 HME_MV_Data_Surface_index,
1168 HME_MV_Input_Data_Surface_index,
1169 DISTORTION_Surface,
1170 BRC_DISTORTION_Surface,
1171 Pred_Surface_L0,
1172 Pred_Surface_L1,
1173 StreamINSurface,
1174 StreamINSurface_input,
1175 SUM_Surface,
1176 TileInfo_Buffer,
1177 0,
1178 1,
1179 0);
1180 }
1181