/*
* Copyright (c) 2007-2021, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
//!
//! \file      cm_queue_rt.cpp
//! \brief     Contains CmQueueRT implementations.
//!

#include "cm_queue_rt.h"
#include "cm_event_ex.h"
#include "cm_mem.h"
#include "cm_device_rt.h"
#include "cm_event_rt.h"
#include "cm_task_rt.h"
#include "cm_task_internal.h"
#include "cm_thread_space_rt.h"
#include "cm_kernel_rt.h"
#include "cm_kernel_data.h"
#include "cm_buffer_rt.h"
#include "cm_group_space.h"
#include "cm_vebox_data.h"
#include "cm_surface_manager.h"
#include "cm_surface_2d_rt.h"
#include "cm_vebox_rt.h"
#include "cm_execution_adv.h"
#include "vp_common.h"

// Used by GPUCopy
#define BLOCK_PIXEL_WIDTH            (32)
#define BLOCK_HEIGHT                 (8)
#define BLOCK_HEIGHT_NV12            (4)
#define SUB_BLOCK_PIXEL_WIDTH        (8)
#define SUB_BLOCK_HEIGHT             (8)
#define SUB_BLOCK_HEIGHT_NV12        (4)
#define INNER_LOOP                   (4)
#define BYTE_COPY_ONE_THREAD         (1024*INNER_LOOP)    // 4KB for each thread
#define THREAD_SPACE_WIDTH_INCREMENT (8)
// Used by unaligned copy
#define BLOCK_WIDTH                  (64)
#define PAGE_ALIGNED                 (0x1000)
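// Mark a cached GPU-copy kernel as in-use/free so the same kernel instance
// is not handed out to a second copy request while one is still using it.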
#define GPUCOPY_KERNEL_LOCK(a) ((a)->locked = true)
#define GPUCOPY_KERNEL_UNLOCK(a) ((a)->locked = false)
using namespace CMRT_UMD;

namespace CMRT_UMD
{
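// Parameter block handed to a GPU-copy worker thread: it bundles the queue,
// the source/destination memory, the copy direction and the CM objects
// (thread space, task, events) used by the copy.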
typedef struct _tdata
{
    void* pCmQueueRT;
    void* buffer;
    size_t offset;
    unsigned char* sysMem;
    uint64_t sysMemSize;
    int dir;
    void* threadSpace;
    void* task;
    void* wait_event;
    void* event;
    unsigned option;
    uint64_t cpuFrrequency;
}CopyThreadData;

//*-----------------------------------------------------------------------------
//| Purpose:    Create Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Create(CmDeviceRT *device,
                          CmQueueRT* &queue,
                          CM_QUEUE_CREATE_OPTION queueCreateOption)
{
    int32_t result = CM_SUCCESS;
    queue = new (std::nothrow) CmQueueRT(device, queueCreateOption);
    if( queue )
    {
        result = queue->Initialize( );
        if( result != CM_SUCCESS )
        {
            CmQueueRT::Destroy( queue );
        }
    }
    else
    {
        CM_ASSERTMESSAGE("Error: Failed to create CmQueue due to out of system memory.");
        result = CM_OUT_OF_HOST_MEMORY;
    }
    return result;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Destroy Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Destroy(CmQueueRT* &queue )
{
    if( queue == nullptr )
    {
        return CM_FAILURE;
    }

    uint32_t result = queue->CleanQueue();

    queue->DestroyComputeGpuContext();

    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)queue->m_device->GetAccelData())->cmHalState;
    CM_CHK_NULL_RETURN_CMERROR(cmHalState);
    if (cmHalState->pfnUnRegisterStream != nullptr && queue->m_streamIndex != cmHalState->osInterface->streamIndex)
    {
        cmHalState->pfnUnRegisterStream(queue->m_streamIndex, cmHalState);
    }

    CmSafeDelete( queue );

    return result;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Constructor of Cm Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
CmQueueRT::CmQueueRT(CmDeviceRT *device,
                     CM_QUEUE_CREATE_OPTION queueCreateOption):
    m_device(device),
    m_eventArray(CM_INIT_EVENT_COUNT),
    m_eventCount(0),
    m_copyKernelParamArray(CM_INIT_GPUCOPY_KERNL_COUNT),
    m_copyKernelParamArrayCount(0),
    m_halMaxValues(nullptr),
    m_queueOption(queueCreateOption),
    m_usingVirtualEngine(false),
    m_osSyncEvent(nullptr),
    m_trackerIndex(0),
    m_fastTrackerIndex(0),
    m_streamIndex(0),
    m_gpuContextHandle(MOS_GPU_CONTEXT_INVALID_HANDLE),
    m_syncBufferHandle(INVALID_SYNC_BUFFER_HANDLE)
{
    MOS_ZeroMemory(&m_mosVeHintParams, sizeof(m_mosVeHintParams));
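    // Cache the CPU timer frequency so that performance-counter ticks taken
    // at enqueue time can later be converted to wall-clock time.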
    MosUtilities::MosQueryPerformanceFrequency(&m_CPUperformanceFrequency);
}

//*-----------------------------------------------------------------------------
//| Purpose:    Destructor of Cm Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
CmQueueRT::~CmQueueRT()
{
    m_osSyncEvent = nullptr;
    uint32_t eventArrayUsedSize = m_eventArray.GetMaxSize();
    for( uint32_t i = 0; i < eventArrayUsedSize; i ++ )
    {
        CmEventRT* event = (CmEventRT*)m_eventArray.GetElement( i );
        uint32_t eventReleaseTimes = 0;
        while( event )
        {   // destroy the event regardless of whether the user has released it
            if(eventReleaseTimes > 2)
            {
                // The maximum reference count of an event is 2;
                // if the event is still alive after two destroys, something is wrong.
                CM_ASSERTMESSAGE("Error: The maximum reference count of an event is 2.");
                break;
            }
            CmEventRT::Destroy( event );
            eventReleaseTimes ++;
        }
    }
    m_eventArray.Delete();

    // Do not destroy the kernels in m_copyKernelParamArray.
    // They have already been destroyed in ~CmDevice() before the queue is destroyed.
    for( uint32_t i = 0; i < m_copyKernelParamArrayCount; i ++ )
    {
        CM_GPUCOPY_KERNEL *gpuCopyParam = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement( i );
        CmSafeDelete(gpuCopyParam);
    }

    m_copyKernelParamArray.Delete();

    CM_HAL_STATE *hal_state = static_cast<CM_CONTEXT_DATA*>(m_device->GetAccelData())->cmHalState;
    ReleaseSyncBuffer(hal_state);
    return;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Initialize Cm Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Initialize()
{
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    CM_HAL_MAX_VALUES_EX* halMaxValuesEx = nullptr;
    CM_RETURN_CODE hr = CM_SUCCESS;
    m_device->GetHalMaxValues(m_halMaxValues, halMaxValuesEx);

    // Assign a new tracker and record the tracker index
    int ret = cmHalState->renderHal->trackerProducer.AssignNewTracker();
    CM_CHK_COND_RETURN((ret < 0), CM_FAILURE, "Error: failed to assign a new tracker");
    m_trackerIndex = ret;
    if (cmHalState->advExecutor)
    {
        ret = cmHalState->advExecutor->AssignNewTracker();
        CM_CHK_COND_RETURN((ret < 0), CM_FAILURE, "Error: failed to assign a new tracker");
        m_fastTrackerIndex = ret;
    }

    // Create or get the GPU context for this queue
    if (m_queueOption.UserGPUContext == true)
    {
        // Check the user-provided GPU context; if it is valid, the queue is created on the existing context
        if (cmHalState->osInterface->pfnIsGpuContextValid(cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext) != MOS_STATUS_SUCCESS)
        {
            // Return failure
            CM_ASSERTMESSAGE("Error: The user passed in a GPU context which is not valid");
            return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
        }
    }
    else
    {
        MOS_GPUCTX_CREATOPTIONS ctxCreateOption;
        ctxCreateOption.CmdBufferNumScale
                = HalCm_GetNumCmdBuffers(cmHalState->osInterface, cmHalState->cmDeviceParam.maxTasks);

        // Create the MDF preset GPU context and update GPUContext in m_queueOption
        if (m_queueOption.QueueType == CM_QUEUE_TYPE_RENDER)
        {
            MOS_GPU_CONTEXT tmpGpuCtx = cmHalState->requestCustomGpuContext? MOS_GPU_CONTEXT_RENDER4: MOS_GPU_CONTEXT_RENDER3;

            // check if a context handle was specified by the user
            if (m_queueOption.GPUContext != 0)
            {
                tmpGpuCtx = (MOS_GPU_CONTEXT)m_queueOption.GPUContext;
            }

            // sanity check of the context handle for CM
            if (HalCm_IsValidGpuContext(tmpGpuCtx) == false)
            {
                return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
            }

            // SSEU overriding
            if (cmHalState->cmHalInterface->IsOverridePowerOptionPerGpuContext())
            {
                // check whether sub-slices need to be shut down for VME usage
                if (m_queueOption.SseuUsageHint == CM_QUEUE_SSEU_USAGE_HINT_VME
                 && cmHalState->cmHalInterface->IsRequestShutdownSubslicesForVmeUsage())
                {
                    MEDIA_SYSTEM_INFO *gtSystemInfo = cmHalState->osInterface->pfnGetGtSystemInfo(cmHalState->osInterface);
                    ctxCreateOption.packed.SliceCount            = (uint8_t)gtSystemInfo->SliceCount;
                    ctxCreateOption.packed.SubSliceCount         = (gtSystemInfo->SubSliceCount / gtSystemInfo->SliceCount) >> 1; // set to half
                    ctxCreateOption.packed.MaxEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
                    ctxCreateOption.packed.MinEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
                }

#if (_DEBUG || _RELEASE_INTERNAL)
                {
                    MediaUserSettingSharedPtr userSettingPtr = cmHalState->osInterface->pfnGetUserSettingInstance(cmHalState->osInterface);
                    uint32_t value = 0;
                    ReadUserSettingForDebug(
                        userSettingPtr,
                        value,
                        __MEDIA_USER_FEATURE_VALUE_SSEU_SETTING_OVERRIDE,
                        MediaUserSetting::Group::Device);

                    // +---------------+----------------+----------------+----------------+
                    // |    EUCountMax |     EUCountMin |        SSCount |     SliceCount |
                    // +-------------24+--------------16+---------------8+---------------0+
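                    // 0xDEADC0DE is treated as "no override present"; any other
                    // value is unpacked into the SSEU fields below.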
                    if (value != 0xDEADC0DE)
                    {
                        ctxCreateOption.packed.SliceCount            = value & 0xFF;          // Bits 0-7
                        ctxCreateOption.packed.SubSliceCount         = (value >> 8) & 0xFF;   // Bits 8-15
                        ctxCreateOption.packed.MaxEUcountPerSubSlice = (value >> 16) & 0xFF;  // Bits 16-23
                        ctxCreateOption.packed.MinEUcountPerSubSlice = (value >> 24) & 0xFF;  // Bits 24-31
                    }
                }
#endif
            }

            ctxCreateOption.RAMode = m_queueOption.RAMode;
            ctxCreateOption.isRealTimePriority = m_queueOption.IsRealTimePrioriy;

            // Create render GPU context.
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                CreateGpuContext(cmHalState, tmpGpuCtx, MOS_GPU_NODE_3D,
                                 &ctxCreateOption));

#if (_RELEASE_INTERNAL || _DEBUG)
#if defined(CM_DIRECT_GUC_SUPPORT)
            // init GuC
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->osInterface->pfnInitGuC(cmHalState->osInterface, MOS_GPU_NODE_3D));
#endif
#endif
            m_queueOption.GPUContext = tmpGpuCtx;
        }
        else if (m_queueOption.QueueType == CM_QUEUE_TYPE_COMPUTE)
        {
            ctxCreateOption.RAMode = m_queueOption.RAMode;

            bool bVeUsedInCm = false; // to be changed to true once the feature is complete
#if (_DEBUG || _RELEASE_INTERNAL)
            MOS_USER_FEATURE_VALUE_DATA UserFeatureData = {0};
            MOS_UserFeature_ReadValue_ID(
                nullptr, __MEDIA_USER_FEATURE_VALUE_MDF_CCS_USE_VE_INTERFACE,
                &UserFeatureData, cmHalState->osInterface->pOsContext);
            bVeUsedInCm = (UserFeatureData.u32Data == 0x1)? true: false;
#endif
            Mos_SetVirtualEngineSupported(cmHalState->osInterface, bVeUsedInCm);

            if (cmHalState->osInterface->veDefaultEnable && cmHalState->osInterface->bSupportVirtualEngine) // check if VE is enabled on the OS
            {
                // prepare the virtual engine hint param on this cm queue
                CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                    HalCm_PrepareVEHintParam(cmHalState, false, &m_mosVeHintParams));

                m_usingVirtualEngine = true;
            }

            ctxCreateOption.isRealTimePriority = m_queueOption.IsRealTimePrioriy;

            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                CreateGpuContext(cmHalState, MOS_GPU_CONTEXT_CM_COMPUTE,
                                 MOS_GPU_NODE_COMPUTE, &ctxCreateOption));
            m_queueOption.GPUContext = MOS_GPU_CONTEXT_CM_COMPUTE;
        }
        else
        {
            // Return failure
            CM_ASSERTMESSAGE("Error: The QueueType is not supported by MDF.");
            return CM_NOT_IMPLEMENTED;
        }
    }

finish:
    return hr;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Checks whether any kernel in the task has a thread argument
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::GetTaskHasThreadArg(CmKernelRT* kernelArray[], uint32_t numKernels, bool& threadArgExists)
{
    threadArgExists = false;

    for(uint32_t krn = 0; krn < numKernels; krn++)
    {
        if( !kernelArray[krn] )
        {
            CM_ASSERTMESSAGE("Error: Invalid kernel pointer in the task.");
            return CM_FAILURE;
        }

        if( kernelArray[krn]->IsThreadArgExisted( ) )
        {
            threadArgExists = true;
            break;
        }
    }

    return CM_SUCCESS;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Enqueue Task
//| Arguments :
//|               kernelArray      [in]    Pointer to kernel array
//|               event            [in]    Reference to the pointer to Event
//|               threadSpace      [in]    Pointer to thread space
//|
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
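//  Usage sketch (illustrative only; assumes a CmDevice, a CmTask and a
//  CmThreadSpace already created through the standard CMRT_UMD interfaces):
//
//      CmQueue *queue = nullptr;
//      cmDevice->CreateQueue(queue);                 // returns a CmQueueRT internally
//      CmEvent *cmEvent = nullptr;
//      queue->Enqueue(task, cmEvent, threadSpace);   // non-blocking submit
//      cmEvent->WaitForTaskFinished();               // block until the GPU finishes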
CM_RT_API int32_t CmQueueRT::Enqueue(
    CmTask* kernelArray,
    CmEvent* & event,
    const CmThreadSpace* threadSpace)
{
    INSERT_API_CALL_LOG(GetHalState());

    if (kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is null.");
        return CM_INVALID_ARG_VALUE;
    }

    CmTaskRT *kernelArrayRT = static_cast<CmTaskRT *>(kernelArray);
    uint32_t kernelCount = 0;
    kernelCount = kernelArrayRT->GetKernelCount();
    if (kernelCount == 0)
    {
        CM_ASSERTMESSAGE("Error: Invalid kernel count.");
        return CM_FAILURE;
    }

    if (kernelCount > m_halMaxValues->maxKernelsPerTask)
    {
        CM_ASSERTMESSAGE("Error: Kernel count exceeds max kernel per enqueue.");
        return CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
    }

    int32_t result;
    const CmThreadSpaceRT *threadSpaceRTConst = static_cast<const CmThreadSpaceRT *>(threadSpace);
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    CM_CHK_NULL_RETURN_CMERROR(cmHalState);
    if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
    {
        if (threadSpaceRTConst != nullptr)
        {
            result = EnqueueWithGroup(kernelArray, event, threadSpaceRTConst->GetThreadGroupSpace());
        }
        else
        {
            // If there is no shared thread space or associated thread space,
            // create a temporary (maxThreadCount x 1) thread group space whose
            // size equals the maximum thread count over the kernels that have
            // no thread group space associated.
            uint32_t maxThreadCount = 1;
            bool usedCommonTGS = false;
            for (uint32_t i = 0; i < kernelCount; i++)
            {
                CmKernelRT *tmpKernel = kernelArrayRT->GetKernelPointer(i);
                CmThreadGroupSpace *tmpTGS = nullptr;
                tmpKernel->GetThreadGroupSpace(tmpTGS);

                if (tmpTGS == nullptr)
                {
                    usedCommonTGS = true;
                    uint32_t singleThreadCount = 0;
                    tmpKernel->GetThreadCount(singleThreadCount);
                    if (maxThreadCount < singleThreadCount)
                    {
                        maxThreadCount = singleThreadCount;
                    }
                }
            }

            CmThreadGroupSpace *threadGroupSpaceTemp = nullptr;
            if (usedCommonTGS == true)
            {
                result = m_device->CreateThreadGroupSpace(1, 1, maxThreadCount, 1, threadGroupSpaceTemp);
                if (result != CM_SUCCESS)
                {
                    CM_ASSERTMESSAGE("Error: Creating temporary thread group space failure.");
                    return result;
                }
            }

            result = EnqueueWithGroup(kernelArray, event, threadGroupSpaceTemp);

            if (threadGroupSpaceTemp != nullptr)
            {
                m_device->DestroyThreadGroupSpace(threadGroupSpaceTemp);
            }
        }
        return result;
    }

    // Check whether the task meets the requirements of the fast path:
    // if yes, switch to the fast path;
    // otherwise, continue on the legacy path.
    if (cmHalState && cmHalState->advExecutor && cmHalState->cmHalInterface &&
        cmHalState->advExecutor->SwitchToFastPath(kernelArray) &&
        cmHalState->cmHalInterface->IsFastPathByDefault())
    {
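        // Bind this queue's GPU context and stream to the HAL for the
        // duration of the submission, then restore the previous stream index.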
        auto gpu_context_name
                = static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext);
        uint32_t old_stream_idx = cmHalState->pfnSetGpuContext(cmHalState,
                                                               gpu_context_name,
                                                               m_streamIndex,
                                                               m_gpuContextHandle);
        result = cmHalState->advExecutor->SubmitTask(this, kernelArray, event,
                                                     threadSpace, gpu_context_name);
        cmHalState->osInterface->streamIndex = old_stream_idx;
        return result;
    }

    if (threadSpaceRTConst && threadSpaceRTConst->IsThreadAssociated())
    {
        if (threadSpaceRTConst->GetNeedSetKernelPointer() && threadSpaceRTConst->KernelPointerIsNULL())
        {
            CmKernelRT* tmp = nullptr;
            tmp = kernelArrayRT->GetKernelPointer(0);
            threadSpaceRTConst->SetKernelPointer(tmp);
        }
    }

#if _DEBUG
    if (threadSpaceRTConst)
    {
        CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
        if (!threadSpaceRT->IntegrityCheck(kernelArrayRT))
        {
            CM_ASSERTMESSAGE("Error: Invalid thread space.");
            return CM_INVALID_THREAD_SPACE;
        }
    }
#endif

    if(m_device->IsPrintEnable())
    {
        m_device->CreatePrintBuffer();
    }

    typedef CmKernelRT* pCmKernel;
    CmKernelRT** tmp = MOS_NewArray(pCmKernel, (kernelCount + 1));
    if(tmp == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }

    uint32_t totalThreadNumber = 0;
    for(uint32_t i = 0; i < kernelCount; i++)
    {
        tmp[ i ] = kernelArrayRT->GetKernelPointer(i);

        uint32_t singleThreadNumber = 0;
        tmp[i]->GetThreadCount(singleThreadNumber);
        if (singleThreadNumber == 0)
        {
            CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
            if (threadSpaceRT)
            {
                uint32_t width, height;
                threadSpaceRT->GetThreadSpaceSize(width, height);
                singleThreadNumber = width*height;
            }
        }
        totalThreadNumber += singleThreadNumber;
    }
    tmp[kernelCount] = nullptr;

    CmEventRT *eventRT = static_cast<CmEventRT *>(event);
    CM_TASK_CONFIG taskConfig;
    kernelArrayRT->GetProperty(taskConfig);
    result = Enqueue_RT(tmp, kernelCount, totalThreadNumber, eventRT, threadSpaceRTConst, kernelArrayRT->GetSyncBitmap(), kernelArrayRT->GetPowerOption(),
                        kernelArrayRT->GetConditionalEndBitmap(), kernelArrayRT->GetConditionalEndInfo(), &taskConfig);

    if (eventRT)
    {
        eventRT->SetKernelNames(kernelArrayRT, const_cast<CmThreadSpaceRT*>(threadSpaceRTConst), nullptr);
    }

    event = eventRT;
    MosSafeDeleteArray( tmp );

    return result;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Enqueue Task
//| Arguments :
//|               kernelArray      [in]    Pointer to kernel array
//|               event            [in]    Reference to the pointer to Event
//|               threadSpace      [in]    Pointer to thread space
//|
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Enqueue_RT(
    CmKernelRT* kernelArray[],
    const uint32_t kernelCount,
    const uint32_t totalThreadCount,
    CmEventRT* & event,
    const CmThreadSpaceRT* threadSpace,
    uint64_t syncBitmap,
    PCM_POWER_OPTION powerOption,
    uint64_t conditionalEndBitmap,
    CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
    PCM_TASK_CONFIG taskConfig)
{
    CM_NORMALMESSAGE("================ in origin path, media walker ===================");
    if(kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    if( kernelCount == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_INVALID_ARG_VALUE;
    }

    bool isEventVisible = (event == CM_NO_EVENT)? false: true;

    CLock Locker(m_criticalSectionTaskInternal);

    // set the current tracker index in renderhal
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_RETURN_CMERROR(cmData);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
    cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;

    CmTaskInternal* task = nullptr;
    int32_t result = CmTaskInternal::Create(kernelCount, totalThreadCount, kernelArray, threadSpace, m_device, syncBitmap, task, conditionalEndBitmap, conditionalEndInfo);
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
        return result;
    }

    LARGE_INTEGER nEnqueueTime;
    if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        CmTaskInternal::Destroy(task);
        return CM_FAILURE;
    }

    int32_t taskDriverId = -1;

    result = CreateEvent(task, isEventVisible, taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    task->SetPowerOption( powerOption );

    task->SetProperty(taskConfig);

    if( !m_enqueuedTasks.Push( task ) )
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
        return CM_FAILURE;
    }

    result = FlushTaskWithoutSync();

    return result;
}

int32_t CmQueueRT::Enqueue_RT(CmKernelRT* kernelArray[],
                              const uint32_t kernelCount,
                              const uint32_t totalThreadCount,
                              CmEventRT* & event,
                              const CmThreadGroupSpace* threadGroupSpace,
                              uint64_t syncBitmap,
                              PCM_POWER_OPTION powerOption,
                              uint64_t conditionalEndBitmap,
                              CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
                              PCM_TASK_CONFIG taskConfig,
                              const CM_EXECUTION_CONFIG* krnExecCfg)
{
    CM_NORMALMESSAGE("================ in origin path, gpgpu walker ===================");
    if(kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    if( kernelCount == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_INVALID_ARG_VALUE;
    }

    CLock Locker(m_criticalSectionTaskInternal);

    // set the current tracker index in renderhal
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_RETURN_CMERROR(cmData);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
    cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;

    CmTaskInternal* task = nullptr;
    int32_t result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray,
                                             threadGroupSpace, m_device, syncBitmap, task,
                                             conditionalEndBitmap, conditionalEndInfo, krnExecCfg);
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CmTaskInternal failure.");
        return result;
    }

    LARGE_INTEGER nEnqueueTime;
    if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        CmTaskInternal::Destroy(task);
        return CM_FAILURE;
    }

    int32_t taskDriverId = -1;

    result = CreateEvent(task, !(event == CM_NO_EVENT), taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    task->SetPowerOption( powerOption );

    task->SetProperty(taskConfig);

    if( !m_enqueuedTasks.Push( task ) )
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
        return CM_FAILURE;
    }

    result = FlushTaskWithoutSync();

    return result;
}

int32_t CmQueueRT::Enqueue_RT( CmKernelRT* kernelArray[],
                               CmEventRT* & event,
                               uint32_t numTasksGenerated,
                               bool isLastTask,
                               uint32_t hints,
                               PCM_POWER_OPTION powerOption)
{
    int32_t result = CM_FAILURE;
    uint32_t kernelCount = 0;
    CmTaskInternal* task = nullptr;
    int32_t taskDriverId = -1;
    bool isEventVisible = (event == CM_NO_EVENT) ? false: true;
    bool threadArgExists = false;

    if( kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }
    while( kernelArray[ kernelCount ] )
    {
        kernelCount++;
    }

    if( kernelCount < CM_MINIMUM_NUM_KERNELS_ENQWHINTS )
    {
        CM_ASSERTMESSAGE("Error: EnqueueWithHints requires at least 2 kernels.");
        return CM_FAILURE;
    }

    uint32_t totalThreadCount = 0;
    for( uint32_t i = 0; i < kernelCount; i ++ )
    {
        uint32_t threadCount = 0;
        kernelArray[i]->GetThreadCount( threadCount );
        totalThreadCount += threadCount;
    }

    if( GetTaskHasThreadArg(kernelArray, kernelCount, threadArgExists) != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Thread argument checking fails.");
        return CM_FAILURE;
    }

    if( !threadArgExists )
    {
        if (totalThreadCount > m_halMaxValues->maxUserThreadsPerTaskNoThreadArg )
        {
            CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
            return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
        }
    }
    else
    {
        if( totalThreadCount > m_halMaxValues->maxUserThreadsPerTask )
        {
            CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
            return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
        }
    }

    CLock Locker(m_criticalSectionTaskInternal);

    // set the current tracker index in renderhal
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_RETURN_CMERROR(cmData);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
    cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;

    result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray, task, numTasksGenerated, isLastTask, hints, m_device );

    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
        return result;
    }

    LARGE_INTEGER nEnqueueTime;
    if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) )
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        CmTaskInternal::Destroy(task);
        return CM_FAILURE;
    }

    result = CreateEvent(task, isEventVisible, taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    for( uint32_t i = 0; i < kernelCount; ++i )
    {
        CmKernelRT* kernel = nullptr;
        task->GetKernel(i, kernel);
        if( kernel != nullptr )
        {
            kernel->SetAdjustedYCoord(0);
        }
    }

    task->SetPowerOption( powerOption );

    if (!m_enqueuedTasks.Push(task))
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
        return CM_FAILURE;
    }

    result = FlushTaskWithoutSync();

    return result;
}

//*-----------------------------------------------------------------------------
//! Function to enqueue a task with a thread group space pointer
//! Arguments:
//!     1. Pointer to CmTask, which can only contain one kernel.
//!     2. Reference to the pointer to CmEvent that is to be returned
//!     3. Pointer to a CmThreadGroupSpace.
//! Return Value:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated
//!     CM_OUT_OF_HOST_MEMORY if out of host memory
//!     CM_FAILURE otherwise
//! Notes:
//!     If the kernel has per-thread args, the GPGPU object is used.
//!     If the kernel has no per-thread args, the GPGPU walker is used.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueWithGroup( CmTask* task, CmEvent* & event, const CmThreadGroupSpace* threadGroupSpace)
{
    INSERT_API_CALL_LOG(GetHalState());

    int32_t result;

    if(task == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    // Check whether the task meets the requirements of the fast path:
    // if yes, switch to the fast path;
    // otherwise, continue on the legacy path.
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    if (cmHalState && cmHalState->advExecutor && cmHalState->cmHalInterface &&
        cmHalState->advExecutor->SwitchToFastPath(task) &&
        cmHalState->cmHalInterface->IsFastPathByDefault())
    {
        auto gpu_context_name
                = static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext);
        uint32_t old_stream_idx = cmHalState->pfnSetGpuContext(cmHalState,
                                                               gpu_context_name,
                                                               m_streamIndex,
                                                               m_gpuContextHandle);
        if (cmHalState->cmHalInterface->CheckMediaModeAvailability())
        {
            result = cmHalState->advExecutor->SubmitGpgpuTask(this, task, event,
                                                              threadGroupSpace,
                                                              gpu_context_name);
        }
        else
        {
            SelectSyncBuffer(cmHalState);
            result = cmHalState->advExecutor->SubmitComputeTask(this, task, event,
                                                                threadGroupSpace,
                                                                gpu_context_name);
        }
        cmHalState->osInterface->streamIndex = old_stream_idx;
        return result;
    }

    CmTaskRT *taskRT = static_cast<CmTaskRT *>(task);
    uint32_t count = 0;
    count = taskRT->GetKernelCount();

    if( count == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_FAILURE;
    }

    if(m_device->IsPrintEnable())
    {
        m_device->CreatePrintBuffer();
    }

    typedef CmKernelRT* pCmKernel;
    CmKernelRT** tmp = MOS_NewArray(pCmKernel, (count+1));
    if(tmp == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }

    uint32_t totalThreadNumber = 0;
    for(uint32_t i = 0; i < count; i++)
    {
        uint32_t singleThreadNumber = 0;
        tmp[ i ] = taskRT->GetKernelPointer(i);

        // Thread arguments are not allowed in the GPGPU_WALKER path
        if(tmp[i]->IsThreadArgExisted())
        {
            CM_ASSERTMESSAGE("Error: No thread args allowed when using group space");
            MosSafeDeleteArray(tmp);
            return CM_THREAD_ARG_NOT_ALLOWED;
        }

        tmp[i]->GetThreadCount(singleThreadNumber);
        totalThreadNumber += singleThreadNumber;
    }
    tmp[count] = nullptr;

    CmEventRT *eventRT = static_cast<CmEventRT *>(event);
    CM_TASK_CONFIG taskConfig;
    taskRT->GetProperty(taskConfig);
    result = Enqueue_RT( tmp, count, totalThreadNumber, eventRT,
                         threadGroupSpace, taskRT->GetSyncBitmap(),
                         taskRT->GetPowerOption(),
                         taskRT->GetConditionalEndBitmap(), taskRT->GetConditionalEndInfo(),
                         &taskConfig, taskRT->GetKernelExecuteConfig());

    if (eventRT)
    {
        eventRT->SetKernelNames(taskRT, nullptr, const_cast<CmThreadGroupSpace*>(threadGroupSpace));
    }

    event = eventRT;
    MosSafeDeleteArray( tmp );

    return result;
}

CM_RT_API int32_t CmQueueRT::EnqueueWithHints(
    CmTask* kernelArray,
    CmEvent* & event,
    uint32_t hints)
{
    INSERT_API_CALL_LOG(GetHalState());

    int32_t hr = CM_FAILURE;
    uint32_t count = 0;
    uint32_t index = 0;
    CmKernelRT** kernels = nullptr;
    uint32_t numTasks = 0;
    bool splitTask = false;
    bool lastTask = false;
    uint32_t numTasksGenerated = 0;
    CmEventRT *eventRT = static_cast<CmEventRT *>(event);

    if (kernelArray == nullptr)
    {
        return CM_INVALID_ARG_VALUE;
    }
    CmTaskRT *kernelArrayRT = static_cast<CmTaskRT *>(kernelArray);
    count = kernelArrayRT->GetKernelCount();
    if( count == 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid kernel count.");
        hr = CM_FAILURE;
        goto finish;
    }

    if( count > m_halMaxValues->maxKernelsPerTask )
    {
        CM_ASSERTMESSAGE("Error: Kernel count exceeds maximum kernel per enqueue.");
        hr = CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
        goto finish;
    }

    for (uint32_t i = 0; i < count; ++i)
    {
        CmKernelRT* kernelTmp = nullptr;
        CmThreadSpaceRT* threadSpaceTmp = nullptr;
        kernelTmp = kernelArrayRT->GetKernelPointer(i);
        CM_CHK_NULL_GOTOFINISH_CMERROR(kernelTmp);
        kernelTmp->GetThreadSpace(threadSpaceTmp);
        CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpaceTmp);
        if (threadSpaceTmp->GetNeedSetKernelPointer() && threadSpaceTmp->KernelPointerIsNULL())
        {
            threadSpaceTmp->SetKernelPointer(kernelTmp);
        }
    }

#if _DEBUG
    if( !kernelArrayRT->IntegrityCheckKernelThreadspace() )
    {
        CM_ASSERTMESSAGE("Error: Integrity check for kernel thread space failed.");
        hr = CM_KERNEL_THREADSPACE_INTEGRITY_FAILED;
        goto finish;
    }
#endif

    numTasks = ( hints & CM_HINTS_MASK_NUM_TASKS ) >> CM_HINTS_NUM_BITS_TASK_POS;
    if( numTasks > 1 )
    {
        splitTask = true;
    }

    if( m_device->IsPrintEnable() )
    {
        m_device->CreatePrintBuffer();
    }

    kernels = MOS_NewArray(CmKernelRT*, (count + 1));
    CM_CHK_NULL_GOTOFINISH_CMERROR(kernels);

    do
    {
        for (index = 0; index < count; ++index)
        {
            kernels[ index ] = kernelArrayRT->GetKernelPointer( index );
        }

        kernels[ count ] = nullptr;

        if(splitTask)
        {
            if( numTasksGenerated == (numTasks - 1 ) )
            {
                lastTask = true;
            }
        }
        else
        {
            lastTask = true;
        }

        CM_CHK_CMSTATUS_GOTOFINISH(Enqueue_RT( kernels, eventRT, numTasksGenerated, lastTask, hints, kernelArrayRT->GetPowerOption() ));
        event = eventRT;
        numTasksGenerated++;

    } while(numTasksGenerated < numTasks);

finish:
    MosSafeDeleteArray( kernels );

    return hr;
}

//*-----------------------------------------------------------------------------
//! Enqueue a task, which contains one pre-defined kernel to
//! copy from host memory to surface
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! the GPU to finish executing the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check whether the task has finished.
//! INPUT:
//!     1) Pointer to the CmSurface2D_RT as copy destination
//!     2) Pointer to the host memory as copy source
//!     3) Reference to the pointer to CMEvent
//!     4) A boolean value to indicate whether or not to flush the queue after enqueuing the task;
//!        by default the boolean value is TRUE.
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memory;
//!     CM_FAILURE otherwise.
//!     More error codes are coming.
//*-----------------------------------------------------------------------------
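//  Usage sketch (illustrative only; "surface" is a CmSurface2D created on the
//  same device and "hostBuf" a 16-byte-aligned system-memory buffer of
//  matching size):
//
//      CmEvent *copyEvent = nullptr;
//      queue->EnqueueCopyCPUToGPU(surface, hostBuf, copyEvent);  // non-blocking
//      copyEvent->WaitForTaskFinished();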
CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPU( CmSurface2D* surface, const unsigned char* sysMem, CmEvent* & event )
{
    INSERT_API_CALL_LOG(GetHalState());

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
    return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, 0, 0, CM_FASTCOPY_CPU2GPU, CM_FASTCOPY_OPTION_NONBLOCKING, event);
}

//*-----------------------------------------------------------------------------
//! Enqueue a task, which contains one pre-defined kernel to
//! copy from surface to host memory
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! the GPU to finish executing the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check whether the task has finished.
//! INPUT:
//!     1) Pointer to the CmSurface2D_RT as copy source
//!     2) Pointer to the host memory as copy destination
//!     3) Reference to the pointer to CMEvent
//!     4) A boolean value to indicate whether or not to flush the queue after enqueuing the task;
//!        by default the boolean value is TRUE.
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memory;
//!     CM_FAILURE otherwise.
//!     More error codes are coming.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPU( CmSurface2D* surface, unsigned char* sysMem, CmEvent* & event )
{
    INSERT_API_CALL_LOG(GetHalState());

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
    return EnqueueCopyInternal(surfaceRT, sysMem, 0, 0, CM_FASTCOPY_GPU2CPU, CM_FASTCOPY_OPTION_NONBLOCKING, event);
}

int32_t CmQueueRT::EnqueueUnalignedCopyInternal( CmSurface2DRT* surface, unsigned char* sysMem, const uint32_t widthStride, const uint32_t heightStride, CM_GPUCOPY_DIRECTION direction)
{
    int32_t hr = CM_SUCCESS;
    uint32_t bufferupSize = 0;
    uint32_t dstAddShiftOffset = 0;
    uint32_t threadWidth = 0;
    uint32_t threadHeight = 0;
    uint32_t threadNum = 0;
    uint32_t auxiliaryBufferupSize = 0;
    uint32_t width = 0;
    uint32_t height = 0;
    uint32_t sizePerPixel = 0;
    uint32_t widthByte = 0;
    uint32_t copyWidthByte = 0;
    uint32_t copyHeightRow = 0;
    uint32_t strideInBytes = widthStride;
    uint32_t heightStrideInRows = heightStride;
    size_t linearAddress = (size_t)sysMem;
    size_t linearAddressAligned = 0;
    unsigned char* hybridCopyAuxSysMem = nullptr;

    CmBufferUP *bufferUP = nullptr;
    CmKernel *kernel = nullptr;
    CmBufferUP *hybridCopyAuxBufferUP = nullptr;
    SurfaceIndex *bufferIndexCM = nullptr;
    SurfaceIndex *hybridCopyAuxIndexCM = nullptr;
    SurfaceIndex *surf2DIndexCM = nullptr;
    CmThreadSpace *threadSpace = nullptr;
    CmTask *gpuCopyTask = nullptr;
    CmProgram *gpuCopyProgram = nullptr;
    CmEvent *event = nullptr;
    CM_STATUS status;
    CM_SURFACE_FORMAT format;

    if ( surface )
    {
        CM_CHK_CMSTATUS_GOTOFINISH( surface->GetSurfaceDesc(width, height, format, sizePerPixel));
    }
    else
    {
        return CM_FAILURE;
    }

    widthByte = width * sizePerPixel;
    // the actual copy region
    copyWidthByte = MOS_MIN(strideInBytes, widthByte);
    copyHeightRow = MOS_MIN(heightStrideInRows, height);

    if(linearAddress == 0)
    {
        CM_ASSERTMESSAGE("Error: Pointer to system memory is null.");
        return CM_INVALID_ARG_VALUE;
    }
    if( (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_WIDTH ) || ( copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT) )
    {   // each thread handles a 64x8 block of data; this API fails if the copy exceeds the maximum thread space size
        CM_ASSERTMESSAGE("Error: Invalid copy size.");
        return CM_INVALID_ARG_SIZE;
    }

    if (sizeof (void *) == 8 ) //64-bit
    {
        linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
    }
    else //32-bit
    {
        linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
    }
    // Calculate the left shift offset
    dstAddShiftOffset = (uint32_t)(linearAddress - linearAddressAligned);

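    // NV12/P010/P016 carry a half-height interleaved chroma plane after the
    // luma plane, so the BufferUP must cover 1.5x the copied luma rows.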
    if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
    {
        bufferupSize = MOS_ALIGN_CEIL(strideInBytes * (heightStrideInRows + copyHeightRow * 1/2) + (uint32_t)dstAddShiftOffset, 64);
    }
    else
    {
        bufferupSize = MOS_ALIGN_CEIL(strideInBytes * heightStrideInRows + (uint32_t)dstAddShiftOffset, 64);
    }

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferupSize, ( void * )linearAddressAligned, bufferUP));
    CM_CHK_CMSTATUS_GOTOFINISH(bufferUP->GetIndex(bufferIndexCM));
    CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM));

    CM_CHK_CMSTATUS_GOTOFINISH( m_device->LoadPredefinedCopyKernel(gpuCopyProgram));
    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram);

    if (direction == CM_FASTCOPY_CPU2GPU)
    {
        if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned_NV12), kernel, "PredefinedGPUCopyKernel"));
        }
        else
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned), kernel, "PredefinedGPUCopyKernel"));
        }
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM ));
    }
    else
    {
        if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned_NV12), kernel, "PredefinedGPUCopyKernel"));
            auxiliaryBufferupSize = BLOCK_WIDTH * 2 * (heightStrideInRows + copyHeightRow * 1/2);
        }
        else
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned), kernel, "PredefinedGPUCopyKernel"));
            auxiliaryBufferupSize = BLOCK_WIDTH * 2 * heightStrideInRows;
        }
        hybridCopyAuxSysMem = (unsigned char*)MOS_AlignedAllocMemory(auxiliaryBufferupSize, PAGE_ALIGNED);
        if(!hybridCopyAuxSysMem)
        {
            CM_ASSERTMESSAGE("Error: Out of system memory.");
            return CM_OUT_OF_HOST_MEMORY;
        }
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(auxiliaryBufferupSize, (void*)hybridCopyAuxSysMem, hybridCopyAuxBufferUP));
        CM_CHK_CMSTATUS_GOTOFINISH(hybridCopyAuxBufferUP->GetIndex(hybridCopyAuxIndexCM));

        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), &copyWidthByte ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( SurfaceIndex ), hybridCopyAuxIndexCM ));
    }

    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInBytes ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &dstAddShiftOffset ));

    threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_WIDTH );
    threadHeight = ( uint32_t )ceil( ( double )copyHeightRow/BLOCK_HEIGHT );

    threadNum = threadWidth * threadHeight;
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
    CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
    CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, event, threadSpace));

    if(event)
    {
        CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status));
        while(status != CM_STATUS_FINISHED)
        {
            if (status == CM_STATUS_RESET)
            {
                hr = CM_TASK_MEDIA_RESET;
                goto finish;
            }
            CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status));
        }
    }
    // CPU copy of the unaligned data
    if( direction == CM_FASTCOPY_GPU2CPU)
    {
        uint32_t readOffset = 0;
        uint32_t copyLines = 0;
        unsigned char* startBuffer = (unsigned char*)linearAddressAligned;

        copyLines = (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) ? heightStrideInRows + MOS_MIN(heightStrideInRows, height) * 1 / 2 : heightStrideInRows;

        for(uint32_t i = 0; i < copyLines; ++i)
        {
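            // The GPU kernel stages each row's unaligned head and tail into
            // hybridCopyAuxSysMem (BLOCK_WIDTH bytes each, 2*BLOCK_WIDTH per
            // row); stitch them back into the destination on the CPU here.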
            // copy the beginning of the line
            size_t beginLineWriteOffset = strideInBytes * i + dstAddShiftOffset;
            uint32_t mod = ((uintptr_t)startBuffer + beginLineWriteOffset) < BLOCK_WIDTH ? ((uintptr_t)startBuffer + beginLineWriteOffset) : ((uintptr_t)startBuffer + beginLineWriteOffset) & (BLOCK_WIDTH - 1);
            uint32_t beginLineCopySize = (mod == 0) ? 0 : (BLOCK_WIDTH - mod);
            // fix the copy size for cases where the surface width is small
            if((beginLineCopySize > widthByte) || ( beginLineCopySize == 0 && widthByte < BLOCK_WIDTH ) )
            {
                beginLineCopySize = widthByte;
            }
            if(beginLineCopySize > 0)
            {
                CmSafeMemCopy((void *)( (unsigned char *)startBuffer + beginLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset), beginLineCopySize);
            }

            // copy the end of the line
            uint32_t alignedWrites = (copyWidthByte - beginLineCopySize) &~ (BLOCK_WIDTH - 1);
            uint32_t endLineWriteOffset = beginLineWriteOffset + alignedWrites + beginLineCopySize;
            uint32_t endLineCopySize = dstAddShiftOffset + i * strideInBytes + copyWidthByte - endLineWriteOffset;
            if(endLineCopySize > 0 && endLineWriteOffset > beginLineWriteOffset)
            {
                CmSafeMemCopy((void *)((unsigned char *)startBuffer + endLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset + BLOCK_WIDTH), endLineCopySize);
            }
            readOffset += (BLOCK_WIDTH * 2);
        }
    }

    CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(event));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(bufferUP));
    if (direction == CM_FASTCOPY_GPU2CPU)
    {
        if(hybridCopyAuxBufferUP)
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(hybridCopyAuxBufferUP));
        }
        if(hybridCopyAuxSysMem)
        {
            MOS_AlignedFreeMemory(hybridCopyAuxSysMem);
            hybridCopyAuxSysMem = nullptr;
        }
    }
finish:
    if(hr != CM_SUCCESS)
    {
        if(bufferUP == nullptr)
        {
            // the caller needs to know whether the failure was caused by running out of BufferUP resources
            hr = CM_GPUCOPY_OUT_OF_RESOURCE;
        }

        if(event)                 DestroyEventFast(event);
        if(kernel)                m_device->DestroyKernel(kernel);
        if(threadSpace)           m_device->DestroyThreadSpace(threadSpace);
        if(gpuCopyTask)           m_device->DestroyTask(gpuCopyTask);
        if(bufferUP)              m_device->DestroyBufferUP(bufferUP);
        if(hybridCopyAuxBufferUP) m_device->DestroyBufferUP(hybridCopyAuxBufferUP);
        if(hybridCopyAuxSysMem)   { MOS_AlignedFreeMemory(hybridCopyAuxSysMem); hybridCopyAuxSysMem = nullptr; }
    }

    return hr;
}
//*-----------------------------------------------------------------------------
//! Enqueue a task, which contains one pre-defined kernel to
//! copy from surface to host memory or from host memory to surface
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! the GPU to finish executing the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check whether the task has finished.
//! INPUT:
//!     1) Pointer to the CmSurface2D
//!     2) Pointer to the host memory
//!     3) Width stride in bytes; if there is no padding in system memory, set it to zero.
//!     4) Height stride in rows; if there is no padding in system memory, set it to zero.
//!     5) Copy direction, cpu->gpu (linear->tiled) or gpu->cpu (tiled->linear)
//!     6) Reference to the pointer to CMEvent
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memory;
//!     CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::EnqueueCopyInternal(CmSurface2DRT* surface,
                                       unsigned char* sysMem,
                                       const uint32_t widthStride,
                                       const uint32_t heightStride,
                                       CM_GPUCOPY_DIRECTION direction,
                                       const uint32_t option,
                                       CmEvent* & event)
{
    int32_t hr = CM_FAILURE;
    uint32_t width = 0;
    uint32_t height = 0;
    uint32_t sizePerPixel = 0;
    CM_SURFACE_FORMAT format = CM_SURFACE_FORMAT_INVALID;

    if (surface)
    {
        CM_CHK_CMSTATUS_GOTOFINISH(surface->GetSurfaceDesc(width, height, format, sizePerPixel));
    }
    else
    {
        return CM_GPUCOPY_INVALID_SURFACES;
    }

    if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
    {
        hr = EnqueueCopyInternal_2Planes(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event);
    }
    else
    {
        hr = EnqueueCopyInternal_1Plane(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event);
    }

finish:
    return hr;
}

int32_t CmQueueRT::EnqueueCopyInternal_1Plane(CmSurface2DRT* surface,
                                              unsigned char* sysMem,
                                              CM_SURFACE_FORMAT format,
                                              const uint32_t widthInPixel,
                                              const uint32_t widthStride,
                                              const uint32_t heightInRow,
                                              const uint32_t heightStride,
                                              const uint32_t sizePerPixel,
                                              CM_GPUCOPY_DIRECTION direction,
                                              const uint32_t option,
                                              CmEvent* & event )
{
    int32_t hr = CM_SUCCESS;
    uint32_t tempHeight = heightInRow;
    uint32_t strideInBytes = widthStride;
    uint32_t strideInDwords = 0;
    uint32_t heightStrideInRows = heightStride;
    uint32_t addedShiftLeftOffset = 0;
    size_t linearAddress = (size_t)sysMem;
    size_t linearAddressAligned = 0;

    CmKernel *kernel = nullptr;
    CmBufferUP *cmbufferUP = nullptr;
    SurfaceIndex *bufferIndexCM = nullptr;
    SurfaceIndex *surf2DIndexCM = nullptr;
    CmThreadSpace *threadSpace = nullptr;
    CmTask *gpuCopyTask = nullptr;
    CmEvent *internalEvent = nullptr;

    uint32_t threadWidth = 0;
    uint32_t threadHeight = 0;
    uint32_t threadNum = 0;
    uint32_t widthDword = 0;
    uint32_t widthByte = 0;
    uint32_t copyWidthByte = 0;
    uint32_t copyHeightRow = 0;
    uint32_t sliceCopyHeightRow = 0;
    uint32_t sliceCopyBufferUPSize = 0;
    int32_t totalBufferUPSize = 0;
    uint32_t startX = 0;
    uint32_t startY = 0;
    bool blSingleEnqueue = true;
    CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;

    PCM_HAL_STATE cmHalState =
        ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;

    widthByte = widthInPixel * sizePerPixel;

    // Align the width with regard to the stride
    if(strideInBytes == 0)
    {
        strideInBytes = widthByte;
    }

    if(heightStrideInRows == 0)
    {
        heightStrideInRows = heightInRow;
    }

    // the actual copy region
    copyWidthByte = MOS_MIN(strideInBytes, widthByte);
    copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow);

    // Make sure the stride and the start address of the system memory are 16-byte aligned.
    // If there is no padding in the system memory, strideInBytes equals widthByte.
    if(strideInBytes & 0xf)
    {
        CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned.");
        return CM_GPUCOPY_INVALID_STRIDE;
    }
    if((linearAddress & 0xf) || (linearAddress == 0))
    {
        CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
        return CM_GPUCOPY_INVALID_SYSMEM;
    }

    // Calculate the actual total size of the system memory
    totalBufferUPSize = strideInBytes * heightStrideInRows;

    // Check the thread space width here
    if( copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH *4 )
    {   // each thread handles a 128x8 block of data; this API fails if the copy exceeds the maximum thread space size
        CM_ASSERTMESSAGE("Error: Invalid copy size.");
        return CM_GPUCOPY_INVALID_SIZE;
    }

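    // A BufferUP is limited to CM_MAX_1D_SURF_WIDTH bytes, so oversized
    // copies are split into slices and enqueued one slice at a time.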
    while (totalBufferUPSize > 0)
    {
        if (sizeof (void *) == 8 ) //64-bit
        {
            linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
        }
        else //32-bit
        {
            linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
        }

        // Calculate the left shift offset
        addedShiftLeftOffset = (uint32_t)(linearAddress - linearAddressAligned);
        totalBufferUPSize += addedShiftLeftOffset;

        if (totalBufferUPSize > CM_MAX_1D_SURF_WIDTH)
        {
            blSingleEnqueue = false;
            sliceCopyHeightRow = ((CM_MAX_1D_SURF_WIDTH - addedShiftLeftOffset)/(strideInBytes*(BLOCK_HEIGHT * INNER_LOOP))) * (BLOCK_HEIGHT * INNER_LOOP);
            sliceCopyBufferUPSize = sliceCopyHeightRow * strideInBytes + addedShiftLeftOffset;
            tempHeight = sliceCopyHeightRow;
        }
        else
        {
            sliceCopyHeightRow = copyHeightRow;
            sliceCopyBufferUPSize = totalBufferUPSize;
            if (!blSingleEnqueue)
            {
                tempHeight = sliceCopyHeightRow;
            }
        }

        // Check the thread space height here
        if(sliceCopyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP )
        {   // each thread handles a 128x8 block of data; this API fails if the copy exceeds the maximum thread space size
            CM_ASSERTMESSAGE("Error: Invalid copy size.");
            return CM_GPUCOPY_INVALID_SIZE;
        }

        kernel = nullptr;
        CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateBufferUP( sliceCopyBufferUPSize, ( void * )linearAddressAligned, cmbufferUP ));
        CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);

        // Configure the memory object control for the BufferUP to solve the cache-line issue.
        if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired())
        {
            CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
        }
        CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, sliceCopyHeightRow, format, direction, gpuCopyKernelParam));
        CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
        kernel = gpuCopyKernelParam->kernel;

        CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);

        CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);
        CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->GetIndex( bufferIndexCM ));
        CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex( surf2DIndexCM ));

1574 threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_PIXEL_WIDTH/4 );
1575 threadHeight = ( uint32_t )ceil( ( double )sliceCopyHeightRow/BLOCK_HEIGHT/INNER_LOOP );
1576 threadNum = threadWidth * threadHeight;
1577 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
1578 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
1579
1580 if(direction == CM_FASTCOPY_GPU2CPU)
1581 {
1582 surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag
1583 }
1584
1585 if( direction == CM_FASTCOPY_CPU2GPU)
1586 {
1587 if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
1588 {
1589 CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED));
1590 }
1591 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM) );
1592 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM ));
1593 }
1594 else
1595 {
1596 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM ));
1597 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM ));
1598 }
1599
1600
1601 widthDword = (uint32_t)ceil((double)widthByte / 4);
1602 strideInDwords = (uint32_t)ceil((double)strideInBytes / 4);
1603
1604 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInDwords ));
1605 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows ));
1606 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &addedShiftLeftOffset ));
1607 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), &threadHeight ));
1608
1609 if (direction == CM_FASTCOPY_GPU2CPU) //GPU-->CPU, read
1610 {
1611 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &widthDword ));
1612 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &tempHeight ));
1613 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 8, sizeof(uint32_t), &startX));
1614 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 9, sizeof(uint32_t), &startY));
1615 }
1616 else //CPU-->GPU, write
1617 {
1618 //this only works for the kernel surfaceCopy_write_32x32
1619 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &startX ));
1620 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &startY ));
1621 }
1622
1623 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
1624 CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
1625 if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
1626 {
1627 // disable turbo
1628 CM_TASK_CONFIG taskConfig;
1629 CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
1630 taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
1631 gpuCopyTask->SetProperty(taskConfig);
1632 }
1633 CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, internalEvent,
1634 threadSpace));
1635
1636 GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
1637
1638 //update for next slice
1639 linearAddress += sliceCopyBufferUPSize - addedShiftLeftOffset;
1640 totalBufferUPSize -= sliceCopyBufferUPSize;
1641 copyHeightRow -= sliceCopyHeightRow;
1642 startX = 0;
1643 startY += sliceCopyHeightRow;
1644
1645 if(totalBufferUPSize > 0) //Intermediate event, we don't need it
1646 {
1647 CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
1648 }
1649 else // Last event: keep it or destroy it as the caller requested
1650 {
1651 if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent))
1652 {
1653 CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished());
1654 }
1655
1656 if(event == CM_NO_EVENT) //User doesn't need CmEvent for this copy
1657 {
1658 event = nullptr;
1659 CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
1660 }
1661 else //User needs this CmEvent
1662 {
1663 event = internalEvent;
1664 }
1665 }
1666
1667 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
1668 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
1669 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUP));
1670 }
1671
1672 finish:
1673
1674 if(hr != CM_SUCCESS)
1675 {
1676 if(cmbufferUP == nullptr)
1677 {
1678 // The caller needs to know whether the failure was caused by running out of BufferUP resources.
1679 hr = CM_GPUCOPY_OUT_OF_RESOURCE;
1680 }
1681
1682 if(kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
1683 if(threadSpace) m_device->DestroyThreadSpace(threadSpace);
1684 if(gpuCopyTask) m_device->DestroyTask(gpuCopyTask);
1685 if(cmbufferUP) m_device->DestroyBufferUP(cmbufferUP);
1686 if(internalEvent) DestroyEventFast(internalEvent);
1687
1688 // CM_FAILURE for all the other errors
1689 // return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens
1690 if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT)
1691 {
1692 hr = CM_FAILURE;
1693 }
1694 }
1695
1696 return hr;
1697 }
1698
1699 int32_t CmQueueRT::EnqueueCopyInternal_2Planes(CmSurface2DRT* surface,
1700 unsigned char* sysMem,
1701 CM_SURFACE_FORMAT format,
1702 const uint32_t widthInPixel,
1703 const uint32_t widthStride,
1704 const uint32_t heightInRow,
1705 const uint32_t heightStride,
1706 const uint32_t sizePerPixel,
1707 CM_GPUCOPY_DIRECTION direction,
1708 const uint32_t option,
1709 CmEvent* & event)
1710 {
1711 int32_t hr = CM_SUCCESS;
1712 uint32_t strideInBytes = widthStride;
1713 uint32_t strideInDwords = 0;
1714 uint32_t heightStrideInRows = heightStride;
1715 size_t linearAddressY = 0;
1716 size_t linearAddressUV = 0;
1717 size_t linearAddressAlignedY = 0;
1718 size_t linearAddressAlignedUV = 0;
1719 uint32_t addedShiftLeftOffsetY = 0;
1720 uint32_t addedShiftLeftOffsetUV = 0;
1721
1722 CmKernel *kernel = nullptr;
1723 CmBufferUP *cmbufferUPY = nullptr;
1724 CmBufferUP *cmbufferUPUV = nullptr;
1725 SurfaceIndex *bufferUPIndexY = nullptr;
1726 SurfaceIndex *bufferUPIndexUV = nullptr;
1727 SurfaceIndex *surf2DIndexCM = nullptr;
1728 CmThreadSpace *threadSpace = nullptr;
1729 CmTask *gpuCopyTask = nullptr;
1730 CmEvent *internalEvent = nullptr;
1731
1732 uint32_t threadWidth = 0;
1733 uint32_t threadHeight = 0;
1734 uint32_t threadNum = 0;
1735 uint32_t widthDword = 0;
1736 uint32_t widthByte = 0;
1737 uint32_t copyWidthByte = 0;
1738 uint32_t copyHeightRow = 0;
1739 uint32_t bufferUPYSize = 0;
1740 uint32_t bufferUPUVSize = 0;
1741
1742 CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
1743 PCM_HAL_STATE cmHalState = \
1744 ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
1745
1746 widthByte = widthInPixel * sizePerPixel;
1747
1748 //Align the width regarding stride
1749 if (strideInBytes == 0)
1750 {
1751 strideInBytes = widthByte;
1752 }
1753
1754 if (heightStrideInRows == 0)
1755 {
1756 heightStrideInRows = heightInRow;
1757 }
1758
1759 // the actual copy region
1760 copyWidthByte = MOS_MIN(strideInBytes, widthByte);
1761 copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow);
1762
1763 // Make sure stride and start address of system memory is 16-byte aligned.
1764 // If there is no padding in system memory, strideInBytes equals widthByte.
1765 if (strideInBytes & 0xf)
1766 {
1767 CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned.");
1768 return CM_GPUCOPY_INVALID_STRIDE;
1769 }
1770
1771 //Check thread space width here
1772 if (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH * 4)
1773 { // Each thread handles a 128x8 block of data; this call fails if the copy exceeds the maximum thread-space size.
1774 CM_ASSERTMESSAGE("Error: Invalid copy size.");
1775 return CM_GPUCOPY_INVALID_SIZE;
1776 }
1777
1778 linearAddressY = (size_t)sysMem;
1779 linearAddressUV = (size_t)((char*)sysMem + strideInBytes * heightStrideInRows);
1780
1781 if ((linearAddressY & 0xf) || (linearAddressY == 0) || (linearAddressUV & 0xf))
1782 {
1783 CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
1784 return CM_GPUCOPY_INVALID_SYSMEM;
1785 }
1786
1787 if (sizeof (void *) == 8) //64-bit
1788 {
1789 linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
1790 linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
1791 }
1792 else //32-bit
1793 {
1794 linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
1795 linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
1796 }
1797
1798 //Calculate Left Shift offset
1799 addedShiftLeftOffsetY = (uint32_t)(linearAddressY - linearAddressAlignedY);
1800 addedShiftLeftOffsetUV = (uint32_t)(linearAddressUV - linearAddressAlignedUV);
1801
1802 //Calculate the actual total size of system memory, assuming an NV12/P010/P016 layout
1803 bufferUPYSize = strideInBytes * heightStrideInRows + addedShiftLeftOffsetY;
1804 bufferUPUVSize = strideInBytes * copyHeightRow / 2 + addedShiftLeftOffsetUV;
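// Illustrative arithmetic: a 1920x1080 NV12 copy with strideInBytes = 1920
// yields bufferUPYSize = 1920 * 1080 + addedShiftLeftOffsetY and
// bufferUPUVSize = 1920 * 540 + addedShiftLeftOffsetUV, since the interleaved
// UV plane has half the vertical resolution of the Y plane.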
1805
1806 //Check thread space height here
1807 if (copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP)
1808 { // Each thread handles a 128x8 block of data; this call fails if the copy exceeds the maximum thread-space size.
1809 CM_ASSERTMESSAGE("Error: Invalid copy size.");
1810 return CM_GPUCOPY_INVALID_SIZE;
1811 }
1812
1813 kernel = nullptr;
1814 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPYSize, (void *)linearAddressAlignedY, cmbufferUPY));
1815 CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY);
1816 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPUVSize, (void *)linearAddressAlignedUV, cmbufferUPUV));
1817 CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV);
1818
1819 //Configure memory object control for the two BufferUP to solve the same cache-line coherency issue.
1820 if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired())
1821 {
1822 CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
1823 CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
1824 }
1825 else
1826 {
1827 CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPY)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0));
1828 CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPUV)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0));
1829 }
1830
1831 CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, copyHeightRow, format, direction, gpuCopyKernelParam));
1832 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
1833 kernel = gpuCopyKernelParam->kernel;
1834
1835 CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);
1836
1837 CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY);
1838 CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV);
1839 CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->GetIndex(bufferUPIndexY));
1840 CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->GetIndex(bufferUPIndexUV));
1841 CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM));
1842
1843 threadWidth = (uint32_t)ceil((double)copyWidthByte / BLOCK_PIXEL_WIDTH / 4);
1844 threadHeight = (uint32_t)ceil((double)copyHeightRow / BLOCK_HEIGHT / INNER_LOOP);
1845 threadNum = threadWidth * threadHeight;
1846 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadNum));
1847 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
1848
1849 widthDword = (uint32_t)ceil((double)widthByte / 4);
1850 strideInDwords = (uint32_t)ceil((double)strideInBytes / 4);
1851
1852 if (direction == CM_FASTCOPY_CPU2GPU) //Write
1853 {
1854 //Input BufferUP_Y and BufferUP_UV
1855 if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
1856 {
1857 CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED));
1858 }
1859 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), bufferUPIndexY));
1860 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexUV));
1861 //Output Surface2D
1862 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), surf2DIndexCM));
1863 //Other parameters
1864 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords));
1865 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows));
1866 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY));
1867 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV));
1868 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight));
1869 }
1870 else //Read
1871 {
1872 //Input Surface2D
1873 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surf2DIndexCM));
1874 //Output BufferUP_Y and BufferUP_UV
1875 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexY));
1876 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), bufferUPIndexUV));
1877 //Other parameters
1878 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords));
1879 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows));
1880 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY));
1881 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV));
1882 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight));
1883 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(8, sizeof(uint32_t), &widthDword));
1884 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(9, sizeof(uint32_t), &heightInRow));
1885
1886 surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag
1887 }
1888
1889 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
1890 CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel(kernel));
1891 if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
1892 {
1893 // disable turbo
1894 CM_TASK_CONFIG taskConfig;
1895 CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
1896 taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
1897 gpuCopyTask->SetProperty(taskConfig);
1898 }
1899 CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, internalEvent,
1900 threadSpace));
1901
1902 GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
1903
1904 if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent))
1905 {
1906 CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished());
1907 }
1908
1909 if (event == CM_NO_EVENT) //User doesn't need CmEvent for this copy
1910 {
1911 event = nullptr;
1912 CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
1913 }
1914 else //User needs this CmEvent
1915 {
1916 event = internalEvent;
1917 }
1918
1919 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
1920 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
1921 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPY));
1922 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPUV));
1923
1924 finish:
1925
1926 if (hr != CM_SUCCESS)
1927 {
1928 if ((cmbufferUPY == nullptr) || (cmbufferUPUV == nullptr))
1929 {
1930 // The caller needs to know whether the failure was caused by running out of BufferUP resources.
1931 hr = CM_GPUCOPY_OUT_OF_RESOURCE;
1932 }
1933
1934 if (kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
1935 if (threadSpace) m_device->DestroyThreadSpace(threadSpace);
1936 if (gpuCopyTask) m_device->DestroyTask(gpuCopyTask);
1937 if (cmbufferUPY) m_device->DestroyBufferUP(cmbufferUPY);
1938 if (cmbufferUPUV) m_device->DestroyBufferUP(cmbufferUPUV);
1939 if (internalEvent) DestroyEventFast(internalEvent);
1940
1941 // CM_FAILURE for all the other errors
1942 // return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens
1943 if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT)
1944 {
1945 hr = CM_FAILURE;
1946 }
1947 }
1948
1949 return hr;
1950 }
1951
1952 //*-----------------------------------------------------------------------------
1953 //! Enqueue a task containing one pre-defined kernel that copies from video memory to video memory.
1954 //! This is a non-blocking call, i.e. it returns immediately without waiting for
1955 //! GPU to finish the execution of the task.
1956 //! A CmEvent is generated each time a task is enqueued. The CmEvent can
1957 //! be used to check if the task finishes.
1958 //! INPUT:
1959 //! 1) Pointer to the CmSurface2D as copy destination
1960 //! 2) Pointer to the CmSurface2D as copy source
1961 //! 3) Option passed from user, blocking copy, non-blocking copy or disable turbo boost
1962 //! 4) Reference to the pointer to CMEvent
1963 //! OUTPUT:
1964 //! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
1965 //!     CM_OUT_OF_HOST_MEMORY if out of host memory;
1966 //! CM_GPUCOPY_INVALID_SURFACES if input/output surfaces' width/format are different or
1967 //! input surface's height is larger than output surface's
1968 //! Restrictions:
1969 //! 1) Surface's width should be 64-byte aligned.
1970 //! 2) The input surface's width/height/format should be the same as output surface's.
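//!     Example (an illustrative sketch, not code from this file; assumes a
//!     valid queue and two surfaces that satisfy the restrictions above):
//!         CmEvent *copyEvent = nullptr;
//!         queue->EnqueueCopyGPUToGPU(dstSurf, srcSurf,
//!                                    CM_FASTCOPY_OPTION_BLOCKING, copyEvent);
//!         // With CM_FASTCOPY_OPTION_BLOCKING the call also waits for the
//!         // task to finish before returning.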
1971 //*-----------------------------------------------------------------------------
1972 CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToGPU( CmSurface2D* outputSurface, CmSurface2D* inputSurface, uint32_t option, CmEvent* & event )
1973 {
1974 INSERT_API_CALL_LOG(GetHalState());
1975
1976 if (!m_device->HasGpuCopyKernel())
1977 {
1978 return CM_NOT_IMPLEMENTED;
1979 }
1980
1981 uint32_t srcSurfaceWidth = 0;
1982 uint32_t srcSurfaceHeight = 0;
1983 uint32_t dstSurfaceWidth = 0;
1984 uint32_t dstSurfaceHeight = 0;
1985
1986 CM_SURFACE_FORMAT srcSurfaceFormat = CM_SURFACE_FORMAT_INVALID;
1987 CM_SURFACE_FORMAT dstSurfaceFormat = CM_SURFACE_FORMAT_INVALID;
1988
1989 int32_t hr = CM_SUCCESS;
1990 uint32_t srcSizePerPixel = 0;
1991 uint32_t dstSizePerPixel = 0;
1992 uint32_t threadWidth = 0;
1993 uint32_t threadHeight = 0;
1994
1995 CmKernel *kernel = nullptr;
1996 SurfaceIndex *surfaceInputIndex = nullptr;
1997 SurfaceIndex *surfaceOutputIndex = nullptr;
1998 CmThreadSpace *threadSpace = nullptr;
1999 CmTask *task = nullptr;
2000 uint32_t srcSurfAlignedWidthInBytes = 0;
2001 CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
2002
2003 if ((outputSurface == nullptr) || (inputSurface == nullptr))
2004 {
2005 CM_ASSERTMESSAGE("Error: Pointer to input surface or output surface is null.");
2006 return CM_FAILURE;
2007 }
2008
2009 PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
2010 CmSurface2DRT *outputSurfaceRT = static_cast<CmSurface2DRT *>(outputSurface);
2011 CmSurface2DRT *inputSurfaceRT = static_cast<CmSurface2DRT *>(inputSurface);
2012 if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
2013 {
2014 CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->SetCompressionMode(MEMCOMP_DISABLED));
2015 }
2016
2017 CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->GetSurfaceDesc(dstSurfaceWidth, dstSurfaceHeight, dstSurfaceFormat, dstSizePerPixel));
2018 CM_CHK_CMSTATUS_GOTOFINISH(inputSurfaceRT->GetSurfaceDesc(srcSurfaceWidth, srcSurfaceHeight, srcSurfaceFormat, srcSizePerPixel));
2019
2020 if ((dstSurfaceWidth != srcSurfaceWidth) ||
2021 (dstSurfaceHeight < srcSurfaceHeight) || //relax the restriction
2022 (dstSizePerPixel != srcSizePerPixel))
2023 {
2024 CM_ASSERTMESSAGE("Error: Size of dest surface does not match src surface.");
2025 return CM_GPUCOPY_INVALID_SURFACES;
2026 }
2027
2028 // Support copies between Format_A8R8G8B8 and Format_A8B8G8R8
2029 if (dstSurfaceFormat != srcSurfaceFormat)
2030 {
2031 if (!((dstSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8)) &&
2032 !((dstSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8)))
2033 {
2034 CM_ASSERTMESSAGE("Error: When src and dst formats differ, only copies between Format_A8R8G8B8 and Format_A8B8G8R8 are supported.");
2035 return CM_GPUCOPY_INVALID_SURFACES;
2036 }
2037 }
2038
2039 // 128Bytes aligned
2040 srcSurfAlignedWidthInBytes = (uint32_t)(ceil((double)srcSurfaceWidth*srcSizePerPixel / BLOCK_PIXEL_WIDTH / 4) * (BLOCK_PIXEL_WIDTH * 4));
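// Illustrative arithmetic: a 1920-pixel-wide 4-bytes-per-pixel surface gives
// 7680 bytes, which is already a multiple of 128, so
// srcSurfAlignedWidthInBytes stays 7680 and threadWidth below becomes
// 7680 / 128 = 60.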
2041
2042 if (srcSurfaceHeight > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP)
2043 {
2044 CM_ASSERTMESSAGE("Error: Invalid copy size.");
2045 return CM_GPUCOPY_INVALID_SIZE;
2046 }
2047
2048 CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(srcSurfaceWidth*srcSizePerPixel, srcSurfaceHeight, srcSurfaceFormat, CM_FASTCOPY_GPU2GPU, gpuCopyKernelParam));
2049 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
2050
2051 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
2052 kernel = gpuCopyKernelParam->kernel;
2053
2054 CM_CHK_CMSTATUS_GOTOFINISH(inputSurface->GetIndex(surfaceInputIndex));
2055 CM_CHK_CMSTATUS_GOTOFINISH(outputSurface->GetIndex(surfaceOutputIndex));
2056
2057 threadWidth = srcSurfAlignedWidthInBytes / (BLOCK_PIXEL_WIDTH * 4);
2058 threadHeight = (uint32_t)ceil((double)srcSurfaceHeight / BLOCK_HEIGHT / INNER_LOOP);
2059
2060 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight));
2061
2062 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surfaceInputIndex));
2063 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), surfaceOutputIndex));
2064 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(uint32_t), &threadHeight));
2065
2066 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
2067
2068 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
2069 CM_CHK_NULL_GOTOFINISH_CMERROR(task);
2070 CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel(kernel));
2071
2072 if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
2073 {
2074 // disable turbo
2075 CM_TASK_CONFIG taskConfig;
2076 CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
2077 taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
2078 task->SetProperty(taskConfig);
2079 }
2080
2081 CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace));
2082 if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event))
2083 {
2084 CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished());
2085 }
2086
2087 finish:
2088
2089 if (kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
2090 if (threadSpace) m_device->DestroyThreadSpace(threadSpace);
2091 if (task) m_device->DestroyTask(task);
2092
2093 return hr;
2094 }
2095
2096 //*-----------------------------------------------------------------------------
2097 //! Enqueue a task containing one pre-defined kernel that copies from system memory to system memory.
2098 //! This is a non-blocking call, i.e. it returns immediately without waiting for
2099 //! GPU to finish the execution of the task.
2100 //! A CmEvent is generated each time a task is enqueued. The CmEvent can be used to check if the task finishes.
2101 //! If the size is less than 4KB (the amount one thread copies), the CPU performs the copy and event is set to nullptr.
2102 //!
2103 //! INPUT:
2104 //! 1) Pointer to the system memory as copy destination
2105 //! 2) Pointer to the system memory as copy source
2106 //! 3) The size in bytes of memory be copied.
2107 //! 4) Option passed from user, blocking copy, non-blocking copy or disable turbo boost
2108 //! 5) Reference to the pointer to CMEvent
2109 //! OUTPUT:
2110 //! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
2111 //!     CM_OUT_OF_HOST_MEMORY if out of host memory;
2112 //! CM_GPUCOPY_INVALID_SYSMEM if the sysMem is not 16-byte aligned or is NULL.
2113 //! CM_GPUCOPY_OUT_OF_RESOURCE if runtime run out of BufferUP.
2114 //!     CM_GPUCOPY_INVALID_SIZE if its size plus the shift-left offset is larger than CM_MAX_1D_SURF_WIDTH.
2115 //! Restrictions:
2116 //! 1) dstSysMem and srcSysMem should be 16-byte aligned.
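//!     Example (an illustrative sketch; dstMem and srcMem are assumed to be
//!     16-byte-aligned allocations of at least `size` bytes):
//!         CmEvent *copyEvent = nullptr;
//!         queue->EnqueueCopyCPUToCPU(dstMem, srcMem, size,
//!                                    CM_FASTCOPY_OPTION_BLOCKING, copyEvent);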
2117 //*-----------------------------------------------------------------------------
2118 CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToCPU( unsigned char* dstSysMem, unsigned char* srcSysMem, uint32_t size, uint32_t option, CmEvent* & event )
2119 {
2120 INSERT_API_CALL_LOG(GetHalState());
2121
2122 if (!m_device->HasGpuCopyKernel())
2123 {
2124 return CM_NOT_IMPLEMENTED;
2125 }
2126
2127 int hr = CM_SUCCESS;
2128 size_t inputLinearAddress = (size_t )srcSysMem;
2129 size_t outputLinearAddress = (size_t )dstSysMem;
2130
2131 size_t inputLinearAddressAligned = 0;
2132 size_t outputLinearAddressAligned = 0;
2133
2134 CmBufferUP *surfaceInput = nullptr;
2135 CmBufferUP *surfaceOutput = nullptr;
2136 CmKernel *kernel = nullptr;
2137 SurfaceIndex *surfaceInputIndex = nullptr;
2138 SurfaceIndex *surfaceOutputIndex = nullptr;
2139 CmThreadSpace *threadSpace = nullptr;
2140 CmTask *task = nullptr;
2141
2142 int32_t srcLeftShiftOffset = 0;
2143 int32_t dstLeftShiftOffset = 0;
2144 uint32_t threadWidth = 0;
2145 uint32_t threadHeight = 0;
2146 uint32_t threadNum = 0;
2147 uint32_t gpuMemcopySize = 0;
2148 uint32_t cpuMemcopySize = 0;
2149 CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
2150
2151 if((inputLinearAddress & 0xf) || (outputLinearAddress & 0xf) ||
2152 (inputLinearAddress == 0) || (outputLinearAddress == 0))
2153 {
2154 CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
2155 return CM_GPUCOPY_INVALID_SYSMEM;
2156 }
2157
2158 // Get page aligned address
2159 if (sizeof (void *) == 8 ) //64-bit
2160 {
2161 inputLinearAddressAligned = inputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64; // make sure the address page aligned.
2162 outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64; // make sure the address page aligned.
2163 }
2164 else
2165 {
2166 inputLinearAddressAligned = inputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86; // make sure the address page aligned.
2167 outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86; // make sure the address page aligned.
2168 }
2169
2170 srcLeftShiftOffset = (int32_t)(inputLinearAddress - inputLinearAddressAligned) ;
2171 dstLeftShiftOffset = (int32_t)(outputLinearAddress - outputLinearAddressAligned) ;
2172
2173 if(((size + srcLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH)||
2174 ((size + dstLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH))
2175 {
2176 CM_ASSERTMESSAGE("Error: Invalid copy size.");
2177 return CM_GPUCOPY_INVALID_SIZE;
2178 }
2179
2180 threadWidth = 0;
2181 threadHeight = 0;
2182 threadNum = size / BYTE_COPY_ONE_THREAD; // each thread copies 32 x 4 x 32 bytes = 4KB
2183
2184 if( threadNum == 0)
2185 {
2186 //if the data size is less than what one thread copies (4KB), use the CPU instead of the GPU.
2187 CmFastMemCopy((void *)(outputLinearAddress),
2188 (void *)(inputLinearAddress),
2189 size); //SSE copy used in CMRT.
2190
2191 event = nullptr;
2192 return CM_SUCCESS;
2193 }
2194
2195 //Calculate proper thread space's width and height
2196 threadWidth = 1;
2197 threadHeight = threadNum/threadWidth;
2198 while((threadHeight > CM_MAX_THREADSPACE_HEIGHT_FOR_MW))
2199 {
2200 if(threadWidth > CM_MAX_THREADSPACE_WIDTH_FOR_MW)
2201 {
2202 hr = CM_GPUCOPY_INVALID_SIZE; // thread number exceeds 511*511
2203 goto finish;
2204 }
2205 else if (threadWidth == 1)
2206 {
2207 threadWidth = THREAD_SPACE_WIDTH_INCREMENT; // first iteration
2208 threadHeight = threadNum/threadWidth;
2209 }
2210 else
2211 {
2212 threadWidth += THREAD_SPACE_WIDTH_INCREMENT; // increase 8 per iteration
2213 threadHeight = threadNum/threadWidth;
2214 }
2215 }
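// Worked example: size = 4 MB gives threadNum = 4 MB / 4 KB = 1024; the loop
// settles on threadWidth = 8 and threadHeight = 128, which is within the
// 511 x 511 media-walker limit.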
2216
2217 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + srcLeftShiftOffset, (void *)inputLinearAddressAligned,surfaceInput));
2218
2219 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + dstLeftShiftOffset, (void *)outputLinearAddressAligned,surfaceOutput));
2220
2221 CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(size, 0, CM_SURFACE_FORMAT_INVALID, CM_FASTCOPY_CPU2CPU, gpuCopyKernelParam));
2222 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
2223 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
2224 kernel = gpuCopyKernelParam->kernel;
2225
2226 CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceInput);
2227 CM_CHK_CMSTATUS_GOTOFINISH(surfaceInput->GetIndex(surfaceInputIndex));
2228 CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceOutput);
2229 CM_CHK_CMSTATUS_GOTOFINISH(surfaceOutput->GetIndex(surfaceOutputIndex));
2230
2231 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight));
2232 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surfaceInputIndex ));
2233 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surfaceOutputIndex ));
2234 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( int ), &threadWidth ));
2235 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( int ), &threadHeight ));
2236 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( int ), &srcLeftShiftOffset ));
2237 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( int ), &dstLeftShiftOffset ));
2238 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( int ), &size ));
2239
2240 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
2241
2242 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
2243 CM_CHK_NULL_GOTOFINISH_CMERROR(task);
2244 CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel (kernel));
2245
2246 if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
2247 {
2248 // disable turbo
2249 CM_TASK_CONFIG taskConfig;
2250 CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
2251 taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
2252 task->SetProperty(taskConfig);
2253 }
2254
2255 CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace));
2256
2257 if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event))
2258 {
2259 CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished());
2260 }
2261
2262 //Copy the remaining tail, not covered by full GPU threads, using the CPU
2263 gpuMemcopySize = threadHeight * threadWidth * BYTE_COPY_ONE_THREAD;
2264 cpuMemcopySize = size - gpuMemcopySize;
2265
2266 CmFastMemCopy((void *)(outputLinearAddress+gpuMemcopySize),
2267 (void *)(inputLinearAddress+gpuMemcopySize),
2268 cpuMemcopySize); //SSE copy used in CMRT.
2269
2270 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
2271 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(task));
2272 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceOutput)); // reference counting guarantees the task finishes before the BufferUP is actually destroyed.
2273 CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceInput));
2274
2275 GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
2276
2277 finish:
2278 if(hr != CM_SUCCESS)
2279 { //Failed
2280 if( surfaceInput == nullptr || surfaceOutput == nullptr)
2281 {
2282 hr = CM_GPUCOPY_OUT_OF_RESOURCE; // the caller needs to know whether the failure was caused by running out of BufferUP resources.
2283 }
2284 else
2285 {
2286 hr = CM_FAILURE;
2287 }
2288 if(surfaceInput) m_device->DestroyBufferUP(surfaceInput);
2289 if(surfaceOutput) m_device->DestroyBufferUP(surfaceOutput);
2290 if(kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
2291 if(threadSpace) m_device->DestroyThreadSpace(threadSpace);
2292 if(task) m_device->DestroyTask(task);
2293 }
2294
2295 return hr;
2296 }
2297
2298
2299 // Worker thread for copying a video-memory buffer to/from system memory.
2300 // Supports a wait event and provides a notification event.
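// The CopyThreadData payload is allocated by EnqueueBufferCopy with MOS_New
// and owned by this worker, which releases it with MOS_Delete on completion.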
2301 void BufferCopyThread(void* threadData)
2302 {
2303 int hr = CM_SUCCESS;
2304 CopyThreadData* data = (CopyThreadData*)threadData;
2305
2306 CmBuffer_RT* buffer = (CmBuffer_RT*)(data->buffer);
2307 unsigned char* sysMem = (unsigned char*)data->sysMem;
2308 CmEvent* wait_event = (CmEvent*)(data->wait_event);
2309 CmEvent* notify_event = (CmEvent*)(data->event);
2310 CmEventRT* eventRT = dynamic_cast<CmEventRT*>(notify_event);
2311 CM_CHK_NULL_RETURN_VOID(eventRT);
2312 CmEventEx* eex = dynamic_cast<CmEventEx*>(notify_event);
2313
2314 uint32_t offset = data->offset;
2315 uint64_t cpuMemCopySize = data->sysMemSize;
2316 uint64_t ts = 0, te = 0;
2317 MosUtilities::MosQueryPerformanceCounter(&ts);
2318 // CPU buffer copy call with wait event
2319 if(data->dir)
2320 hr = buffer->WriteBuffer(sysMem, wait_event, cpuMemCopySize, offset);
2321 else
2322 hr = buffer->ReadBuffer((unsigned char*)sysMem, wait_event, cpuMemCopySize, offset);
2323 MosUtilities::MosQueryPerformanceCounter(&te);
2324 uint64_t etime = (te - ts)*1000000000 / data->cpuFrrequency;
2325 eventRT->ModifyStatus(CM_STATUS_FINISHED, etime);
2326
2327 MOS_Delete(data);
2328 }
2329
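//*-----------------------------------------------------------------------------
//| Purpose: Copy between a CmBuffer and system memory, using either a CPU
//|          worker thread or a GPU copy kernel. The CPU path is chosen when
//|          option > 0, when offset is non-zero, when the copy is very large,
//|          when sysMem is not 16-byte aligned, or when no GPU copy kernel is
//|          available. A non-zero dir writes from system memory into the
//|          buffer; zero reads from the buffer into system memory.
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------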
2330 int32_t CmQueueRT::EnqueueBufferCopy(CmBuffer* buffer, size_t offset, const unsigned char* sysMem, uint64_t sysMemSize, CM_GPUCOPY_DIRECTION dir, CmEvent* wait_event, CmEvent*& event, unsigned option)
2331 {
2332 INSERT_API_CALL_LOG(GetHalState());
2333 int hr = CM_SUCCESS;
2334 bool bCPUcopy = (option > 0);
2335 if ((offset) || (sysMemSize > 1069551616))
2336 bCPUcopy = true;
2337
2338 MOS_THREADHANDLE workThread = 0;
2339 CmBufferUP* sysUPbuffer = nullptr;
2340 CmBufferUP* surfaceOutput = nullptr;
2341 CmKernel* kernel = nullptr;
2342 SurfaceIndex* vBufferIndex = nullptr;
2343 SurfaceIndex* sysUPIndex = nullptr;
2344 CmThreadSpace* threadSpace = nullptr;
2345 CmTask* task = nullptr;
2346 CM_GPUCOPY_KERNEL* gpuCopyKernelParam = nullptr;
2347
2348 int32_t sysLeftShiftOffset = 0;
2349 int32_t dstLeftShiftOffset = 0;
2350 uint32_t threadWidth = 0;
2351 uint32_t threadHeight = 0;
2352 uint32_t threadNum = 0;
2353 uint32_t copySize = (uint32_t)sysMemSize;
2354 uint32_t cpuMemcopySize = 0;
2355 size_t systemLinearAddressAligned = 0;
2356
2357 threadNum = copySize / BYTE_COPY_ONE_THREAD;
2358
2359 int32_t taskDriverId = -1;
2360 CmEventRT* eventRT = static_cast<CmEventRT*>(event);
2361 hr = CreateEvent((CmTaskInternal *)task, true, taskDriverId, eventRT);
2362 event = static_cast<CmEvent*>(eventRT);
2363
2364 if (((size_t)sysMem & 0xf) || (sysMem == 0))
2365 {
2366 CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
2367 bCPUcopy = true;
2368 }
2369
2370 // Get page aligned address
2371 if (sizeof(void*) == 8) //64-bit
2372 {
2373 systemLinearAddressAligned = (size_t)sysMem & ADDRESS_PAGE_ALIGNMENT_MASK_X64; // make sure the address page aligned.
2374 }
2375 else
2376 {
2377 systemLinearAddressAligned = (size_t)sysMem & ADDRESS_PAGE_ALIGNMENT_MASK_X86; // make sure the address page aligned.
2378 }
2379
2380 sysLeftShiftOffset = (int32_t)((size_t)sysMem - systemLinearAddressAligned);
2381
2382
2383 if ((sysMemSize + sysLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH)
2384 {
2385 CM_ASSERTMESSAGE("Error: Invalid copy size.");
2386 return CM_GPUCOPY_INVALID_SIZE;
2387 }
2388
2389 if (!m_device->HasGpuCopyKernel())
2390 {
2391 //return CM_NOT_IMPLEMENTED;
2392 bCPUcopy = true;
2393 }
2394
2395 if (sysMem == nullptr)
2396 {
2397 CM_ASSERTMESSAGE("Error: Pointer to system memory is null.");
2398 return CM_NULL_POINTER;
2399 }
2400
2401 threadWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW;
2402 threadHeight = (threadNum + threadWidth - 1) / threadWidth;
2403 while (threadHeight > (CM_MAX_THREADSPACE_HEIGHT_SKLUP_FOR_MW >>1))
2404 {
2405 threadWidth++; // THREAD_SPACE_WIDTH_INCREMENT; //threadWidth << 1;
2406 threadHeight = (threadNum + threadWidth - 1) / threadWidth;
2407
2408 if (threadWidth > (CM_MAX_THREADSPACE_WIDTH_SKLUP_FOR_MW >>1))
2409 {
2410 hr = CM_GPUCOPY_INVALID_SIZE; // thread number exceeds 1023*1023
2411 goto finish;
2412 }
2413 }
2414
2415 if (bCPUcopy)
2416 {
2417 void* data = MOS_New(CopyThreadData); // allocated via the MOS utility; released by BufferCopyThread with MOS_Delete
2418
2419 ((CopyThreadData*)data)->buffer = dynamic_cast<CmBuffer_RT*>(buffer);
2420 ((CopyThreadData*)data)->offset = offset;
2421 ((CopyThreadData*)data)->sysMem = (unsigned char*)sysMem;
2422 ((CopyThreadData*)data)->sysMemSize = sysMemSize;
2423 ((CopyThreadData*)data)->dir = dir;
2424 ((CopyThreadData*)data)->wait_event = wait_event;
2425 ((CopyThreadData*)data)->event = event;
2426 ((CopyThreadData*)data)->option = option;
2427 ((CopyThreadData*)data)->pCmQueueRT = this;
2428 ((CopyThreadData*)data)->cpuFrrequency = m_CPUperformanceFrequency;
2429
2430 workThread = MosUtilities::MosCreateThread((void*)BufferCopyThread, data);
2431 if (workThread)
2432 hr = CM_SUCCESS;
2433 else
2434 hr = CM_INVALID_MOS_RESOURCE_HANDLE;
2435 }
2436 else
2437 {
2438 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP((int)sysMemSize + sysLeftShiftOffset, (void*)systemLinearAddressAligned, sysUPbuffer));
2439
2440 CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel((int)sysMemSize, 0, CM_SURFACE_FORMAT_INVALID, CM_FASTCOPY_CPU2CPU, gpuCopyKernelParam));
2441 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
2442 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
2443 kernel = gpuCopyKernelParam->kernel;
2444
2445 CM_CHK_NULL_GOTOFINISH_CMERROR(buffer);
2446 CM_CHK_CMSTATUS_GOTOFINISH(buffer->GetIndex(vBufferIndex));
2447 CM_CHK_NULL_GOTOFINISH_CMERROR(sysUPbuffer);
2448 CM_CHK_CMSTATUS_GOTOFINISH(sysUPbuffer->GetIndex(sysUPIndex));
2449 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadNum));
2450
2451 if (dir)
2452 {
2453 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), sysUPIndex));
2454 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), vBufferIndex));
2455 }
2456 else
2457 {
2458 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), vBufferIndex));
2459 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), sysUPIndex));
2460 }
2461
2462 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(int), &threadWidth));
2463 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(int), &threadHeight));
2464 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(int), &offset));
2465 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(int), &sysLeftShiftOffset));
2466 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(int), &copySize));
2467
2468 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
2469
2470 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
2471 CM_CHK_NULL_GOTOFINISH_CMERROR(task);
2472 CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel(kernel));
2473
2474 CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace));
2475 }
2476
2477 finish:
2478 if (hr != CM_SUCCESS)
2479 { //Failed
2480 if (sysUPbuffer == nullptr || buffer == nullptr)
2481 {
2482 hr = CM_GPUCOPY_OUT_OF_RESOURCE; // the caller needs to know whether the failure was caused by running out of BufferUP resources.
2483 }
2484 else
2485 {
2486 hr = CM_FAILURE;
2487 }
2488 }
2489
2490 if (sysUPbuffer) m_device->DestroyBufferUP(sysUPbuffer);
2491 if (kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
2492 if (threadSpace) m_device->DestroyThreadSpace(threadSpace);
2493 if (task) m_device->DestroyTask(task);
2494
2495 return hr;
2496 }
2497
2498
2499
2500
2501 //*----------------------------------------------------------------------------------------
2502 //| Purpose: Pop task from flushed Queue, Update surface state and Destroy the task
2503 //| Notes:
2504 //*----------------------------------------------------------------------------------------
2505 void CmQueueRT::PopTaskFromFlushedQueue()
2506 {
2507 CmTaskInternal* topTask = (CmTaskInternal*)m_flushedTasks.Pop();
2508
2509 if ( topTask != nullptr )
2510 {
2511 CmEventRT *event = nullptr;
2512 topTask->GetTaskEvent( event );
2513 if ( event != nullptr )
2514 {
2515 LARGE_INTEGER nTime;
2516 if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nTime.QuadPart )) )
2517 {
2518 CM_ASSERTMESSAGE("Error: Query performace counter failure.");
2519 }
2520 else
2521 {
2522 event->SetCompleteTime( nTime );
2523 }
2524 }
2525
2526 CmTaskInternal::Destroy( topTask );
2527 }
2528 return;
2529 }
2530
2531 int32_t CmQueueRT::TouchFlushedTasks( )
2532 {
2533 int32_t hr = CM_SUCCESS;
2534
2535 if (m_flushedTasks.IsEmpty())
2536 {
2537 if (!m_enqueuedTasks.IsEmpty())
2538 {
2539 // if FlushedQueue is empty and EnqueuedQueue is not empty
2540 // try flush task to FlushedQueue
2541 hr = FlushTaskWithoutSync();
2542 if (FAILED(hr))
2543 {
2544 return hr;
2545 }
2546 }
2547 else
2548 { // no task in flushedQueue and EnqueuedQueue, just skip
2549 return CM_SUCCESS;
2550 }
2551 }
2552
2553 // Flush FlushedQueue
2554 hr = QueryFlushedTasks();
2555
2556 return hr;
2557 }
2558
2559 //*-----------------------------------------------------------------------------
2560 //! Flush the queue, i.e. submit all tasks in the queue for execution according
2561 //! to their order in the queue. The queue will be empty after the flush.
2562 //! This is a non-blocking call, i.e. it returns immediately without waiting for
2563 //! the GPU to finish executing the tasks.
2564 //! INPUT:
2565 //! OUTPUT:
2566 //! CM_SUCCESS if all tasks in the queue are submitted
2567 //! CM_FAILURE otherwise.
2568 //!      More error codes may be added in the future.
2569 //!
2570 //*-----------------------------------------------------------------------------
2571 int32_t CmQueueRT::QueryFlushedTasks()
2572 {
2573 int32_t hr = CM_SUCCESS;
2574
2575 m_criticalSectionFlushedTask.Acquire();
2576 while( !m_flushedTasks.IsEmpty() )
2577 {
2578 CmTaskInternal* task = (CmTaskInternal*)m_flushedTasks.Top();
2579 CM_CHK_NULL_GOTOFINISH_CMERROR(task);
2580
2581 CM_STATUS status = CM_STATUS_FLUSHED ;
2582 task->GetTaskStatus(status);
2583 if( status == CM_STATUS_FINISHED )
2584 {
2585 PopTaskFromFlushedQueue();
2586 }
2587 else
2588 {
2589 // media reset
2590 if (status == CM_STATUS_RESET)
2591 {
2592 PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
2593
2594 // Clear task status table in Cm Hal State
2595 int32_t taskId = 0;
2596 CmEventRT*pTopTaskEvent = nullptr;
2597 task->GetTaskEvent(pTopTaskEvent);
2598 CM_CHK_NULL_GOTOFINISH_CMERROR(pTopTaskEvent);
2599
2600 pTopTaskEvent->GetTaskDriverId(taskId);
2601 cmData->cmHalState->taskStatusTable[taskId] = CM_INVALID_INDEX;
2602
2603 //Pop task and Destroy it
2604 PopTaskFromFlushedQueue();
2605 }
2606
2607 // It is an in-order queue; if this one hasn't finished,
2608 // the following ones haven't finished either.
2609 break;
2610 }
2611 }
2612
2613 finish:
2614 m_criticalSectionFlushedTask.Release();
2615
2616 return hr;
2617 }
2618
2619 //*-----------------------------------------------------------------------------
2620 //! This is a blocking call. It will NOT return until
2621 //! all tasks on the GPU and all tasks in the queue finish execution.
2622 //! It will first flush the queue if the queue is not empty.
2623 //! INPUT:
2624 //! OUTPUT:
2625 //! CM_SUCCESS if all tasks finish execution.
2626 //! CM_FAILURE otherwise.
2627 //!      More error codes may be added in the future.
2628 //*-----------------------------------------------------------------------------
2629 CM_RT_API int32_t CmQueueRT::DestroyEvent( CmEvent* & event )
2630 {
2631
2632 CLock Lock(m_criticalSectionEvent);
2633
2634 if (event == nullptr)
2635 {
2636 return CM_FAILURE;
2637 }
2638
2639 uint32_t index = 0;
2640
2641 CmEventRT *eventRT = dynamic_cast<CmEventRT *>(event);
2642 if (eventRT == nullptr)
2643 {
2644 return DestroyEventFast(event);
2645 }
2646 eventRT->GetIndex(index);
2647 CM_ASSERT( m_eventArray.GetElement( index ) == eventRT );
2648
2649 int32_t status = CmEventRT::Destroy( eventRT );
2650 if( status == CM_SUCCESS && eventRT == nullptr)
2651 {
2652 m_eventArray.SetElement(index, nullptr);
2653 }
2654
2655 // Return nullptr to the application even if the event is not destroyed,
2656 // since its reference count is not zero.
2657 event = nullptr;
2658
2659 return status;
2660 }
2661
2662 //*-----------------------------------------------------------------------------
2663 //| Purpose: Clean the Queue if its tasks time out
2664 //| Returns: Result of the operation.
2665 //*-----------------------------------------------------------------------------
2666 int32_t CmQueueRT::CleanQueue( )
2667 {
2668
2669 int32_t status = CM_SUCCESS;
2670
2671 // Maybe not necessary since
2672 // it is called by ~CmDevice only
2673 // Update: necessary because it calls FlushBlockWithoutSync
2674 if( !m_enqueuedTasks.IsEmpty() )
2675 {
2676 // If there are tasks not yet flushed (i.e. not sent to the driver),
2677 // wait until all such tasks are flushed.
2678 FlushTaskWithoutSync( true );
2679 }
2680 CM_ASSERT( m_enqueuedTasks.IsEmpty() );
2681
2682 //Used for timeout detection
2683 LARGE_INTEGER freq;
2684 MosUtilities::MosQueryPerformanceFrequency((uint64_t *)&freq.QuadPart);
2685 LARGE_INTEGER start;
2686 MosUtilities::MosQueryPerformanceCounter((uint64_t*)&start.QuadPart);
2687 int64_t timeout = start.QuadPart + (CM_MAX_TIMEOUT * freq.QuadPart * m_flushedTasks.GetCount()); // counter value at which the wait times out
2688
2689 while( !m_flushedTasks.IsEmpty() && status != CM_EXCEED_MAX_TIMEOUT )
2690 {
2691 QueryFlushedTasks();
2692
2693 LARGE_INTEGER current;
2694 MosUtilities::MosQueryPerformanceCounter((uint64_t*)&current.QuadPart);
2695 if( current.QuadPart > timeout )
2696 status = CM_EXCEED_MAX_TIMEOUT;
2697 }
2698
2699 return status;
2700 }
2701
2702 CM_QUEUE_CREATE_OPTION &CmQueueRT::GetQueueOption()
2703 {
2704 return m_queueOption;
2705 }
2706
2707 //*-----------------------------------------------------------------------------
2708 //| Purpose: Get the count of task in queue
2709 //| Returns: Result of the operation.
2710 //*-----------------------------------------------------------------------------
2711 int32_t CmQueueRT::GetTaskCount( uint32_t& numTasks )
2712 {
2713 numTasks = m_enqueuedTasks.GetCount() + m_flushedTasks.GetCount();
2714 return CM_SUCCESS;
2715 }
2716
2717 //*-----------------------------------------------------------------------------
2718 //| Purpose: Use GPU to init Surface2D
2719 //| Returns: result of operation
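//| Example:  queue->EnqueueInitSurface2D(surf2D, 0x80808080, event);
//|           (illustrative call; fills surf2D with the 32-bit initValue
//|           pattern using the predefined set kernel)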
2720 //*-----------------------------------------------------------------------------
2721 CM_RT_API int32_t CmQueueRT::EnqueueInitSurface2D( CmSurface2D* surf2D, const uint32_t initValue, CmEvent* &event)
2722 {
2723 INSERT_API_CALL_LOG(GetHalState());
2724
2725 if (!m_device->HasGpuInitKernel())
2726 {
2727 return CM_NOT_IMPLEMENTED;
2728 }
2729
2730 int32_t hr = CM_SUCCESS;
2731 uint32_t width = 0;
2732 uint32_t height = 0;
2733 uint32_t sizePerPixel = 0;
2734 CmProgram *gpuInitKernelProgram = nullptr;
2735 CmKernel *kernel = nullptr;
2736 SurfaceIndex *outputIndexCM = nullptr;
2737 CmThreadSpace *threadSpace = nullptr;
2738 CmTask *gpuCopyTask = nullptr;
2739 uint32_t threadWidth = 0;
2740 uint32_t threadHeight = 0;
2741 uint32_t threadNum = 0;
2742 CmSurfaceManager* surfaceMgr = nullptr;
2743 CM_SURFACE_FORMAT format = CM_SURFACE_FORMAT_INVALID;
2744
2745 if(!surf2D)
2746 {
2747 CM_ASSERTMESSAGE("Error: Pointer to surface 2d is null.");
2748 return CM_FAILURE;
2749 }
2750 CmSurface2DRT *surf2DRT = static_cast<CmSurface2DRT *>(surf2D);
2751
2752 CM_CHK_CMSTATUS_GOTOFINISH(m_device->LoadPredefinedInitKernel(gpuInitKernelProgram));
2753
2754 CM_CHK_CMSTATUS_GOTOFINISH(surf2DRT->GetSurfaceDesc(width, height, format,sizePerPixel));
2755
2756 m_device->GetSurfaceManager(surfaceMgr);
2757 CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceMgr);
2758
2759 if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
2760 {
2761 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set_NV12 ), kernel, "PredefinedGPUCopyKernel"));
2762 }
2763 else
2764 {
2765 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set ), kernel, "PredefinedGPUCopyKernel" ));
2766 }
2767 CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);
2768 CM_CHK_CMSTATUS_GOTOFINISH(surf2D->GetIndex( outputIndexCM ));
2769
2770 threadWidth = ( uint32_t )ceil( ( double )width*sizePerPixel/BLOCK_PIXEL_WIDTH/4 );
2771 threadHeight = ( uint32_t )ceil( ( double )height/BLOCK_HEIGHT );
2772 threadNum = threadWidth * threadHeight;
2773 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
2774
2775 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
2776 CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpace);
2777
2778 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( uint32_t ), &initValue ));
2779 CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), outputIndexCM ));
2780
2781 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
2782 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyTask);
2783
2784 CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
2785
2786 CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, event, threadSpace));
2787
2788 finish:
2789
2790 if (kernel) m_device->DestroyKernel( kernel );
2791 if (gpuCopyTask) m_device->DestroyTask(gpuCopyTask);
2792 if (threadSpace) m_device->DestroyThreadSpace(threadSpace);
2793
2794 return hr;
2795 }
2796
2797 //*-----------------------------------------------------------------------------
2798 //! Flush a general task to the HAL CM layer for execution.
2799 //! This is a non-blocking call, i.e. it returns immediately without waiting for
2800 //! the GPU to finish executing the tasks.
2801 //! INPUT: task -- Pointer to CmTaskInternal object
2802 //! OUTPUT:
2803 //! CM_SUCCESS if all tasks in the queue are submitted
2804 //! CM_FAILURE otherwise.
2805 //*-----------------------------------------------------------------------------
2806 int32_t CmQueueRT::FlushGeneralTask(CmTaskInternal* task)
2807 {
2808 CM_RETURN_CODE hr = CM_SUCCESS;
2809 CM_HAL_EXEC_TASK_PARAM param;
2810 PCM_HAL_KERNEL_PARAM kernelParam = nullptr;
2811 CmKernelData* kernelData = nullptr;
2812 uint32_t kernelDataSize = 0;
2813 PCM_CONTEXT_DATA cmData = nullptr;
2814 CmEventRT* event = nullptr;
2815 uint32_t totalThreadCount= 0;
2816 uint32_t count = 0;
2817 PCM_HAL_KERNEL_PARAM tempData = nullptr;
2818 uint32_t maxTSWidth = 0;
2819 bool hasThreadArg = false;
2820
2821 CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_TASK_PARAM ) );
2822
2823 //GT-PIN
2824 if(m_device->CheckGTPinEnabled())
2825 {
2826 CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surfEntryInfoArrays));
2827 }
2828
2829 task->GetKernelCount( count );
2830 param.numKernels = count;
2831
2832 param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM,count);
2833 param.kernelSizes = MOS_NewArray(uint32_t,count);
2834 param.kernelCurbeOffset = MOS_NewArray(uint32_t,count);
2835 param.queueOption = m_queueOption;
2836
2837 CM_CHK_NULL_GOTOFINISH(param.kernels, CM_OUT_OF_HOST_MEMORY);
2838 CM_CHK_NULL_GOTOFINISH(param.kernelSizes, CM_OUT_OF_HOST_MEMORY);
2839 CM_CHK_NULL_GOTOFINISH(param.kernelCurbeOffset, CM_OUT_OF_HOST_MEMORY);
2840
2841 for( uint32_t i = 0; i < count; i ++ )
2842 {
2843 task->GetKernelData( i, kernelData );
2844 CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
2845
2846 kernelParam = kernelData->GetHalCmKernelData();
2847 CM_CHK_NULL_GOTOFINISH_CMERROR(kernelParam);
2848
2849 hasThreadArg |= kernelParam->perThreadArgExisted;
2850
2851 task->GetKernelDataSize( i, kernelDataSize );
2852 if(kernelDataSize == 0)
2853 {
2854 CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
2855 hr = CM_FAILURE;
2856 goto finish;
2857 }
2858
2859 tempData = kernelData->GetHalCmKernelData();
2860
2861 param.kernels[ i ] = tempData;
2862 param.kernelSizes[ i ] = kernelDataSize;
2863 param.kernelCurbeOffset[ i ] = task->GetKernelCurbeOffset(i);
2864 param.globalSurfaceUsed |= tempData->globalSurfaceUsed;
2865 param.kernelDebugEnabled |= tempData->kernelDebugEnabled;
2866 }
2867
2868 /*
2869 * Preset the default TS width/height/dependency:
2870 *     TS width   = MOS_MIN(CM_MAX_THREADSPACE_WIDTH, threadcount)
2871 *     TS height  = totalThreadCount/CM_MAX_THREADSPACE_WIDTH + 1
2872 *     dependency = CM_NONE_DEPENDENCY
2873 * When threadSpace is nullptr, these defaults are passed to the driver.
2874 * When threadSpace is valid, the TS width/height/dependency are updated according to the thread space set by the user.
2875 */
2876 task->GetTotalThreadCount(totalThreadCount);
2877
2878 if (hasThreadArg)
2879 {
2880 maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW + 1; // 512 allowed for media object
2881 }
2882 else
2883 {
2884 maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW; // 511 for media walker
2885 }
2886
2887 param.threadSpaceWidth = (totalThreadCount > maxTSWidth) ? maxTSWidth : totalThreadCount;
2888 if(totalThreadCount%maxTSWidth)
2889 {
2890 param.threadSpaceHeight = totalThreadCount/maxTSWidth + 1;
2891 }
2892 else
2893 {
2894 param.threadSpaceHeight = totalThreadCount/maxTSWidth;
2895 }
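// Worked example: totalThreadCount = 1000 with no per-thread arguments gives
// maxTSWidth = 511, threadSpaceWidth = 511, and
// threadSpaceHeight = 1000 / 511 + 1 = 2.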
2896
2897 param.dependencyPattern = CM_NONE_DEPENDENCY;
2898
2899 if (task->IsThreadSpaceCreated()) //scoreboard data preparation
2900 {
2901 if(task->IsThreadCoordinatesExisted())
2902 {
2903 param.threadCoordinates = MOS_NewArray(PCM_HAL_SCOREBOARD, count);
2904 param.dependencyMasks = MOS_NewArray(PCM_HAL_MASK_AND_RESET, count);
2905
2906 CM_CHK_NULL_GOTOFINISH(param.threadCoordinates, CM_OUT_OF_HOST_MEMORY);
2907 CM_CHK_NULL_GOTOFINISH(param.dependencyMasks, CM_OUT_OF_HOST_MEMORY);
2908 for(uint32_t i=0; i<count; i++)
2909 {
2910 void *kernelCoordinates = nullptr;
2911 void *dependencyMasks = nullptr;
2912 task->GetKernelCoordinates(i, kernelCoordinates);
2913 task->GetKernelDependencyMasks(i, dependencyMasks);
2914 param.threadCoordinates[i] = (PCM_HAL_SCOREBOARD)kernelCoordinates;
2915 param.dependencyMasks[i] = (PCM_HAL_MASK_AND_RESET)dependencyMasks;
2916 }
2917 }
2918 else
2919 {
2920 param.threadCoordinates = nullptr;
2921 }
2922
2923 task->GetDependencyPattern(param.dependencyPattern);
2924
2925 task->GetThreadSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight);
2926
2927 task->GetWalkingPattern(param.walkingPattern);
2928
2929 if( task->CheckWalkingParametersSet( ) )
2930 {
2931 param.walkingParamsValid = 1;
2932 CM_CHK_CMSTATUS_GOTOFINISH(task->GetWalkingParameters(param.walkingParams));
2933 }
2934 else
2935 {
2936 param.walkingParamsValid = 0;
2937 }
2938
2939 if( task->CheckDependencyVectorsSet( ) )
2940 {
2941 param.dependencyVectorsValid = 1;
2942 CM_CHK_CMSTATUS_GOTOFINISH(task->GetDependencyVectors(param.dependencyVectors));
2943 }
2944 else
2945 {
2946 param.dependencyVectorsValid = 0;
2947 }
2948 }
2949 if (param.threadSpaceWidth == 0)
2950 {
2951 CM_ASSERTMESSAGE("Error: Invalid thread space.");
2952 hr = CM_INVALID_THREAD_SPACE;
2953 goto finish;
2954 }
2955 task->GetColorCountMinusOne(param.colorCountMinusOne);
2956 task->GetMediaWalkerGroupSelect(param.mediaWalkerGroupSelect);
2957
2958 param.syncBitmap = task->GetSyncBitmap();
2959 param.conditionalEndBitmap = task->GetConditionalEndBitmap();
2960 param.userDefinedMediaState = task->GetMediaStatePtr();
2961 CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo));
2962
2963 CM_TASK_CONFIG taskConfig;
2964 task->GetProperty(taskConfig);
2965 CmSafeMemCopy(&param.taskConfig, &taskConfig, sizeof(param.taskConfig));
2966 cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
2967
2968 CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption()));
2969
2970 CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
2971 ExecuteGeneralTask(cmData->cmHalState,
2972 &param,
2973 static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext)));
2974
2975 if( param.taskIdOut < 0 )
2976 {
2977 CM_ASSERTMESSAGE("Error: Invalid task ID.");
2978 hr = CM_FAILURE;
2979 goto finish;
2980 }
2981
2982 TASK_LOG(task);
2983
2984 task->GetTaskEvent( event );
2985 CM_CHK_NULL_GOTOFINISH_CMERROR(event);
2986 CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
2987 CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
2988 CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());
2989
2990 //GT-PIN
2991 if(m_device->CheckGTPinEnabled())
2992 {
2993 //No need to clear the SurEntryInfoArrays here. It will be destroyed by CmTaskInternal.
2994 CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surfEntryInfoArrays));
2995 }
2996
2997 finish:
2998 MosSafeDeleteArray( param.kernels );
2999 MosSafeDeleteArray( param.kernelSizes );
3000 MosSafeDeleteArray( param.threadCoordinates);
3001 MosSafeDeleteArray( param.dependencyMasks);
3002 MosSafeDeleteArray( param.kernelCurbeOffset);
3003
3004 return hr;
3005 }
3006
3007 //*-----------------------------------------------------------------------------
3008 //! Flush a thread group based task to HAL CM layer for execution.
3009 //! This is a non-blocking call, i.e. it returns immediately without waiting for
3010 //! GPU to finish the execution of tasks.
3011 //! INPUT: task -- Pointer to CmTaskInternal object
3012 //! OUTPUT:
3013 //! CM_SUCCESS if all tasks in the queue are submitted
3014 //! CM_FAILURE otherwise.
3015 //*-----------------------------------------------------------------------------
3016 int32_t CmQueueRT::FlushGroupTask(CmTaskInternal* task)
3017 {
3018 CM_RETURN_CODE hr = CM_SUCCESS;
3019
3020 CM_HAL_EXEC_TASK_GROUP_PARAM param;
3021 CmKernelData* kernelData = nullptr;
3022 uint32_t kernelDataSize = 0;
3023 uint32_t count = 0;
3024 PCM_CONTEXT_DATA cmData = nullptr;
3025 CmEventRT * event = nullptr;
3026 PCM_HAL_KERNEL_PARAM tempData = nullptr;
3027
3028 CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_TASK_GROUP_PARAM ) );
3029
3030 //GT-PIN
3031 if(this->m_device->CheckGTPinEnabled())
3032 {
3033 CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surEntryInfoArrays));
3034 }
3035
3036 task->GetKernelCount( count );
3037 param.numKernels = count;
3038
3039 param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
3040 param.kernelSizes = MOS_NewArray(uint32_t, count);
3041 param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
3042 param.queueOption = m_queueOption;
3043 param.mosVeHintParams = (m_usingVirtualEngine)? &m_mosVeHintParams: nullptr;
3044
3045 CM_TASK_CONFIG taskConfig;
3046 task->GetProperty(taskConfig);
3047 CmSafeMemCopy(&param.taskConfig, &taskConfig, sizeof(param.taskConfig));
3048 CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
3049 CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
3050 CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);
3051
3052 for( uint32_t i = 0; i < count; i ++ )
3053 {
3054 task->GetKernelData( i, kernelData );
3055 CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
3056
3057 task->GetKernelDataSize( i, kernelDataSize );
3058 if( kernelDataSize == 0)
3059 {
3060 CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
3061 hr = CM_FAILURE;
3062 goto finish;
3063 }
3064
3065 tempData = kernelData->GetHalCmKernelData( );
3066
3067 param.kernels[ i ] = tempData;
3068 param.kernelSizes[ i ] = kernelDataSize;
3069 param.kernelCurbeOffset [ i ] = task->GetKernelCurbeOffset(i);
3070 param.globalSurfaceUsed |= tempData->globalSurfaceUsed;
3071 param.kernelDebugEnabled |= tempData->kernelDebugEnabled;
3072 }
3073
3074 task->GetSLMSize(param.slmSize);
3075 if(param.slmSize > MAX_SLM_SIZE_PER_GROUP_IN_1K)
3076 {
3077 CM_ASSERTMESSAGE("Error: SLM size exceeds the maximum per group.");
3078 hr = CM_EXCEED_MAX_SLM_SIZE;
3079 goto finish;
3080 }
3081
3082 if (task->IsThreadGroupSpaceCreated())//thread group size
3083 {
3084 task->GetThreadGroupSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight,
3085 param.threadSpaceDepth, param.groupSpaceWidth,
3086 param.groupSpaceHeight, param.groupSpaceDepth);
3087 }
3088
3089 param.syncBitmap = task->GetSyncBitmap();
3090 param.conditionalEndBitmap = task->GetConditionalEndBitmap();
3091 param.userDefinedMediaState = task->GetMediaStatePtr();
3092 CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo));
3093 CmSafeMemCopy(param.krnExecCfg, task->GetKernelExecuteConfig(), sizeof(param.krnExecCfg));
3094
3095 // Call HAL layer to execute pfnExecuteGroupTask
3096 cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
3097
3098 CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnSetPowerOption( cmData->cmHalState, task->GetPowerOption() ) );
3099
3100 CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
3101 ExecuteGroupTask(cmData->cmHalState,
3102 &param,
3103 static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext)));
3104
3105 if( param.taskIdOut < 0 )
3106 {
3107 CM_ASSERTMESSAGE("Error: Invalid task ID.");
3108 hr = CM_FAILURE;
3109 goto finish;
3110 }
3111 TASK_LOG(task);
3112 task->GetTaskEvent( event );
3113 CM_CHK_NULL_GOTOFINISH_CMERROR( event );
3114 CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
3115 CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
3116 CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());
3117
3118 //GT-PIN
3119 if(this->m_device->CheckGTPinEnabled())
3120 {
3121 CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surEntryInfoArrays));
3122 }
3123
3124 finish:
3125 MosSafeDeleteArray( param.kernels );
3126 MosSafeDeleteArray( param.kernelSizes );
3127 MosSafeDeleteArray( param.kernelCurbeOffset);
3128
3129 return hr;
3130 }
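// Minimal sketch of how a task reaches FlushGroupTask() from the public API
// (assumed usage; device/kernel creation and error handling omitted):
//
//     CmThreadGroupSpace *tgs = nullptr;
//     device->CreateThreadGroupSpace(8, 8, 4, 4, tgs); // 8x8 threads per group, 4x4 groups
//     CmTask *task = nullptr;
//     device->CreateTask(task);
//     task->AddKernel(kernel);
//     CmEvent *event = nullptr;
//     queue->EnqueueWithGroup(task, event, tgs);       // eventually flushed via this path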
3131
3132 //*-----------------------------------------------------------------------------
3133 //! Flush a VEBOX task to HAL CM layer for execution.
3134 //! This is a non-blocking call, i.e. it returns immediately without waiting for
3135 //! GPU to finish the execution of tasks.
3136 //! INPUT: task -- Pointer to CmTaskInternal object
3137 //! OUTPUT:
3138 //! CM_SUCCESS if all tasks in the queue are submitted
3139 //! CM_FAILURE otherwise.
3140 //*-----------------------------------------------------------------------------
3141 int32_t CmQueueRT::FlushVeboxTask(CmTaskInternal* task)
3142 {
3143 CM_RETURN_CODE hr = CM_SUCCESS;
3144
3145 CM_HAL_EXEC_VEBOX_TASK_PARAM param;
3146 PCM_CONTEXT_DATA cmData = nullptr;
3147 CmEventRT * event = nullptr;
3148 uint8_t *stateData = nullptr;
3149 uint8_t *surfaceData = nullptr;
3150 CmBuffer_RT * temp = nullptr;
3151
3152 uint32_t original_stream_index = 0;
3153
3154 CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_VEBOX_TASK_PARAM ) );
3155 //Set VEBOX state data pointer and size
3156 //Set VEBOX surface data pointer and size
3157 CM_VEBOX_STATE cmVeboxState;
3158 CmBufferUP *veboxParamBuf = nullptr;
3159 CM_VEBOX_SURFACE_DATA cmVeboxSurfaceData;
3160 task->GetVeboxState(cmVeboxState);
3161 task->GetVeboxParam(veboxParamBuf);
3162 task->GetVeboxSurfaceData(cmVeboxSurfaceData);
3163 CM_CHK_NULL_GOTOFINISH_CMERROR(veboxParamBuf);
3164
3165 temp = static_cast<CmBuffer_RT*>(veboxParamBuf);
3166 temp->GetHandle(param.veboxParamIndex);
3167
3168 param.cmVeboxState = cmVeboxState;
3169 param.veboxParam = veboxParamBuf;
3170
3171 param.veboxSurfaceData = cmVeboxSurfaceData;
3172
3173 param.queueOption = m_queueOption;
3174
3175 //Set VEBOX task id to -1
3176 param.taskIdOut = -1;
3177
3178 cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
3179 original_stream_index = cmData->cmHalState->osInterface->streamIndex;
3180 cmData->cmHalState->pfnSetGpuContext(cmData->cmHalState, MOS_GPU_CONTEXT_VEBOX,
3181 original_stream_index, m_gpuContextHandle);
3182 RegisterSyncEvent();
3183
3184 CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnExecuteVeboxTask( cmData->cmHalState, &param ) );
3185
3186 if( param.taskIdOut < 0 )
3187 {
3188 CM_ASSERTMESSAGE("Error: Invalid task ID.");
3189 hr = CM_FAILURE;
3190 goto finish;
3191 }
3192
3193 task->GetTaskEvent( event );
3194 CM_CHK_NULL_GOTOFINISH_CMERROR( event );
3195 CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
3196 CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
3197
3198 finish:
3199 return hr;
3200 }
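// Note: param.taskIdOut doubles as a sentinel. It is seeded with -1 before the
// HAL call runs, so a non-negative value on return is the driver-assigned task
// id that gets attached to the event. Sketch of the protocol used above:
//
//     param.taskIdOut = -1;
//     cmData->cmHalState->pfnExecuteVeboxTask(cmData->cmHalState, &param);
//     bool submitted = (param.taskIdOut >= 0);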
3201
3202 //*-----------------------------------------------------------------------------
3203 //! Flush a task enqueued with hints to the HAL CM layer for execution.
3204 //! The task is submitted according to its order in the queue.
3205 //! This is a non-blocking call, i.e. it returns immediately without waiting for
3206 //! GPU to finish the execution of tasks.
3207 //! INPUT: task -- Pointer to CmTaskInternal object
3208 //! OUTPUT:
3209 //! CM_SUCCESS if all tasks in the queue are submitted
3210 //! CM_FAILURE otherwise.
3211 //*-----------------------------------------------------------------------------
3212 int32_t CmQueueRT::FlushEnqueueWithHintsTask( CmTaskInternal* task )
3213 {
3214 CM_RETURN_CODE hr = CM_SUCCESS;
3215 CM_HAL_EXEC_HINTS_TASK_PARAM param;
3216 PCM_CONTEXT_DATA cmData = nullptr;
3217 CmKernelData* kernelData = nullptr;
3218 uint32_t kernelDataSize = 0;
3219 uint32_t count = 0;
3220 CmEventRT *event = nullptr;
3221 PCM_HAL_KERNEL_PARAM tempData = nullptr;
3222
3223 uint32_t original_stream_index = 0;
3224
3225 CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_HINTS_TASK_PARAM ) );
3226 task->GetKernelCount ( count );
3227 param.numKernels = count;
3228 param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
3229 param.kernelSizes = MOS_NewArray(uint32_t, count);
3230 param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
3231 param.queueOption = m_queueOption;
3232
3233 CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
3234 CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
3235 CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);
3236
3237 task->GetHints(param.hints);
3238 task->GetNumTasksGenerated(param.numTasksGenerated);
3239 task->GetLastTask(param.isLastTask);
3240
3241 for( uint32_t i = 0; i < count; i ++ )
3242 {
3243 task->GetKernelData( i, kernelData );
3244 CM_CHK_NULL_GOTOFINISH_CMERROR( kernelData );
3245
3246 task->GetKernelDataSize( i, kernelDataSize );
3247 if( kernelDataSize == 0 )
3248 {
3249 CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
3250 hr = CM_FAILURE;
3251 goto finish;
3252 }
3253
3254 tempData = kernelData->GetHalCmKernelData();
3255
3256 param.kernels[ i ] = tempData;
3257 param.kernelSizes[ i ] = kernelDataSize;
3258 param.kernelCurbeOffset[ i ] = task->GetKernelCurbeOffset(i);
3259 }
3260
3261 param.userDefinedMediaState = task->GetMediaStatePtr();
3262 cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
3263 CM_CHK_NULL_GOTOFINISH_CMERROR(cmData);
3264
3265 CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption()));
3266
3267 original_stream_index = cmData->cmHalState->osInterface->streamIndex;
3268 cmData->cmHalState->pfnSetGpuContext(
3269 cmData->cmHalState, static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext),
3270 original_stream_index, m_gpuContextHandle);
3271 RegisterSyncEvent();
3272
3273 CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnExecuteHintsTask(cmData->cmHalState, &param));
3274
3275 if( param.taskIdOut < 0 )
3276 {
3277 CM_ASSERTMESSAGE("Error: Invalid task ID.");
3278 hr = CM_FAILURE;
3279 goto finish;
3280 }
3281
3282 TASK_LOG(task);
3283
3284 task->GetTaskEvent( event );
3285 CM_CHK_NULL_GOTOFINISH_CMERROR( event );
3286 CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
3287 CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
3288 CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());
3289
3290 finish:
3291
3292 MosSafeDeleteArray( param.kernels );
3293 MosSafeDeleteArray( param.kernelSizes );
3294 MosSafeDeleteArray( param.kernelCurbeOffset );
3295
3296 return hr;
3297 }
3298
3299 //*-----------------------------------------------------------------------------
3300 //! Flush the queue, i.e. submit all tasks in the queue to execute according
3301 //! to their order in the queue. The queue will be empty after the flush.
3302 //! This is a non-blocking call, i.e. it returns immediately without waiting for
3303 //! GPU to finish the execution of tasks.
3304 //! INPUT: flushBlocked -- whether the flush blocks until a flushed-task slot frees up
3305 //! OUTPUT:
3306 //! CM_SUCCESS if all tasks in the queue are submitted
3307 //! CM_FAILURE otherwise.
3308 //*-----------------------------------------------------------------------------
3309 int32_t CmQueueRT::FlushTaskWithoutSync( bool flushBlocked )
3310 {
3311 int32_t hr = CM_SUCCESS;
3312 CmTaskInternal* task = nullptr;
3313 uint32_t taskType = CM_TASK_TYPE_DEFAULT;
3314 uint32_t freeSurfNum = 0;
3315 CmSurfaceManager* surfaceMgr = nullptr;
3316 CSync* surfaceLock = nullptr;
3317 PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
3318 CmEventRT* event = nullptr;
3319 int32_t taskId = 0;
3320
3321 m_criticalSectionHalExecute.Acquire(); // Enter HalCm Execute Protection
3322
3323 while( !m_enqueuedTasks.IsEmpty() )
3324 {
3325 uint32_t flushedTaskCount = m_flushedTasks.GetCount();
3326 if ( flushBlocked )
3327 {
3328 while( flushedTaskCount >= m_halMaxValues->maxTasks )
3329 {
3330 // If the task count in the flushed queue is no less than the HW restriction,
3331 // query the status of the flushed task queue and remove any finished tasks from it.
3332 QueryFlushedTasks();
3333 flushedTaskCount = m_flushedTasks.GetCount();
3334 }
3335 }
3336 else
3337 {
3338 if( flushedTaskCount >= m_halMaxValues->maxTasks )
3339 {
3340 // If the task count in the flushed queue is no less than the HW restriction,
3341 // query the status of the flushed task queue and remove any finished tasks from it.
3342 QueryFlushedTasks();
3343 flushedTaskCount = m_flushedTasks.GetCount();
3344 if( flushedTaskCount >= m_halMaxValues->maxTasks )
3345 {
3346 // If none of the flushed tasks has finished, we can't flush more tasks.
3347 break;
3348 }
3349 }
3350 }
3351
3352 task = (CmTaskInternal*)m_enqueuedTasks.Pop();
3353 CM_CHK_NULL_GOTOFINISH_CMERROR( task );
3354
3355 CmNotifierGroup *notifiers = m_device->GetNotifiers();
3356 if (notifiers != nullptr)
3357 {
3358 notifiers->NotifyTaskFlushed(m_device, task);
3359 }
3360
3361 task->GetTaskType(taskType);
3362
3363 switch(taskType)
3364 {
3365 case CM_INTERNAL_TASK_WITH_THREADSPACE:
3366 hr = FlushGeneralTask(task);
3367 break;
3368
3369 case CM_INTERNAL_TASK_WITH_THREADGROUPSPACE:
3370 hr = FlushGroupTask(task);
3371 break;
3372
3373 case CM_INTERNAL_TASK_VEBOX:
3374 hr = FlushVeboxTask(task);
3375 break;
3376
3377 case CM_INTERNAL_TASK_ENQUEUEWITHHINTS:
3378 hr = FlushEnqueueWithHintsTask(task);
3379 break;
3380
3381 default: // by default, assume the task is considered as general task: CM_INTERNAL_TASK_WITH_THREADSPACE
3382 hr = FlushGeneralTask(task);
3383 break;
3384 }
3385
3386 if(hr == CM_SUCCESS)
3387 {
3388 m_flushedTasks.Push( task );
3389 task->VtuneSetFlushTime(); // Record Flush Time
3390 }
3391 else
3392 {
3393 // Failed to flush, destroy the task.
3394 CmTaskInternal::Destroy( task );
3395 }
3396
3397 } // loop for task
3398
3399 #if MDF_SURFACE_CONTENT_DUMP
3400 if (cmData->cmHalState->dumpSurfaceContent)
3401 {
3402 task->GetTaskEvent(event);
3403 if (event != nullptr)
3404 {
3405 while (event->GetStatusWithoutFlush() != CM_STATUS_FINISHED)
3406 {
3407 event->Query();
3408 }
3409 event->GetTaskDriverId(taskId);
3410 }
3411 task->SurfaceDump(taskId);
3412 }
3413 #endif
3414 QueryFlushedTasks();
3415
3416 finish:
3417 m_criticalSectionHalExecute.Release();//Leave HalCm Execute Protection
3418
3419 //Delayed destroy for resource
3420 m_device->GetSurfaceManager(surfaceMgr);
3421 if (!surfaceMgr)
3422 {
3423 CM_ASSERTMESSAGE("Error: Pointer to surface manager is null.");
3424 return CM_NULL_POINTER;
3425 }
3426
3427 surfaceLock = m_device->GetSurfaceCreationLock();
3428 if (surfaceLock == nullptr)
3429 {
3430 CM_ASSERTMESSAGE("Error: Pointer to surface creation lock is null.");
3431 return CM_NULL_POINTER;
3432 }
3433 surfaceLock->Acquire();
3434 surfaceMgr->RefreshDelayDestroySurfaces(freeSurfNum);
3435 surfaceLock->Release();
3436
3437 return hr;
3438 }
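// Sketch of the two flush modes handled above (illustrative):
//
//     FlushTaskWithoutSync(true);  // blocking: keeps calling QueryFlushedTasks()
//                                  // until a slot below maxTasks frees up
//     FlushTaskWithoutSync(false); // default: leaves the remaining tasks
//                                  // enqueued if the flushed queue stays full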
3439
3440 //*-----------------------------------------------------------------------------
3441 //| Purpose: Enqueue a Vebox Task
3442 //| Arguments :
3443 //| pVebox_G75 [in] Pointer to a CmVebox object
3444 //| event [in] Reference to the pointer to Event
3445 //|
3446 //| Returns: Result of the operation.
3447 //*-----------------------------------------------------------------------------
3448 CM_RT_API int32_t CmQueueRT::EnqueueVebox(CmVebox * vebox, CmEvent* & event)
3449 {
3450 INSERT_API_CALL_LOG(GetHalState());
3451
3452 int32_t hr = CM_SUCCESS;
3453 CmTaskInternal* task = nullptr;
3454 int32_t taskDriverId = -1;
3455 bool isEventVisible = (event == CM_NO_EVENT)? false:true;
3456 CmEventRT *eventRT = static_cast<CmEventRT *>(event);
3457
3458 //Check if the input is valid
3459 if ( vebox == nullptr )
3460 {
3461 CM_ASSERTMESSAGE("Error: Pointer to vebox is null.");
3462 return CM_NULL_POINTER;
3463 }
3464 CmVeboxRT *veboxRT = static_cast<CmVeboxRT *>(vebox);
3465 CM_CHK_CMSTATUS_GOTOFINISH(CmTaskInternal::Create(m_device, veboxRT, task ));
3466
3467 LARGE_INTEGER nEnqueueTime;
3468 if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) )
3469 {
3470 CM_ASSERTMESSAGE("Error: Query Performance counter failure.");
3471 hr = CM_FAILURE;
3472 goto finish;
3473 }
3474
3475 CM_CHK_CMSTATUS_GOTOFINISH(CreateEvent(task, isEventVisible, taskDriverId, eventRT));
3476
3477 if ( eventRT != nullptr )
3478 {
3479 eventRT->SetEnqueueTime( nEnqueueTime );
3480 }
3481 event = eventRT;
3482
3483 if (!m_enqueuedTasks.Push(task))
3484 {
3485 CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.")
3486 hr = CM_FAILURE;
3487 goto finish;
3488 }
3489
3490 CM_CHK_CMSTATUS_GOTOFINISH(FlushTaskWithoutSync());
3491
3492 finish:
3493 if (hr != CM_SUCCESS)
3494 {
3495 CmTaskInternal::Destroy(task);
3496 }
3497 return hr;
3498 }
3499
3500 //*-----------------------------------------------------------------------------
3501 //| Purpose: Create Event and Update event in m_eventArray
3502 //| Returns: result of operation
3503 //*-----------------------------------------------------------------------------
3504 int32_t CmQueueRT::CreateEvent(CmTaskInternal *task, bool isVisible, int32_t &taskDriverId, CmEventRT *&event )
3505 {
3506 int32_t hr = CM_SUCCESS;
3507
3508 m_criticalSectionEvent.Acquire();
3509
3510 uint32_t freeSlotInEventArray = m_eventArray.GetFirstFreeIndex();
3511
3512 hr = CmEventRT::Create( freeSlotInEventArray, this, task, taskDriverId, m_device, isVisible, event );
3513
3514 if (hr == CM_SUCCESS)
3515 {
3516 m_eventArray.SetElement( freeSlotInEventArray, event );
3517 m_eventCount ++;
3518
3519 if (task)
3520 task->SetTaskEvent( event );
3521
3522 if (!isVisible)
3523 {
3524 event = nullptr;
3525 }
3526 }
3527 else
3528 {
3529 CM_ASSERTMESSAGE("Error: Create Event failure.")
3530 }
3531
3532 m_criticalSectionEvent.Release();
3533
3534 return hr;
3535 }
3536
3537 //*---------------------------------------------------------------------------------------------------------
3538 //| Name: EnqueueCopyCPUToGPUFullStride()
3539 //| Purpose: Copy data from system memory to video memory (surface)
3540 //| Arguments:
3541 //| surface [in] Pointer to a CmSurface2D object as copy destination
3542 //| sysMem [in] Pointer to a system memory as copy source
3543 //| widthStride [in] Width stride in bytes for system memory (to calculate start of next line)
3544 //| heightStride [in] Height stride in rows for system memory (to calculate start of next plane)
3545 //| option [in] Option passed from user, blocking copy, non-blocking copy or disable turbo boost
3546 //| event [in,out] Reference to the pointer to Event
3547 //| Returns: Result of the operation.
3548 //|
3549 //| Restrictions & Notes:
3550 //| 1) sysMem must be 16-byte aligned.
3551 //| 2) Surface's width should be 16-byte aligned for best performance.
3552 //| 3) widthStride and heightStride are used to indicate the padding information in system memory
3553 //| widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes
3554 //| heightStride = height + padding_in_row
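//| Example (illustrative numbers): a 1920x1080 ARGB surface copied from a
//| system-memory buffer padded to a 2048-pixel pitch would use
//| widthStride = 2048 * 4 = 8192 bytes and heightStride = 1080 rows.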
3555 //*---------------------------------------------------------------------------------------------------------
3556 CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPUFullStride( CmSurface2D* surface,
3557 const unsigned char* sysMem,
3558 const uint32_t widthStride,
3559 const uint32_t heightStride,
3560 const uint32_t option,
3561 CmEvent* & event )
3562 {
3563 INSERT_API_CALL_LOG(GetHalState());
3564
3565 if (!m_device->HasGpuCopyKernel())
3566 {
3567 return CM_NOT_IMPLEMENTED;
3568 }
3569
3570 CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
3571 return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, widthStride, heightStride, CM_FASTCOPY_CPU2GPU, option, event);
3572 }
3573
3574 //*---------------------------------------------------------------------------------------------------------
3575 //| Name: EnqueueCopyGPUToCPUFullStride()
3576 //| Purpose: Copy data from tiled video memory (surface) to linear system memory
3577 //| Arguments:
3578 //| surface [in] Pointer to a CmSurface2D object as copy source
3579 //| sysMem [in] Pointer to a system memory as copy destination
3580 //| widthStride [in] Width stride in bytes for system memory (to calculate start of next line)
3581 //| heightStride [in] Height stride in rows for system memory (to calculate start of next plane)
3582 //| option [in] Option passed from user, blocking copy,non-blocking copy or disable turbo boost
3583 //| event [in,out] Reference to the pointer to Event
3584 //| Returns: Result of the operation.
3585 //|
3586 //| Restrictions & Notes:
3587 //| 1) sysMem must be 16-byte aligned.
3588 //| 2) Surface's width should be 16-byte aligned for best performance.
3589 //| 3) widthStride and heightStride are used to indicate the padding information in system memory
3590 //| widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes
3591 //| heightStride = height + padding_in_row
3592 //*---------------------------------------------------------------------------------------------------------
3593 CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPUFullStride( CmSurface2D* surface,
3594 unsigned char* sysMem,
3595 const uint32_t widthStride,
3596 const uint32_t heightStride,
3597 const uint32_t option,
3598 CmEvent* & event )
3599 {
3600 INSERT_API_CALL_LOG(GetHalState());
3601
3602 if (!m_device->HasGpuCopyKernel())
3603 {
3604 return CM_NOT_IMPLEMENTED;
3605 }
3606
3607 CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
3608 return EnqueueCopyInternal(surfaceRT, sysMem, widthStride, heightStride, CM_FASTCOPY_GPU2CPU, option, event);
3609 }
3610
3611 //*---------------------------------------------------------------------------------------------------------
3612 //| Name: CreateGPUCopyKernel()
3613 //| Purpose: Create GPUCopy kernel; reuse an existing kernel if it has been created and is reusable
3614 //| Arguments:
3615 //| widthInByte [in] surface's width in bytes
3616 //| height [in] surface's height
3617 //| format [in] surface's format
3618 //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu
3619 //| gpuCopyKernelParam [out] kernel param
3620 //|
3621 //| Returns: Result of the operation.
3622 //|
3623 //*---------------------------------------------------------------------------------------------------------
3624 int32_t CmQueueRT::CreateGPUCopyKernel(uint32_t widthInByte,
3625 uint32_t height,
3626 CM_SURFACE_FORMAT format,
3627 CM_GPUCOPY_DIRECTION copyDirection,
3628 CM_GPUCOPY_KERNEL* &gpuCopyKernelParam)
3629 {
3630 int32_t hr = CM_SUCCESS;
3631
3632 //Search existing kernel
3633 CM_CHK_CMSTATUS_GOTOFINISH(SearchGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam));
3634
3635 if(gpuCopyKernelParam != nullptr)
3636 { // reuse
3637 GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);
3638 }
3639 else
3640 {
3641 gpuCopyKernelParam = new (std::nothrow) CM_GPUCOPY_KERNEL ;
3642 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
3643 CmSafeMemSet(gpuCopyKernelParam, 0, sizeof(CM_GPUCOPY_KERNEL));
3644
3645 CM_CHK_CMSTATUS_GOTOFINISH(AllocateGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernel));
3646 CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernelID));
3647 GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);
3648
3649 CM_CHK_CMSTATUS_GOTOFINISH(AddGPUCopyKernel(gpuCopyKernelParam));
3650 }
3651
3652 finish:
3653 if( hr != CM_SUCCESS)
3654 {
3655 CmSafeDelete(gpuCopyKernelParam);
3656 }
3657
3658 return hr;
3659 }
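// Typical lifecycle of an entry handed out by CreateGPUCopyKernel() (sketch of
// the internal usage, not a public contract):
//
//     CM_GPUCOPY_KERNEL *copyKernel = nullptr;
//     CreateGPUCopyKernel(widthInByte, height, format,
//                         CM_FASTCOPY_CPU2GPU, copyKernel); // locked on return
//     // ... build and enqueue the copy task using copyKernel->kernel ...
//     GPUCOPY_KERNEL_UNLOCK(copyKernel); // make it reusable for the next search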
3660
3661 //*---------------------------------------------------------------------------------------------------------
3662 //| Name: SearchGPUCopyKernel()
3663 //| Purpose: Search if the required kernel exists
3664 //| Arguments:
3665 //| widthInByte [in] surface's width in bytes
3666 //| height [in] surface's height
3667 //| format [in] surface's format
3668 //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu
3669 //| gpuCopyKernelParam [out] kernel param
3670 //|
3671 //| Returns: Result of the operation.
3672 //|
3673 //*---------------------------------------------------------------------------------------------------------
3674 int32_t CmQueueRT::SearchGPUCopyKernel(uint32_t widthInByte,
3675 uint32_t height,
3676 CM_SURFACE_FORMAT format,
3677 CM_GPUCOPY_DIRECTION copyDirection,
3678 CM_GPUCOPY_KERNEL* &kernelParam)
3679 {
3680 int32_t hr = CM_SUCCESS;
3681 CM_GPUCOPY_KERNEL *gpucopyKernel = nullptr;
3682 CM_GPUCOPY_KERNEL_ID kernelTypeID = GPU_COPY_KERNEL_UNKNOWN;
3683
3684 kernelParam = nullptr;
3685 CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, kernelTypeID));
3686
3687 for(uint32_t index =0 ; index< m_copyKernelParamArrayCount; index++)
3688 {
3689 gpucopyKernel = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement(index);
3690 if(gpucopyKernel != nullptr)
3691 {
3692 if(!gpucopyKernel->locked &&
3693 gpucopyKernel->kernelID == kernelTypeID)
3694 {
3695 kernelParam = gpucopyKernel;
3696 break;
3697 }
3698 }
3699 }
3700
3701 finish:
3702 return hr;
3703 }
3704
3705 //*---------------------------------------------------------------------------------------------------------
3706 //| Name: AddGPUCopyKernel()
3707 //| Purpose: Add new kernel into m_copyKernelParamArray
3708 //| Arguments:
3709 //| widthInByte [in] surface's width in bytes
3710 //| height [in] surface's height
3711 //| format [in] surface's format
3712 //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu
3713 //| gpuCopyKernelParam [in] kernel param to add (must be locked by the caller)
3714 //|
3715 //| Returns: Result of the operation.
3716 //|
3717 //*---------------------------------------------------------------------------------------------------------
3718 int32_t CmQueueRT::AddGPUCopyKernel(CM_GPUCOPY_KERNEL* &kernelParam)
3719 {
3720 int32_t hr = CM_SUCCESS;
3721 // critical section protection
3722 CLock locker(m_criticalSectionGPUCopyKrn);
3723
3724 CM_CHK_NULL_GOTOFINISH(kernelParam, CM_INVALID_GPUCOPY_KERNEL);
3725
3726 // the newly created kernel must be locked
3727 if(!kernelParam->locked)
3728 {
3729 CM_ASSERTMESSAGE("Error: The newly created kernel must be locked.")
3730 hr = CM_INVALID_GPUCOPY_KERNEL;
3731 goto finish;
3732 }
3733
3734 m_copyKernelParamArray.SetElement(m_copyKernelParamArrayCount, kernelParam);
3735 m_copyKernelParamArrayCount ++;
3736
3737 finish:
3738 return hr;
3739 }
3740
3741 //*---------------------------------------------------------------------------------------------------------
3742 //| Name: GetGPUCopyKrnID()
3743 //| Purpose: Calculate the kernel ID according to the surface's width, height and copy direction
3744 //| Arguments:
3745 //| widthInByte [in] surface's width in bytes
3746 //| height [in] surface's height
3747 //| format [in] surface's format
3748 //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu
3749 //| kernelID [out] kernel id
3750 //|
3751 //| Returns: Result of the operation.
3752 //|
3753 //*---------------------------------------------------------------------------------------------------------
3754 int32_t CmQueueRT::GetGPUCopyKrnID( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format,
3755 CM_GPUCOPY_DIRECTION copyDirection, CM_GPUCOPY_KERNEL_ID &kernelID )
3756 {
3757 int32_t hr = CM_SUCCESS;
3758
3759 kernelID = GPU_COPY_KERNEL_UNKNOWN;
3760
3761 if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
3762 {
3763 switch(copyDirection)
3764 {
3765 case CM_FASTCOPY_GPU2CPU:
3766 if ( (height&0x7) ||(widthInByte&0x7f))
3767 {
3768 kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_NV12_ID ;
3769 }
3770 else
3771 { // height 8-row aligned, widthByte 128 multiple
3772 kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_NV12_ID ;
3773 }
3774 break;
3775
3776 case CM_FASTCOPY_CPU2GPU:
3777 kernelID = GPU_COPY_KERNEL_CPU2GPU_NV12_ID;
3778 break;
3779
3780 case CM_FASTCOPY_GPU2GPU:
3781 kernelID = GPU_COPY_KERNEL_GPU2GPU_NV12_ID;
3782 break;
3783
3784 case CM_FASTCOPY_CPU2CPU:
3785 kernelID = GPU_COPY_KERNEL_CPU2CPU_ID;
3786 break;
3787
3788 default :
3789 CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
3790 hr = CM_FAILURE;
3791 break;
3792 }
3793 }
3794 else
3795 {
3796 switch(copyDirection)
3797 {
3798 case CM_FASTCOPY_GPU2CPU:
3799 if ( (height&0x7) ||(widthInByte&0x7f))
3800 {
3801 kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_ID;
3802 }
3803 else
3804 { // height 8-row aligned, widthByte 128 multiple
3805 kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_ID;
3806 }
3807 break;
3808
3809 case CM_FASTCOPY_CPU2GPU:
3810 kernelID = GPU_COPY_KERNEL_CPU2GPU_ID;
3811 break;
3812
3813 case CM_FASTCOPY_GPU2GPU:
3814 kernelID = GPU_COPY_KERNEL_GPU2GPU_ID;
3815 break;
3816
3817 case CM_FASTCOPY_CPU2CPU:
3818 kernelID = GPU_COPY_KERNEL_CPU2CPU_ID;
3819 break;
3820
3821 default :
3822 CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
3823 hr = CM_FAILURE;
3824 break;
3825 }
3826 }
3827
3828 return hr;
3829 }
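// Alignment examples for the GPU2CPU selection above (illustrative):
//     widthInByte = 1920, height = 1080: 1920 & 0x7f == 0 and 1080 & 0x7 == 0,
//         so the aligned read kernel is chosen.
//     widthInByte = 1930, height = 1080: 1930 & 0x7f != 0, so the unaligned
//         read kernel is chosen instead.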
3830
3831 //*---------------------------------------------------------------------------------------------------------
3832 //| Name: AllocateGPUCopyKernel()
3833 //| Purpose: Allocate GPUCopy Kernel
3834 //| Arguments:
3835 //| widthInByte [in] surface's width in bytes
3836 //| height [in] surface's height
3837 //| format [in] surface's format
3838 //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu
3839 //| kernel [out] pointer to created kernel
3840 //|
3841 //| Returns: Result of the operation.
3842 //|
3843 //*---------------------------------------------------------------------------------------------------------
3844 int32_t CmQueueRT::AllocateGPUCopyKernel( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format,
3845 CM_GPUCOPY_DIRECTION copyDirection, CmKernel *&kernel )
3846 {
3847 int32_t hr = CM_SUCCESS;
3848 CmProgram *gpuCopyProgram = nullptr;
3849
3850 CM_CHK_CMSTATUS_GOTOFINISH( m_device->LoadPredefinedCopyKernel(gpuCopyProgram));
3851 CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram);
3852
3853 if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
3854 {
3855 switch(copyDirection)
3856 {
3857 case CM_FASTCOPY_GPU2CPU:
3858 if ( (height&0x7) ||(widthInByte&0x7f))
3859 {
3860 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_NV12_32x32 ) , kernel,"PredefinedGPUCopyKernel"));
3861 }
3862 else
3863 { // height 8-row aligned, widthByte 128 multiple
3864 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_NV12_aligned_32x32 ) , kernel,"PredefinedGPUCopyKernel"));
3865 }
3866 break;
3867
3868 case CM_FASTCOPY_CPU2GPU:
3869 CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_write_NV12_32x32 ), kernel, "PredefinedGPUCopyKernel"));
3870 break;
3871
3872 case CM_FASTCOPY_GPU2GPU:
3873 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_NV12_32x32), kernel, "PredefinedGPUCopyKernel"));
3874 break;
3875
3876 case CM_FASTCOPY_CPU2CPU:
3877 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
3878 break;
3879
3880 default :
3881 CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
3882 hr = CM_FAILURE;
3883 break;
3884 }
3885 }
3886 else
3887 {
3888 switch(copyDirection)
3889 {
3890 case CM_FASTCOPY_GPU2CPU:
3891 if ( (height&0x7) ||(widthInByte&0x7f))
3892 {
3893 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_32x32 ) , kernel, "PredefinedGPUCopyKernel"));
3894 }
3895 else
3896 { // height 8-row aligned, widthByte 128 multiple
3897 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_aligned_32x32 ) , kernel, "PredefinedGPUCopyKernel"));
3898 }
3899 break;
3900
3901 case CM_FASTCOPY_CPU2GPU:
3902 CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_write_32x32 ), kernel, "PredefinedGPUCopyKernel" ));
3903 break;
3904
3905 case CM_FASTCOPY_GPU2GPU:
3906 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_32x32), kernel, "PredefinedGPUCopyKernel"));
3907 break;
3908
3909 case CM_FASTCOPY_CPU2CPU:
3910 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
3911 break;
3912
3913 default :
3914 CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
3915 hr = CM_FAILURE;
3916 break;
3917 }
3918 }
3919
3920 finish:
3921 return hr;
3922 }
3923
3924 CM_RT_API int32_t CmQueueRT::EnqueueFast(CmTask *task,
3925 CmEvent* &event,
3926 const CmThreadSpace *threadSpace)
3927 {
3928 CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
3929 int32_t result = CM_SUCCESS;
3930 if (state == nullptr)
3931 {
3932 result = CM_NULL_POINTER;
3933 }
3934 else if (state->advExecutor == nullptr ||
3935 state->advExecutor->SwitchToFastPath(task) == false)
3936 {
3937 return Enqueue(task, event, threadSpace);
3938 }
3939 else
3940 {
3941 auto gpu_context_name
3942 = static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext);
3943 // Selects the correct GPU context.
3944 uint32_t old_stream_idx = state->pfnSetGpuContext(state, gpu_context_name,
3945 m_streamIndex,
3946 m_gpuContextHandle);
3947 const CmThreadSpaceRT *threadSpaceRTConst
3948 = static_cast<const CmThreadSpaceRT*>(threadSpace);
3949 if (state->cmHalInterface->CheckMediaModeAvailability() == false)
3950 {
3951 if (threadSpaceRTConst != nullptr)
3952 {
3953 result = state->advExecutor->SubmitComputeTask(
3954 this, task, event, threadSpaceRTConst->GetThreadGroupSpace(),
3955 gpu_context_name);
3956 }
3957 else
3958 {
3959 result = state->advExecutor->SubmitComputeTask(this, task, event,
3960 nullptr,
3961 gpu_context_name);
3962 }
3963 }
3964 else
3965 {
3966 result = state->advExecutor->SubmitTask(this, task, event, threadSpace,
3967 gpu_context_name);
3968 }
3969 state->osInterface->streamIndex = old_stream_idx;
3970 }
3971 return result;
3972 }
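// EnqueueFast() degrades gracefully: whenever the fast-path executor is absent
// or rejects the task, it falls back to the legacy Enqueue(), so callers may
// use it unconditionally (sketch, assuming the declaration defaults the
// threadSpace argument to nullptr):
//
//     CmEvent *event = CM_NO_EVENT;   // or a real event pointer
//     int32_t result = queue->EnqueueFast(task, event);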
3973
3974 CM_RT_API int32_t CmQueueRT::DestroyEventFast(CmEvent *&event)
3975 {
3976 CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
3977
3978 if (state == nullptr)
3979 {
3980 return CM_NULL_POINTER;
3981 }
3982 else if (state->advExecutor == nullptr)
3983 {
3984 return DestroyEvent(event);
3985 }
3986 else
3987 {
3988 return state->advExecutor->DestoryEvent(this, event);
3989 }
3990 }
3991
3992 CM_RT_API int32_t
3993 CmQueueRT::EnqueueWithGroupFast(CmTask *task,
3994 CmEvent* &event,
3995 const CmThreadGroupSpace *threadGroupSpace)
3996 {
3997 CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
3998 int32_t result = CM_SUCCESS;
3999 if (state == nullptr)
4000 {
4001 return CM_NULL_POINTER;
4002 }
4003 else if (state->advExecutor == nullptr ||
4004 state->advExecutor->SwitchToFastPath(task) == false)
4005 {
4006 return EnqueueWithGroup(task, event, threadGroupSpace);
4007 }
4008
4009 auto gpu_context_name = static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext);
4010 // Selects the correct GPU context.
4011 uint32_t old_stream_idx = state->pfnSetGpuContext(state, gpu_context_name,
4012 m_streamIndex,
4013 m_gpuContextHandle);
4014 if (state->cmHalInterface->CheckMediaModeAvailability())
4015 {
4016 result = state->advExecutor->SubmitGpgpuTask(this, task, event,
4017 threadGroupSpace,
4018 gpu_context_name);
4019 }
4020 else
4021 {
4022 result = state->advExecutor->SubmitComputeTask(this, task, event,
4023 threadGroupSpace,
4024 gpu_context_name);
4025 }
4026 state->osInterface->streamIndex = old_stream_idx;
4027 return result;
4028 }
4029
4030 int32_t CmQueueRT::GetOSSyncEventHandle(void *& hOSSyncEvent)
4031 {
4032 hOSSyncEvent = m_osSyncEvent;
4033 return CM_SUCCESS;
4034 }
4035
4036
4037 int32_t CmQueueRT::RegisterSyncEvent()
4038 {
4039 CM_RETURN_CODE hr = CM_SUCCESS;
4040
4041 CM_HAL_OSSYNC_PARAM syncParam;
4042 void *syncEventHandle = nullptr;
4043 syncParam.osSyncEvent = syncEventHandle;
4044
4045 PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
4046 PCM_HAL_STATE cmHalState = cmData->cmHalState;
4047 // Call HAL layer to wait for Task finished with event-driven mechanism
4048 CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->pfnRegisterUMDNotifyEventHandle(cmHalState, &syncParam));
4049
4050 m_osSyncEvent = syncParam.osSyncEvent;
4051
4052 finish:
4053 return hr;
4054 }
4055
4056 MOS_STATUS CmQueueRT::CreateGpuContext(CM_HAL_STATE *halState,
4057 MOS_GPU_CONTEXT gpuContextName,
4058 MOS_GPU_NODE gpuNode,
4059 MOS_GPUCTX_CREATOPTIONS *createOptions)
4060 {
4061 uint32_t old_stream_idx = 0;
4062 MOS_STATUS status = MOS_STATUS_UNKNOWN;
4063 if (MOS_GPU_CONTEXT_CM_COMPUTE == gpuContextName)
4064 {
4065 m_streamIndex = halState->pfnRegisterStream(halState);
4066 old_stream_idx = halState->osInterface->streamIndex;
4067 halState->osInterface->streamIndex = m_streamIndex;
4068 m_gpuContextHandle = halState->pfnCreateGpuComputeContext(halState,
4069 createOptions);
4070 if (MOS_GPU_CONTEXT_INVALID_HANDLE != m_gpuContextHandle)
4071 {
4072 status = MOS_STATUS_SUCCESS;
4073 CreateSyncBuffer(halState);
4074 }
4075 }
4076 else
4077 { // As there is only one render context, the original stream index will be used.
4078 old_stream_idx = m_streamIndex = halState->osInterface->streamIndex;
4079 status = halState->pfnCreateGPUContext(halState, gpuContextName, gpuNode,
4080 createOptions);
4081 }
4082 halState->osInterface->streamIndex = old_stream_idx;
4083 return status;
4084 }
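// Sketch of the two paths above (illustrative; the render context/node names
// here are assumptions, not taken from this file): a compute queue registers
// its own stream, while any other queue shares the device's original stream.
//
//     MOS_GPUCTX_CREATOPTIONS options;
//     CreateGpuContext(halState, MOS_GPU_CONTEXT_CM_COMPUTE,
//                      MOS_GPU_NODE_COMPUTE, &options); // registers a new m_streamIndex
//     CreateGpuContext(halState, MOS_GPU_CONTEXT_RENDER3,
//                      MOS_GPU_NODE_3D, &options);      // reuses the current stream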
4085
4086 MOS_STATUS CmQueueRT::DestroyComputeGpuContext()
4087 {
4088 MOS_STATUS status = MOS_STATUS_SUCCESS;
4089 PCM_CONTEXT_DATA cmCtxData = nullptr;
4090 PCM_HAL_STATE cmHalState = nullptr;
4091
4092 if (MOS_GPU_CONTEXT_INVALID_HANDLE == m_gpuContextHandle)
4093 {
4094 return MOS_STATUS_SUCCESS;
4095 }
4096
4097 cmCtxData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
4098 if(!cmCtxData || !cmCtxData->cmHalState || !cmCtxData->cmHalState->osInterface)
4099 {
4100 return MOS_STATUS_INVALID_PARAMETER;
4101 }
4102
4103 cmHalState = cmCtxData->cmHalState;
4104
4105 status = cmHalState->osInterface->pfnDestroyGpuComputeContext(cmHalState->osInterface, m_gpuContextHandle);
4106
4107 return status;
4108 }
4109
4110 MOS_STATUS CmQueueRT::ExecuteGroupTask(CM_HAL_STATE *halState,
4111 CM_HAL_EXEC_TASK_GROUP_PARAM *taskParam,
4112 MOS_GPU_CONTEXT gpuContextName)
4113 {
4114 uint32_t old_stream_idx = halState->pfnSetGpuContext(halState, gpuContextName,
4115 m_streamIndex,
4116 m_gpuContextHandle);
4117 if (INVALID_STREAM_INDEX == old_stream_idx)
4118 {
4119 return MOS_STATUS_UNKNOWN;
4120 }
4121 RegisterSyncEvent();
4122 CM_CHK_MOSSTATUS_RETURN(SelectSyncBuffer(halState));
4123 MOS_STATUS result = halState->pfnExecuteGroupTask(halState, taskParam);
4124 halState->osInterface->streamIndex = old_stream_idx;
4125 return result;
4126 }
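// ExecuteGroupTask() above and ExecuteGeneralTask() below follow the same
// pattern: switch to this queue's GPU context (saving the caller's stream
// index), register the sync event, run the HAL execute call, then restore
// osInterface->streamIndex so queues on other streams are unaffected.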
4127
4128 MOS_STATUS CmQueueRT::ExecuteGeneralTask(CM_HAL_STATE *halState,
4129 CM_HAL_EXEC_TASK_PARAM *taskParam,
4130 MOS_GPU_CONTEXT gpuContextName)
4131 {
4132 uint32_t old_stream_idx = halState->pfnSetGpuContext(halState, gpuContextName,
4133 m_streamIndex,
4134 m_gpuContextHandle);
4135 if (INVALID_STREAM_INDEX == old_stream_idx)
4136 {
4137 return MOS_STATUS_UNKNOWN;
4138 }
4139 RegisterSyncEvent();
4140 MOS_STATUS result = halState->pfnExecuteTask(halState, taskParam);
4141 halState->osInterface->streamIndex = old_stream_idx;
4142 return result;
4143 }
4144
4145 #if CM_LOG_ON
4146 CM_HAL_STATE* CmQueueRT::GetHalState() { return m_device->GetHalState(); }
4147 #endif // #if CM_LOG_ON
4148 } // namespace
4149