/*
* Copyright (c) 2007-2021, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
//!
//! \file      cm_queue_rt.cpp
//! \brief     Contains CmQueueRT implementations.
//!

#include "cm_queue_rt.h"
#include "cm_event_ex.h"
#include "cm_mem.h"
#include "cm_device_rt.h"
#include "cm_event_rt.h"
#include "cm_task_rt.h"
#include "cm_task_internal.h"
#include "cm_thread_space_rt.h"
#include "cm_kernel_rt.h"
#include "cm_kernel_data.h"
#include "cm_buffer_rt.h"
#include "cm_group_space.h"
#include "cm_vebox_data.h"
#include "cm_surface_manager.h"
#include "cm_surface_2d_rt.h"
#include "cm_vebox_rt.h"
#include "cm_execution_adv.h"
#include "vp_common.h"

// Used by GPUCopy
#define BLOCK_PIXEL_WIDTH            (32)
#define BLOCK_HEIGHT                 (8)
#define BLOCK_HEIGHT_NV12            (4)
#define SUB_BLOCK_PIXEL_WIDTH        (8)
#define SUB_BLOCK_HEIGHT             (8)
#define SUB_BLOCK_HEIGHT_NV12        (4)
#define INNER_LOOP                   (4)
#define BYTE_COPY_ONE_THREAD         (1024*INNER_LOOP)  // 4K for each thread
#define THREAD_SPACE_WIDTH_INCREMENT (8)
// Used by unaligned copy
#define BLOCK_WIDTH                  (64)
#define PAGE_ALIGNED                 (0x1000)

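// Illustrative sketch (not driver logic): the constants above size the
// per-thread work items of the copy kernels below. In the unaligned-copy path,
// each thread handles a BLOCK_WIDTH x BLOCK_HEIGHT (64x8) byte tile, so a copy
// region of copyWidthByte x copyHeightRow needs a thread space of:
//
//   uint32_t threadWidth  = (uint32_t)ceil((double)copyWidthByte / BLOCK_WIDTH);
//   uint32_t threadHeight = (uint32_t)ceil((double)copyHeightRow / BLOCK_HEIGHT);
//
// which is exactly the computation EnqueueUnalignedCopyInternal performs.
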
#define GPUCOPY_KERNEL_LOCK(a) ((a)->locked = true)
#define GPUCOPY_KERNEL_UNLOCK(a) ((a)->locked = false)
using namespace CMRT_UMD;

namespace CMRT_UMD
{
    typedef struct _tdata
    {
        void* pCmQueueRT;
        void* buffer;
        size_t offset;
        unsigned char* sysMem;
        uint64_t sysMemSize;
        int dir;
        void* threadSpace;
        void* task;
        void* wait_event;
        void* event;
        unsigned option;
        uint64_t cpuFrrequency;
    }CopyThreadData;


//*-----------------------------------------------------------------------------
//| Purpose:    Create Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Create(CmDeviceRT *device,
                          CmQueueRT* &queue,
                          CM_QUEUE_CREATE_OPTION queueCreateOption)
{
    int32_t result = CM_SUCCESS;
    queue = new (std::nothrow) CmQueueRT(device, queueCreateOption);
    if( queue )
    {
        result = queue->Initialize( );
        if( result != CM_SUCCESS )
        {
            CmQueueRT::Destroy( queue );
        }
    }
    else
    {
        CM_ASSERTMESSAGE("Error: Failed to create CmQueue due to out of system memory.");
        result = CM_OUT_OF_HOST_MEMORY;
    }
    return result;
}
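
// Usage sketch (illustrative only; assumes the standard CMRT host API, with
// CreateCmDevice and CmDevice::CreateQueueEx as the application-side entry
// points that eventually reach CmQueueRT::Create above):
//
//   CmDevice *device = nullptr;
//   uint32_t version = 0;
//   CreateCmDevice(device, version);
//   CM_QUEUE_CREATE_OPTION option = {};           // zero-initialized options
//   option.QueueType = CM_QUEUE_TYPE_RENDER;      // render queue
//   CmQueue *queue = nullptr;
//   device->CreateQueueEx(queue, option);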

//*-----------------------------------------------------------------------------
//| Purpose:    Destroy Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Destroy(CmQueueRT* &queue )
{
    if( queue == nullptr )
    {
        return CM_FAILURE;
    }

    uint32_t result = queue->CleanQueue();

    queue->DestroyComputeGpuContext();

    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)queue->m_device->GetAccelData())->cmHalState;
    CM_CHK_NULL_RETURN_CMERROR(cmHalState);
    if (cmHalState->pfnUnRegisterStream != nullptr && queue->m_streamIndex != cmHalState->osInterface->streamIndex)
    {
        cmHalState->pfnUnRegisterStream(queue->m_streamIndex, cmHalState);
    }

    CmSafeDelete( queue );

    return result;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Constructor of Cm Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
CmQueueRT::CmQueueRT(CmDeviceRT *device,
                     CM_QUEUE_CREATE_OPTION queueCreateOption):
    m_device(device),
    m_eventArray(CM_INIT_EVENT_COUNT),
    m_eventCount(0),
    m_copyKernelParamArray(CM_INIT_GPUCOPY_KERNL_COUNT),
    m_copyKernelParamArrayCount(0),
    m_halMaxValues(nullptr),
    m_queueOption(queueCreateOption),
    m_usingVirtualEngine(false),
    m_osSyncEvent(nullptr),
    m_trackerIndex(0),
    m_fastTrackerIndex(0),
    m_streamIndex(0),
    m_gpuContextHandle(MOS_GPU_CONTEXT_INVALID_HANDLE),
    m_syncBufferHandle(INVALID_SYNC_BUFFER_HANDLE)
{
    MOS_ZeroMemory(&m_mosVeHintParams, sizeof(m_mosVeHintParams));
    MosUtilities::MosQueryPerformanceFrequency(&m_CPUperformanceFrequency);
}

//*-----------------------------------------------------------------------------
//| Purpose:    Destructor of Cm Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
CmQueueRT::~CmQueueRT()
{
    m_osSyncEvent = nullptr;
    uint32_t eventArrayUsedSize = m_eventArray.GetMaxSize();
    for( uint32_t i = 0; i < eventArrayUsedSize; i ++ )
    {
        CmEventRT* event = (CmEventRT*)m_eventArray.GetElement( i );
        uint32_t eventReleaseTimes = 0;
        while( event )
        {   // destroy the event whether or not it has been released by the user
            if(eventReleaseTimes > 2)
            {
                // The maximum reference count of an event is 2;
                // if the event is not released after 2 destroys, something is wrong
                CM_ASSERTMESSAGE("Error: The maximum reference count of an event is 2.");
                break;
            }
            CmEventRT::Destroy( event );
            eventReleaseTimes ++;
        }
    }
    m_eventArray.Delete();

    // Do not destroy the kernels in m_copyKernelParamArray.
    // They have been destroyed in ~CmDevice() before destroying the queue.
    for( uint32_t i = 0; i < m_copyKernelParamArrayCount; i ++ )
    {
        CM_GPUCOPY_KERNEL *gpuCopyParam = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement( i );
        CmSafeDelete(gpuCopyParam);
    }

    m_copyKernelParamArray.Delete();

    CM_HAL_STATE *hal_state = static_cast<CM_CONTEXT_DATA*>(m_device->GetAccelData())->cmHalState;
    ReleaseSyncBuffer(hal_state);
    return;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Initialize Cm Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Initialize()
{
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    CM_HAL_MAX_VALUES_EX* halMaxValuesEx = nullptr;
    CM_RETURN_CODE hr = CM_SUCCESS;
    m_device->GetHalMaxValues(m_halMaxValues, halMaxValuesEx);

    // Assign a new tracker and record the tracker index
    int ret = cmHalState->renderHal->trackerProducer.AssignNewTracker();
    CM_CHK_COND_RETURN((ret < 0), CM_FAILURE, "Error: failed to assign a new tracker");
    m_trackerIndex = ret;
    if (cmHalState->advExecutor)
    {
        ret = cmHalState->advExecutor->AssignNewTracker();
        CM_CHK_COND_RETURN((ret < 0), CM_FAILURE, "Error: failed to assign a new tracker");
        m_fastTrackerIndex = ret;
    }

    // Create or get the GPU context for this queue
    if (m_queueOption.UserGPUContext == true)
    {
        // Check the user-provided GPU context. If it is valid, create the queue on the existing context.
        if (cmHalState->osInterface->pfnIsGpuContextValid(cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext) != MOS_STATUS_SUCCESS)
        {
            // Return failure
            CM_ASSERTMESSAGE("Error: The user passed in a GPU context which is not valid");
            return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
        }
    }
    else
    {
        MOS_GPUCTX_CREATOPTIONS ctxCreateOption;
        ctxCreateOption.CmdBufferNumScale
          = HalCm_GetNumCmdBuffers(cmHalState->osInterface, cmHalState->cmDeviceParam.maxTasks);

        // Create MDF preset GPU context, update GPUContext in m_queueOption
        if (m_queueOption.QueueType == CM_QUEUE_TYPE_RENDER)
        {
            MOS_GPU_CONTEXT tmpGpuCtx = cmHalState->requestCustomGpuContext? MOS_GPU_CONTEXT_RENDER4: MOS_GPU_CONTEXT_RENDER3;

            // check if a context handle was specified by the user
            if (m_queueOption.GPUContext != 0)
            {
                tmpGpuCtx = (MOS_GPU_CONTEXT)m_queueOption.GPUContext;
            }

            // sanity check of context handle for CM
            if (HalCm_IsValidGpuContext(tmpGpuCtx) == false)
            {
                return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
            }

            // SSEU overriding
            if (cmHalState->cmHalInterface->IsOverridePowerOptionPerGpuContext())
            {
                // check whether sub-slices need to be shut down for VME usage
                if (m_queueOption.SseuUsageHint == CM_QUEUE_SSEU_USAGE_HINT_VME
                 && cmHalState->cmHalInterface->IsRequestShutdownSubslicesForVmeUsage())
                {
                    MEDIA_SYSTEM_INFO *gtSystemInfo = cmHalState->osInterface->pfnGetGtSystemInfo(cmHalState->osInterface);
                    ctxCreateOption.packed.SliceCount    = (uint8_t)gtSystemInfo->SliceCount;
                    ctxCreateOption.packed.SubSliceCount = (gtSystemInfo->SubSliceCount / gtSystemInfo->SliceCount) >> 1; // set to half
                    ctxCreateOption.packed.MaxEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
                    ctxCreateOption.packed.MinEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
                }

#if (_DEBUG || _RELEASE_INTERNAL)
                {
                    MediaUserSettingSharedPtr   userSettingPtr = cmHalState->osInterface->pfnGetUserSettingInstance(cmHalState->osInterface);
                    uint32_t                    value          = 0;
                    ReadUserSettingForDebug(
                        userSettingPtr,
                        value,
                        __MEDIA_USER_FEATURE_VALUE_SSEU_SETTING_OVERRIDE,
                        MediaUserSetting::Group::Device);

                    // +---------------+----------------+----------------+----------------+
                    // |   EUCountMin  |   EUCountMax   |     SSCount    |   SliceCount   |
                    // +-------------24+--------------16+---------------8+---------------0+
                    if (value != 0xDEADC0DE)
                    {
                        ctxCreateOption.packed.SliceCount            = value         & 0xFF;       // Bits 0-7
                        ctxCreateOption.packed.SubSliceCount         = (value >>  8) & 0xFF;       // Bits 8-15
                        ctxCreateOption.packed.MaxEUcountPerSubSlice = (value >> 16) & 0xFF;       // Bits 16-23
                        ctxCreateOption.packed.MinEUcountPerSubSlice = (value >> 24) & 0xFF;       // Bits 24-31
                    }
                }
#endif
            }

            ctxCreateOption.RAMode = m_queueOption.RAMode;
            ctxCreateOption.isRealTimePriority = m_queueOption.IsRealTimePrioriy;

            // Create render GPU context.
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                CreateGpuContext(cmHalState, tmpGpuCtx, MOS_GPU_NODE_3D,
                                 &ctxCreateOption));

#if (_RELEASE_INTERNAL || _DEBUG)
#if defined(CM_DIRECT_GUC_SUPPORT)
            // init GuC
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->osInterface->pfnInitGuC(cmHalState->osInterface, MOS_GPU_NODE_3D));
#endif
#endif
            m_queueOption.GPUContext = tmpGpuCtx;
        }
        else if (m_queueOption.QueueType == CM_QUEUE_TYPE_COMPUTE)
        {
            ctxCreateOption.RAMode = m_queueOption.RAMode;

            bool bVeUsedInCm = false; // change to true once the feature is enabled in the future
#if (_DEBUG || _RELEASE_INTERNAL)
            MOS_USER_FEATURE_VALUE_DATA UserFeatureData = {0};
            MOS_UserFeature_ReadValue_ID(
                nullptr, __MEDIA_USER_FEATURE_VALUE_MDF_CCS_USE_VE_INTERFACE,
                &UserFeatureData, cmHalState->osInterface->pOsContext);
            bVeUsedInCm = (UserFeatureData.u32Data == 0x1)? true: false;
#endif
            Mos_SetVirtualEngineSupported(cmHalState->osInterface, bVeUsedInCm);

            if (cmHalState->osInterface->veDefaultEnable && cmHalState->osInterface->bSupportVirtualEngine) // check if VE is enabled on this OS
            {
                // prepare the virtual engine hint parameters for this CM queue
                CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                    HalCm_PrepareVEHintParam(cmHalState, false, &m_mosVeHintParams));

                m_usingVirtualEngine = true;
            }

            ctxCreateOption.isRealTimePriority = m_queueOption.IsRealTimePrioriy;

            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                CreateGpuContext(cmHalState, MOS_GPU_CONTEXT_CM_COMPUTE,
                                 MOS_GPU_NODE_COMPUTE, &ctxCreateOption));
            m_queueOption.GPUContext = MOS_GPU_CONTEXT_CM_COMPUTE;
        }
        else
        {
            // Return failure
            CM_ASSERTMESSAGE("Error: The QueueType is not supported by MDF.");
            return CM_NOT_IMPLEMENTED;
        }
    }

finish:
    return hr;
}
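
// Illustrative sketch (not driver logic): packing a value for the SSEU
// override user-feature key read in Initialize() above, using the bit layout
// the code decodes (SliceCount in bits 0-7, SubSliceCount in bits 8-15,
// MaxEUcountPerSubSlice in bits 16-23, MinEUcountPerSubSlice in bits 24-31):
//
//   // 1 slice, 4 subslices, 8 EUs max and min per subslice:
//   uint32_t value = (8u << 24) | (8u << 16) | (4u << 8) | 1u;  // 0x08080401
//
// The sentinel 0xDEADC0DE leaves the context-creation defaults untouched.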

//*-----------------------------------------------------------------------------
//| Purpose:    Checks whether any kernel in the task has a thread argument
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::GetTaskHasThreadArg(CmKernelRT* kernelArray[], uint32_t numKernels, bool& threadArgExists)
{
    threadArgExists = false;

    for(uint32_t krn = 0; krn < numKernels; krn++)
    {
        if( !kernelArray[krn] )
        {
            CM_ASSERTMESSAGE("Error: A kernel pointer in the task is null.");
            return CM_FAILURE;
        }

        if( kernelArray[krn]->IsThreadArgExisted( ) )
        {
            threadArgExists = true;
            break;
        }
    }

    return CM_SUCCESS;
}
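
// Illustrative sketch (assumes the standard CMRT host API): a "thread arg" is
// a per-thread kernel argument set with CmKernel::SetThreadArg(), e.g.
//
//   kernel->SetThreadArg(threadId, argIndex, sizeof(int), &value);
//
// Tasks containing such kernels are validated against maxUserThreadsPerTask
// rather than maxUserThreadsPerTaskNoThreadArg (see Enqueue_RT below).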

//*-----------------------------------------------------------------------------
//| Purpose:    Enqueue Task
//| Arguments :
//|               kernelArray      [in]       Pointer to kernel array
//|               event            [in]       Reference to the pointer to Event
//|               threadSpace      [in]       Pointer to thread space
//|
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::Enqueue(
           CmTask* kernelArray,
           CmEvent* & event,
           const CmThreadSpace* threadSpace)
{
    INSERT_API_CALL_LOG(GetHalState());

    if (kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is null.");
        return CM_INVALID_ARG_VALUE;
    }

    CmTaskRT *kernelArrayRT = static_cast<CmTaskRT *>(kernelArray);
    uint32_t kernelCount = 0;
    kernelCount = kernelArrayRT->GetKernelCount();
    if (kernelCount == 0)
    {
        CM_ASSERTMESSAGE("Error: Invalid kernel count.");
        return CM_FAILURE;
    }

    if (kernelCount > m_halMaxValues->maxKernelsPerTask)
    {
        CM_ASSERTMESSAGE("Error: Kernel count exceeds max kernel per enqueue.");
        return CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
    }

    int32_t result;
    const CmThreadSpaceRT *threadSpaceRTConst = static_cast<const CmThreadSpaceRT *>(threadSpace);
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    CM_CHK_NULL_RETURN_CMERROR(cmHalState);
    if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
    {
        if (threadSpaceRTConst != nullptr)
        {
            result = EnqueueWithGroup(kernelArray, event, threadSpaceRTConst->GetThreadGroupSpace());
        }
        else
        {
            // If there isn't any shared thread space or associated thread space,
            // create a temporary (maxThreadCount x 1) thread group space whose
            // size equals the maximum thread count among the kernels that have
            // no thread space associated.
            uint32_t maxThreadCount = 1;
            bool usedCommonTGS = false;
            for (uint32_t i = 0; i < kernelCount; i++)
            {
                CmKernelRT *tmpKernel = kernelArrayRT->GetKernelPointer(i);
                CmThreadGroupSpace *tmpTGS = nullptr;
                tmpKernel->GetThreadGroupSpace(tmpTGS);

                if (tmpTGS == nullptr)
                {
                    usedCommonTGS = true;
                    uint32_t singleThreadCount = 0;
                    tmpKernel->GetThreadCount(singleThreadCount);
                    if (maxThreadCount < singleThreadCount)
                    {
                        maxThreadCount = singleThreadCount;
                    }
                }
            }

            CmThreadGroupSpace *threadGroupSpaceTemp = nullptr;
            if (usedCommonTGS == true)
            {
                result = m_device->CreateThreadGroupSpace(1, 1, maxThreadCount, 1, threadGroupSpaceTemp);
                if (result != CM_SUCCESS)
                {
                    CM_ASSERTMESSAGE("Error: Creating temporary thread group space failure.");
                    return result;
                }
            }

            result = EnqueueWithGroup(kernelArray, event, threadGroupSpaceTemp);

            if (threadGroupSpaceTemp != nullptr)
            {
                m_device->DestroyThreadGroupSpace(threadGroupSpaceTemp);
            }
        }
        return result;
    }

    // Check whether the task meets the requirements of the fast path:
    // if yes, switch to the fast path;
    // otherwise, continue on the legacy path.
    if (cmHalState && cmHalState->advExecutor && cmHalState->cmHalInterface &&
        cmHalState->advExecutor->SwitchToFastPath(kernelArray) &&
        cmHalState->cmHalInterface->IsFastPathByDefault())
    {
        auto gpu_context_name
                = static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext);
        uint32_t old_stream_idx = cmHalState->pfnSetGpuContext(cmHalState,
                                                               gpu_context_name,
                                                               m_streamIndex,
                                                               m_gpuContextHandle);
        result = cmHalState->advExecutor->SubmitTask(this, kernelArray, event,
                                                     threadSpace, gpu_context_name);
        cmHalState->osInterface->streamIndex = old_stream_idx;
        return result;
    }

    if (threadSpaceRTConst && threadSpaceRTConst->IsThreadAssociated())
    {
        if (threadSpaceRTConst->GetNeedSetKernelPointer() && threadSpaceRTConst->KernelPointerIsNULL())
        {
            CmKernelRT* tmp = nullptr;
            tmp = kernelArrayRT->GetKernelPointer(0);
            threadSpaceRTConst->SetKernelPointer(tmp);
        }
    }

#if _DEBUG
    if (threadSpaceRTConst)
    {
        CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
        if (!threadSpaceRT->IntegrityCheck(kernelArrayRT))
        {
            CM_ASSERTMESSAGE("Error: Invalid thread space.");
            return CM_INVALID_THREAD_SPACE;
        }
    }
#endif

    if(m_device->IsPrintEnable())
    {
        m_device->CreatePrintBuffer();
    }

    typedef CmKernelRT* pCmKernel;
    CmKernelRT** tmp = MOS_NewArray(pCmKernel, (kernelCount + 1));
    if(tmp == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }

    uint32_t totalThreadNumber = 0;
    for(uint32_t i = 0; i < kernelCount; i++)
    {
        tmp[ i ] = kernelArrayRT->GetKernelPointer(i);

        uint32_t singleThreadNumber = 0;
        tmp[i]->GetThreadCount(singleThreadNumber);
        if (singleThreadNumber == 0)
        {
            CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
            if (threadSpaceRT)
            {
                uint32_t width, height;
                threadSpaceRT->GetThreadSpaceSize(width, height);
                singleThreadNumber = width*height;
            }
        }
        totalThreadNumber += singleThreadNumber;
    }
    tmp[kernelCount] = nullptr;

    CmEventRT *eventRT = static_cast<CmEventRT *>(event);
    CM_TASK_CONFIG taskConfig;
    kernelArrayRT->GetProperty(taskConfig);
    result = Enqueue_RT(tmp, kernelCount, totalThreadNumber, eventRT, threadSpaceRTConst, kernelArrayRT->GetSyncBitmap(), kernelArrayRT->GetPowerOption(),
                        kernelArrayRT->GetConditionalEndBitmap(), kernelArrayRT->GetConditionalEndInfo(), &taskConfig);

    if (eventRT)
    {
        eventRT->SetKernelNames(kernelArrayRT, const_cast<CmThreadSpaceRT*>(threadSpaceRTConst), nullptr);
    }

    event = eventRT;
    MosSafeDeleteArray( tmp );

    return result;
}
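
// Usage sketch (illustrative only; assumes the standard CMRT host API):
//
//   CmTask *task = nullptr;
//   device->CreateTask(task);
//   task->AddKernel(kernel);                        // kernel prepared elsewhere
//   CmThreadSpace *threadSpace = nullptr;
//   device->CreateThreadSpace(width, height, threadSpace);
//   CmEvent *event = nullptr;
//   queue->Enqueue(task, event, threadSpace);       // non-blocking submission
//   event->WaitForTaskFinished();                   // block until the GPU finishes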

//*-----------------------------------------------------------------------------
//| Purpose:      Enqueue Task
//| Arguments :
//|               kernelArray      [in]       Pointer to kernel array
//|               event            [in]       Reference to the pointer to Event
//|               threadSpace      [in]       Pointer to thread space
//|
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Enqueue_RT(
                        CmKernelRT* kernelArray[],
                        const uint32_t kernelCount,
                        const uint32_t totalThreadCount,
                        CmEventRT* & event,
                        const CmThreadSpaceRT* threadSpace,
                        uint64_t    syncBitmap,
                        PCM_POWER_OPTION powerOption,
                        uint64_t    conditionalEndBitmap,
                        CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
                        PCM_TASK_CONFIG  taskConfig)
{
    CM_NORMALMESSAGE("================ in original path, media walker ===================");
    if(kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    if( kernelCount == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_INVALID_ARG_VALUE;
    }

    bool isEventVisible = (event == CM_NO_EVENT)? false:true;

    CLock Locker(m_criticalSectionTaskInternal);

    // set the current tracker index in renderhal
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_RETURN_CMERROR(cmData);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
    cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;

    CmTaskInternal* task = nullptr;
    int32_t result = CmTaskInternal::Create(kernelCount, totalThreadCount, kernelArray, threadSpace, m_device, syncBitmap, task, conditionalEndBitmap, conditionalEndInfo);
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
        return result;
    }

    LARGE_INTEGER nEnqueueTime;
    if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        CmTaskInternal::Destroy(task);
        return CM_FAILURE;
    }

    int32_t taskDriverId = -1;

    result = CreateEvent(task, isEventVisible, taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    task->SetPowerOption( powerOption );

    task->SetProperty(taskConfig);

    if( !m_enqueuedTasks.Push( task ) )
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
        return CM_FAILURE;
    }

    result = FlushTaskWithoutSync();

    return result;
}

int32_t CmQueueRT::Enqueue_RT(CmKernelRT* kernelArray[],
                        const uint32_t kernelCount,
                        const uint32_t totalThreadCount,
                        CmEventRT* & event,
                        const CmThreadGroupSpace* threadGroupSpace,
                        uint64_t    syncBitmap,
                        PCM_POWER_OPTION powerOption,
                        uint64_t    conditionalEndBitmap,
                        CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
                        PCM_TASK_CONFIG  taskConfig,
                        const CM_EXECUTION_CONFIG* krnExecCfg)
{
    CM_NORMALMESSAGE("================ in original path, gpgpu walker ===================");
    if(kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    if( kernelCount == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_INVALID_ARG_VALUE;
    }

    CLock Locker(m_criticalSectionTaskInternal);

    // set the current tracker index in renderhal
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_RETURN_CMERROR(cmData);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
    cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;

    CmTaskInternal* task = nullptr;
    int32_t result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray,
                                            threadGroupSpace, m_device, syncBitmap, task,
                                            conditionalEndBitmap, conditionalEndInfo, krnExecCfg);
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CmTaskInternal failure.");
        return result;
    }

    LARGE_INTEGER nEnqueueTime;
    if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        CmTaskInternal::Destroy(task);
        return CM_FAILURE;
    }

    int32_t taskDriverId = -1;

    result = CreateEvent(task, !(event == CM_NO_EVENT), taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    task->SetPowerOption( powerOption );

    task->SetProperty(taskConfig);

    if( !m_enqueuedTasks.Push( task ) )
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
        return CM_FAILURE;
    }

    result = FlushTaskWithoutSync();

    return result;
}

int32_t CmQueueRT::Enqueue_RT( CmKernelRT* kernelArray[],
                        CmEventRT* & event,
                        uint32_t numTasksGenerated,
                        bool isLastTask,
                        uint32_t hints,
                        PCM_POWER_OPTION powerOption)
{
    int32_t result = CM_FAILURE;
    uint32_t kernelCount = 0;
    CmTaskInternal* task = nullptr;
    int32_t taskDriverId = -1;
    bool isEventVisible = (event == CM_NO_EVENT) ? false:true;
    bool threadArgExists = false;

    if( kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }
    while( kernelArray[ kernelCount ] )
    {
        kernelCount++;
    }

    if( kernelCount < CM_MINIMUM_NUM_KERNELS_ENQWHINTS )
    {
        CM_ASSERTMESSAGE("Error: EnqueueWithHints requires at least 2 kernels.");
        return CM_FAILURE;
    }

    uint32_t totalThreadCount = 0;
    for( uint32_t i = 0; i < kernelCount; i ++ )
    {
        uint32_t threadCount = 0;
        kernelArray[i]->GetThreadCount( threadCount );
        totalThreadCount += threadCount;
    }

    if( GetTaskHasThreadArg(kernelArray, kernelCount, threadArgExists) != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Thread argument checking fails.");
        return CM_FAILURE;
    }

    if( !threadArgExists )
    {
        if (totalThreadCount > m_halMaxValues->maxUserThreadsPerTaskNoThreadArg )
        {
            CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
            return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
        }
    }
    else
    {
        if( totalThreadCount > m_halMaxValues->maxUserThreadsPerTask )
        {
            CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
            return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
        }
    }

    CLock Locker(m_criticalSectionTaskInternal);

    // set the current tracker index in renderhal
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_RETURN_CMERROR(cmData);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
    cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;

    result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray, task, numTasksGenerated, isLastTask, hints, m_device );

    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
        return result;
    }

    LARGE_INTEGER nEnqueueTime;
    if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) )
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        CmTaskInternal::Destroy(task);
        return CM_FAILURE;
    }

    result = CreateEvent(task, isEventVisible, taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    for( uint32_t i = 0; i < kernelCount; ++i )
    {
        CmKernelRT* kernel = nullptr;
        task->GetKernel(i, kernel);
        if( kernel != nullptr )
        {
            kernel->SetAdjustedYCoord(0);
        }
    }

    task->SetPowerOption( powerOption );

    if (!m_enqueuedTasks.Push(task))
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
        return CM_FAILURE;
    }

    result = FlushTaskWithoutSync();

    return result;
}

//*-----------------------------------------------------------------------------
//! Function to enqueue a task with a thread group space pointer
//! Arguments:
//!     1. Pointer to CmTask, which can only contain one kernel.
//!     2. Reference to the pointer to CmEvent that is to be returned
//!     3. Pointer to a CmThreadGroupSpace.
//! Return Value:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated
//!     CM_OUT_OF_HOST_MEMORY if out of host memory
//!     CM_FAILURE otherwise
//! Notes:
//!     If the kernel has a per-thread arg, the GPGPU object is used.
//!     If the kernel has no per-thread arg, the GPGPU walker is used.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueWithGroup( CmTask* task, CmEvent* & event, const CmThreadGroupSpace* threadGroupSpace)
{
    INSERT_API_CALL_LOG(GetHalState());

    int32_t result;

    if(task == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    // Check whether the task meets the requirements of the fast path:
    // if yes, switch to the fast path;
    // otherwise, continue on the legacy path.
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    if (cmHalState && cmHalState->advExecutor && cmHalState->cmHalInterface &&
        cmHalState->advExecutor->SwitchToFastPath(task) &&
        cmHalState->cmHalInterface->IsFastPathByDefault())
    {
        auto gpu_context_name
                = static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext);
        uint32_t old_stream_idx = cmHalState->pfnSetGpuContext(cmHalState,
                                                               gpu_context_name,
                                                               m_streamIndex,
                                                               m_gpuContextHandle);
        if (cmHalState->cmHalInterface->CheckMediaModeAvailability())
        {
            result = cmHalState->advExecutor->SubmitGpgpuTask(this, task, event,
                                                              threadGroupSpace,
                                                              gpu_context_name);
        }
        else
        {
            SelectSyncBuffer(cmHalState);
            result = cmHalState->advExecutor->SubmitComputeTask(this, task, event,
                                                                threadGroupSpace,
                                                                gpu_context_name);
        }
        cmHalState->osInterface->streamIndex = old_stream_idx;
        return result;
    }

    CmTaskRT *taskRT = static_cast<CmTaskRT *>(task);
    uint32_t count = 0;
    count = taskRT->GetKernelCount();

    if( count == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_FAILURE;
    }

    if(m_device->IsPrintEnable())
    {
        m_device->CreatePrintBuffer();
    }

    typedef CmKernelRT* pCmKernel;
    CmKernelRT** tmp = MOS_NewArray(pCmKernel, (count+1));
    if(tmp == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }

    uint32_t totalThreadNumber = 0;
    for(uint32_t i = 0; i < count; i++)
    {
        uint32_t singleThreadNumber = 0;
        tmp[ i ] = taskRT->GetKernelPointer(i);

        // Thread arguments are not allowed in the GPGPU_WALKER path
        if(tmp[i]->IsThreadArgExisted())
        {
            CM_ASSERTMESSAGE("Error: No thread args allowed when using group space");
            MosSafeDeleteArray(tmp);
            return CM_THREAD_ARG_NOT_ALLOWED;
        }

        tmp[i]->GetThreadCount(singleThreadNumber);
        totalThreadNumber += singleThreadNumber;
    }
    tmp[count] = nullptr;

    CmEventRT *eventRT = static_cast<CmEventRT *>(event);
    CM_TASK_CONFIG taskConfig;
    taskRT->GetProperty(taskConfig);
    result = Enqueue_RT( tmp, count, totalThreadNumber, eventRT,
                         threadGroupSpace, taskRT->GetSyncBitmap(),
                         taskRT->GetPowerOption(),
                         taskRT->GetConditionalEndBitmap(), taskRT->GetConditionalEndInfo(),
                         &taskConfig, taskRT->GetKernelExecuteConfig());

    if (eventRT)
    {
        eventRT->SetKernelNames(taskRT, nullptr, const_cast<CmThreadGroupSpace*>(threadGroupSpace));
    }

    event = eventRT;
    MosSafeDeleteArray( tmp );

    return result;
}
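
// Usage sketch (illustrative only; assumes the standard CMRT host API; the
// kernels in the task must not use per-thread arguments):
//
//   CmThreadGroupSpace *tgs = nullptr;
//   // 8x8 threads per group, 4x4 groups:
//   device->CreateThreadGroupSpace(8, 8, 4, 4, tgs);
//   CmEvent *event = nullptr;
//   queue->EnqueueWithGroup(task, event, tgs);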

CM_RT_API int32_t CmQueueRT::EnqueueWithHints(
                                        CmTask* kernelArray,
                                        CmEvent* & event,
                                        uint32_t hints)
{
    INSERT_API_CALL_LOG(GetHalState());

    int32_t            hr                = CM_FAILURE;
    uint32_t           count             = 0;
    uint32_t           index             = 0;
    CmKernelRT**       kernels           = nullptr;
    uint32_t           numTasks          = 0;
    bool               splitTask         = false;
    bool               lastTask          = false;
    uint32_t           numTasksGenerated = 0;
    CmEventRT          *eventRT = static_cast<CmEventRT *>(event);

    if (kernelArray == nullptr)
    {
        return CM_INVALID_ARG_VALUE;
    }
    CmTaskRT *kernelArrayRT = static_cast<CmTaskRT *>(kernelArray);
    count = kernelArrayRT->GetKernelCount();
    if( count == 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid kernel count.");
        hr = CM_FAILURE;
        goto finish;
    }

    if( count > m_halMaxValues->maxKernelsPerTask )
    {
        CM_ASSERTMESSAGE("Error: Kernel count exceeds maximum kernel per enqueue.");
        hr = CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
        goto finish;
    }

    for (uint32_t i = 0; i < count; ++i)
    {
        CmKernelRT* kernelTmp = nullptr;
        CmThreadSpaceRT* threadSpaceTmp = nullptr;
        kernelTmp = kernelArrayRT->GetKernelPointer(i);
        CM_CHK_NULL_GOTOFINISH_CMERROR(kernelTmp);
        kernelTmp->GetThreadSpace(threadSpaceTmp);
        CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpaceTmp);
        if (threadSpaceTmp->GetNeedSetKernelPointer() && threadSpaceTmp->KernelPointerIsNULL())
        {
            threadSpaceTmp->SetKernelPointer(kernelTmp);
        }
    }

#if _DEBUG
    if( !kernelArrayRT->IntegrityCheckKernelThreadspace() )
    {
        CM_ASSERTMESSAGE("Error: Integrity check for kernel thread space failed.");
        hr = CM_KERNEL_THREADSPACE_INTEGRITY_FAILED;
        goto finish;
    }
#endif

    numTasks = ( hints & CM_HINTS_MASK_NUM_TASKS ) >> CM_HINTS_NUM_BITS_TASK_POS;
    if( numTasks > 1 )
    {
        splitTask = true;
    }

    if( m_device->IsPrintEnable() )
    {
        m_device->CreatePrintBuffer();
    }

    kernels = MOS_NewArray(CmKernelRT*, (count + 1));
    CM_CHK_NULL_GOTOFINISH_CMERROR(kernels);

    do
    {
        for (index = 0; index < count; ++index)
        {
            kernels[ index ] = kernelArrayRT->GetKernelPointer( index );
        }

        kernels[ count ] = nullptr;

        if(splitTask)
        {
            if( numTasksGenerated == (numTasks - 1 ) )
            {
                lastTask = true;
            }
        }
        else
        {
            lastTask = true;
        }

        CM_CHK_CMSTATUS_GOTOFINISH(Enqueue_RT( kernels, eventRT, numTasksGenerated, lastTask, hints, kernelArrayRT->GetPowerOption() ));
        event = eventRT;
        numTasksGenerated++;

    }while(numTasksGenerated < numTasks);

finish:
    MosSafeDeleteArray( kernels );

    return hr;
}
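
// Illustrative sketch (not driver logic): the task count travels in the upper
// bits of 'hints' and is decoded above as
//
//   numTasks = (hints & CM_HINTS_MASK_NUM_TASKS) >> CM_HINTS_NUM_BITS_TASK_POS;
//
// so a caller asking for the work to be split into 4 sub-tasks would set
//
//   hints |= (4u << CM_HINTS_NUM_BITS_TASK_POS) & CM_HINTS_MASK_NUM_TASKS;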

//*-----------------------------------------------------------------------------
//! Enqueue a task, which contains one pre-defined kernel to
//! copy from host memory to surface.
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! the GPU to finish executing the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check whether the task has finished.
//! INPUT:
//!     1) Pointer to the CmSurface2D_RT as copy destination
//!     2) Pointer to the host memory as copy source
//!     3) Reference to the pointer to CMEvent
//!     4) A boolean value to indicate whether or not to flush the queue after enqueuing the task;
//!        by default the boolean value is TRUE.
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memory;
//!     CM_FAILURE otherwise.
//!     More error codes may be added.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPU( CmSurface2D* surface, const unsigned char* sysMem, CmEvent* & event )
{
    INSERT_API_CALL_LOG(GetHalState());

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
    return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, 0, 0, CM_FASTCOPY_CPU2GPU, CM_FASTCOPY_OPTION_NONBLOCKING, event);
}

//*-----------------------------------------------------------------------------
//! Enqueue a task, which contains one pre-defined kernel to
//! copy from surface to host memory.
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! the GPU to finish executing the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check whether the task has finished.
//! INPUT:
//!     1) Pointer to the CmSurface2D_RT as copy source
//!     2) Pointer to the host memory as copy destination
//!     3) Reference to the pointer to CMEvent
//!     4) A boolean value to indicate whether or not to flush the queue after enqueuing the task;
//!        by default the boolean value is TRUE.
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memory;
//!     CM_FAILURE otherwise.
//!     More error codes may be added.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPU( CmSurface2D* surface, unsigned char* sysMem, CmEvent* & event )
{
    INSERT_API_CALL_LOG(GetHalState());

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
    return EnqueueCopyInternal(surfaceRT, sysMem, 0, 0, CM_FASTCOPY_GPU2CPU, CM_FASTCOPY_OPTION_NONBLOCKING, event);
}
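
// Usage sketch for the two copy entry points above (illustrative only;
// assumes the standard CMRT host API):
//
//   CmSurface2D *surface = nullptr;
//   device->CreateSurface2D(width, height, CM_SURFACE_FORMAT_A8R8G8B8, surface);
//   CmEvent *event = nullptr;
//   queue->EnqueueCopyCPUToGPU(surface, hostSrc, event);  // upload, non-blocking
//   event->WaitForTaskFinished();
//   queue->EnqueueCopyGPUToCPU(surface, hostDst, event);  // readback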
1145 
EnqueueUnalignedCopyInternal(CmSurface2DRT * surface,unsigned char * sysMem,const uint32_t widthStride,const uint32_t heightStride,CM_GPUCOPY_DIRECTION direction)1146 int32_t CmQueueRT::EnqueueUnalignedCopyInternal( CmSurface2DRT* surface, unsigned char* sysMem, const uint32_t widthStride, const uint32_t heightStride, CM_GPUCOPY_DIRECTION direction)
1147 {
1148     int32_t         hr                          = CM_SUCCESS;
1149     uint32_t        bufferupSize               = 0;
1150     uint32_t        dstAddShiftOffset           = 0;
1151     uint32_t        threadWidth                 = 0;
1152     uint32_t        threadHeight                = 0;
1153     uint32_t        threadNum                   = 0;
1154     uint32_t        auxiliaryBufferupSize     = 0;
1155     uint32_t        width                       = 0;
1156     uint32_t        height                      = 0;
1157     uint32_t        sizePerPixel                = 0;
1158     uint32_t        widthByte                  = 0;
1159     uint32_t        copyWidthByte             = 0;
1160     uint32_t        copyHeightRow             = 0;
1161     uint32_t        strideInBytes             = widthStride;
1162     uint32_t        heightStrideInRows       = heightStride;
1163     size_t          linearAddress              = (size_t)sysMem;
1164     size_t          linearAddressAligned       = 0;
1165     unsigned char*  hybridCopyAuxSysMem        = nullptr;
1166 
1167     CmBufferUP             *bufferUP                  = nullptr;
1168     CmKernel               *kernel                    = nullptr;
1169     CmBufferUP             *hybridCopyAuxBufferUP     = nullptr;
1170     SurfaceIndex           *bufferIndexCM             = nullptr;
1171     SurfaceIndex           *hybridCopyAuxIndexCM      = nullptr;
1172     SurfaceIndex           *surf2DIndexCM             = nullptr;
1173     CmThreadSpace          *threadSpace               = nullptr;
1174     CmTask                 *gpuCopyTask               = nullptr;
1175     CmProgram              *gpuCopyProgram            = nullptr;
1176     CmEvent                *event                     = nullptr;
1177     CM_STATUS              status;
1178     CM_SURFACE_FORMAT      format;
1179 
1180     if ( surface )
1181     {
1182         CM_CHK_CMSTATUS_GOTOFINISH( surface->GetSurfaceDesc(width, height, format, sizePerPixel));
1183     }
1184     else
1185     {
1186         return CM_FAILURE;
1187     }
1188 
1189     widthByte                  = width * sizePerPixel;
1190     // the actual copy region
1191     copyWidthByte             = MOS_MIN(strideInBytes, widthByte);
1192     copyHeightRow             = MOS_MIN(heightStrideInRows, height);
1193 
1194     if(linearAddress == 0)
1195     {
1196         CM_ASSERTMESSAGE("Error: Pointer to system memory is null.");
1197         return CM_INVALID_ARG_VALUE;
1198     }
1199     if( (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_WIDTH ) || ( copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT) )
1200     {  // each thread handles 64x8 block data. This API will fail if it exceeds the max thread space's size
1201         CM_ASSERTMESSAGE("Error: Invalid copy size.");
1202         return CM_INVALID_ARG_SIZE;
1203     }
1204 
1205     if (sizeof (void *) == 8 ) //64-bit
1206     {
1207         linearAddressAligned        = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
1208     }
1209     else  //32-bit
1210     {
1211         linearAddressAligned        = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
1212     }
1213     //Calculate  Left Shift offset
1214     dstAddShiftOffset               = (uint32_t)(linearAddress - linearAddressAligned);
1215 
1216     if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
1217     {
1218         bufferupSize = MOS_ALIGN_CEIL(strideInBytes * (heightStrideInRows + copyHeightRow * 1/2) + (uint32_t)dstAddShiftOffset , 64);
1219     }
1220     else
1221     {
1222         bufferupSize = MOS_ALIGN_CEIL(strideInBytes * heightStrideInRows  + (uint32_t)dstAddShiftOffset, 64);
1223     }
1224 
1225     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferupSize, ( void * )linearAddressAligned, bufferUP));
1226     CM_CHK_CMSTATUS_GOTOFINISH(bufferUP->GetIndex(bufferIndexCM));
1227     CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM));
1228 
1229     CM_CHK_CMSTATUS_GOTOFINISH( m_device->LoadPredefinedCopyKernel(gpuCopyProgram));
1230     CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram);
1231 
1232     if (direction == CM_FASTCOPY_CPU2GPU)
1233     {
1234         if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
1235         {
1236             CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned_NV12), kernel, "PredefinedGPUCopyKernel"));
1237         }
1238         else
1239         {
1240             CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned), kernel, "PredefinedGPUCopyKernel"));
1241 
1242         }
1243         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM ));
1244         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM ));
1245     }
1246     else
1247     {
1248         if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
1249         {
1250             CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned_NV12), kernel, "PredefinedGPUCopyKernel"));
1251             auxiliaryBufferupSize = BLOCK_WIDTH * 2 * (heightStrideInRows + copyHeightRow * 1/2);
1252         }
1253         else
1254         {
1255             CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned), kernel, "PredefinedGPUCopyKernel"));
1256             auxiliaryBufferupSize = BLOCK_WIDTH * 2 * heightStrideInRows;
1257         }
1258         hybridCopyAuxSysMem = (unsigned char*)MOS_AlignedAllocMemory(auxiliaryBufferupSize, PAGE_ALIGNED);
1259         if(!hybridCopyAuxSysMem)
1260         {
1261             CM_ASSERTMESSAGE("Error: Out of system memory.");
1262             return CM_OUT_OF_HOST_MEMORY;
1263         }
1264         CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(auxiliaryBufferupSize, (void*)hybridCopyAuxSysMem, hybridCopyAuxBufferUP));
1265         CM_CHK_CMSTATUS_GOTOFINISH(hybridCopyAuxBufferUP->GetIndex(hybridCopyAuxIndexCM));
1266 
1267         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM ));
1268         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM ));
1269         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), &copyWidthByte ));
1270         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( SurfaceIndex ), hybridCopyAuxIndexCM ));
1271     }
1272 
1273     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInBytes ));
1274     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows ));
1275     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &dstAddShiftOffset ));
1276 
1277     threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_WIDTH );
1278     threadHeight = ( uint32_t )ceil( ( double )copyHeightRow/BLOCK_HEIGHT );
1279 
1280     threadNum = threadWidth * threadHeight;
1281     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
1282 
1283     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
1284     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
1285     CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
1286     CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, event, threadSpace));
1287 
1288     if(event)
1289     {
1290         CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status));
1291         while(status != CM_STATUS_FINISHED)
1292         {
1293             if (status == CM_STATUS_RESET)
1294             {
1295                 hr = CM_TASK_MEDIA_RESET;
1296                 goto finish;
1297             }
1298             CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status));
1299         }
1300     }
1301     // CPU copy unaligned data
1302     if( direction == CM_FASTCOPY_GPU2CPU)
1303     {
1304         uint32_t readOffset = 0;
1305         uint32_t copyLines = 0;
1306         unsigned char* startBuffer = (unsigned char*)linearAddressAligned;
1307 
1308         copyLines = (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) ? heightStrideInRows + MOS_MIN(heightStrideInRows, height) * 1 / 2 : heightStrideInRows;
1309 
1310         for(uint32_t i = 0; i < copyLines; ++i)
1311         {
1312             //copy beginning of line
1313             size_t beginLineWriteOffset = strideInBytes * i + dstAddShiftOffset;
1314             uint32_t mod = ((uintptr_t)startBuffer + beginLineWriteOffset) < BLOCK_WIDTH ? ((uintptr_t)startBuffer + beginLineWriteOffset) : ((uintptr_t)startBuffer + beginLineWriteOffset) & (BLOCK_WIDTH - 1);
1315             uint32_t beginLineCopySize = (mod == 0) ? 0:(BLOCK_WIDTH - mod);
1316             //fix copy size for cases where the surface width is small
1317             if((beginLineCopySize > widthByte) || ( beginLineCopySize == 0 && widthByte < BLOCK_WIDTH ) )
1318             {
1319                 beginLineCopySize = widthByte;
1320             }
1321             if(beginLineCopySize > 0)
1322             {
1323                 CmSafeMemCopy((void *)( (unsigned char *)startBuffer + beginLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset), beginLineCopySize);
1324             }
1325 
1326             //copy end of line
1327             uint32_t alignedWrites = (copyWidthByte - beginLineCopySize) &~ (BLOCK_WIDTH - 1);
1328             uint32_t endLineWriteOffset = beginLineWriteOffset + alignedWrites + beginLineCopySize;
1329             uint32_t endLineCopySize = dstAddShiftOffset+ i * strideInBytes + copyWidthByte - endLineWriteOffset;
1330             if(endLineCopySize > 0 && endLineWriteOffset > beginLineWriteOffset)
1331             {
1332                 CmSafeMemCopy((void *)((unsigned char *)startBuffer + endLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset + BLOCK_WIDTH), endLineCopySize);
1333             }
1334             readOffset += (BLOCK_WIDTH * 2);
1335         }
1336     }
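    // Worked example of the per-line fix-up above (illustrative numbers, not
    // from the source): with BLOCK_WIDTH = 64, a line whose destination address
    // yields mod = 48 gets its first 64 - 48 = 16 bytes copied from the aux
    // buffer head, and the bytes after the last 64-byte-aligned chunk from the
    // aux buffer tail; each line consumes 2 * BLOCK_WIDTH = 128 aux bytes.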
1337 
1338     CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(event));
1339     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
1340     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
1341     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(bufferUP));
1342     if (direction == CM_FASTCOPY_GPU2CPU)
1343     {
1344         if(hybridCopyAuxBufferUP)
1345         {
1346             CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(hybridCopyAuxBufferUP));
1347         }
1348         if(hybridCopyAuxSysMem)
1349         {
1350             MOS_AlignedFreeMemory(hybridCopyAuxSysMem);
1351             hybridCopyAuxSysMem = nullptr;
1352         }
1353     }
1354 finish:
1355     if(hr != CM_SUCCESS)
1356     {
1357         if(bufferUP == nullptr)
1358         {
1359             // The user needs to know whether the failure was caused by running out of BufferUP.
1360             hr = CM_GPUCOPY_OUT_OF_RESOURCE;
1361         }
1362 
1363         if(event)                          DestroyEventFast(event);
1364         if(kernel)                         m_device->DestroyKernel(kernel);
1365         if(threadSpace)                    m_device->DestroyThreadSpace(threadSpace);
1366         if(gpuCopyTask)                    m_device->DestroyTask(gpuCopyTask);
1367         if(bufferUP)                       m_device->DestroyBufferUP(bufferUP);
1368         if(hybridCopyAuxBufferUP)          m_device->DestroyBufferUP(hybridCopyAuxBufferUP);
1369         if(hybridCopyAuxSysMem)            {MOS_AlignedFreeMemory(hybridCopyAuxSysMem); hybridCopyAuxSysMem = nullptr;}
1370     }
1371 
1372     return hr;
1373 }
1374 //*-----------------------------------------------------------------------------
1375 //! Enqueue a task, which contains one pre-defined kernel to
1376 //! copy from surface to host memory or from host memory to surface
1377 //! This is a non-blocking call. i.e. it returns immediately without waiting for
1378 //! GPU to finish the execution of the task.
1379 //! A CmEvent is generated each time a task is enqueued. The CmEvent can
1380 //! be used to check if the task finishes.
1381 //! INPUT:
1382 //!     1) Pointer to the CmSurface2D
1383 //!     2) Pointer to the host memory
1384 //!     3) Width stride in bytes; if there is no padding in system memory, it is set to zero.
1385 //!     4) Height stride in rows; if there is no padding in system memory, it is set to zero.
1386 //!     5) Copy direction, cpu->gpu (linear->tiled) or gpu->cpu (tiled->linear)
1387 //!     6) Reference to the pointer to CmEvent
1388 //! OUTPUT:
1389 //!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
1390 //!     CM_OUT_OF_HOST_MEMORY if out of host memory;
1391 //!     CM_FAILURE otherwise.
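//! Usage sketch (illustrative only; assumes `queue`, `surface` and a 16-byte
//! aligned `sysMem` allocation already exist):
//!     CmEvent *event = nullptr;
//!     // zero strides mean "no padding in system memory"
//!     int32_t result = queue->EnqueueCopyInternal(surface, sysMem, 0, 0,
//!                                                 CM_FASTCOPY_GPU2CPU, 0, event);
//!     if (result == CM_SUCCESS && event)
//!         event->WaitForTaskFinished();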
1392 //*-----------------------------------------------------------------------------
1393 int32_t CmQueueRT::EnqueueCopyInternal(CmSurface2DRT* surface,
1394                                 unsigned char* sysMem,
1395                                 const uint32_t widthStride,
1396                                 const uint32_t heightStride,
1397                                 CM_GPUCOPY_DIRECTION direction,
1398                                 const uint32_t option,
1399                                 CmEvent* & event)
1400 {
1401     int32_t hr                  = CM_FAILURE;
1402     uint32_t width               = 0;
1403     uint32_t height              = 0;
1404     uint32_t sizePerPixel        = 0;
1405     CM_SURFACE_FORMAT format    = CM_SURFACE_FORMAT_INVALID;
1406 
1407     if (surface)
1408     {
1409         CM_CHK_CMSTATUS_GOTOFINISH(surface->GetSurfaceDesc(width, height, format, sizePerPixel));
1410     }
1411     else
1412     {
1413         return CM_GPUCOPY_INVALID_SURFACES;
1414     }
1415 
1416     if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
1417     {
1418         hr = EnqueueCopyInternal_2Planes(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event);
1419     }
1420     else
1421     {
1422         hr = EnqueueCopyInternal_1Plane(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event);
1423     }
1424 
1425 finish:
1426     return hr;
1427 }
1428 
1429 int32_t CmQueueRT::EnqueueCopyInternal_1Plane(CmSurface2DRT* surface,
1430                                     unsigned char* sysMem,
1431                                     CM_SURFACE_FORMAT format,
1432                                     const uint32_t widthInPixel,
1433                                     const uint32_t widthStride,
1434                                     const uint32_t heightInRow,
1435                                     const uint32_t heightStride,
1436                                     const uint32_t sizePerPixel,
1437                                     CM_GPUCOPY_DIRECTION direction,
1438                                     const uint32_t option,
1439                                     CmEvent* & event )
1440 {
1441     int32_t         hr                      = CM_SUCCESS;
1442     uint32_t        tempHeight              = heightInRow;
1443     uint32_t        strideInBytes         = widthStride;
1444     uint32_t        strideInDwords        = 0;
1445     uint32_t        heightStrideInRows   = heightStride;
1446     uint32_t        addedShiftLeftOffset    = 0;
1447     size_t          linearAddress          = (size_t)sysMem;
1448     size_t          linearAddressAligned   = 0;
1449 
1450     CmKernel        *kernel            = nullptr;
1451     CmBufferUP      *cmbufferUP        = nullptr;
1452     SurfaceIndex    *bufferIndexCM     = nullptr;
1453     SurfaceIndex    *surf2DIndexCM     = nullptr;
1454     CmThreadSpace   *threadSpace                = nullptr;
1455     CmTask          *gpuCopyTask       = nullptr;
1456     CmEvent         *internalEvent     = nullptr;
1457 
1458     uint32_t        threadWidth             = 0;
1459     uint32_t        threadHeight            = 0;
1460     uint32_t        threadNum               = 0;
1461     uint32_t        widthDword             = 0;
1462     uint32_t        widthByte              = 0;
1463     uint32_t        copyWidthByte         = 0;
1464     uint32_t        copyHeightRow         = 0;
1465     uint32_t        sliceCopyHeightRow   = 0;
1466     uint32_t        sliceCopyBufferUPSize   = 0;
1467     int32_t         totalBufferUPSize       = 0;
1468     uint32_t        startX                 = 0;
1469     uint32_t        startY                 = 0;
1470     bool            blSingleEnqueue         = true;
1471     CM_GPUCOPY_KERNEL *gpuCopyKernelParam     = nullptr;
1472 
1473     PCM_HAL_STATE   cmHalState    =        \
1474         ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
1475 
1476     widthByte    = widthInPixel * sizePerPixel;
1477 
1478     //If no stride is given, default it to the width in bytes
1479     if(strideInBytes == 0)
1480     {
1481         strideInBytes = widthByte;
1482     }
1483 
1484     if(heightStrideInRows == 0)
1485     {
1486         heightStrideInRows = heightInRow;
1487     }
1488 
1489     // the actual copy region
1490     copyWidthByte = MOS_MIN(strideInBytes, widthByte);
1491     copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow);
1492 
1493     // Make sure the stride and the start address of system memory are 16-byte aligned.
1494     // If there is no padding in system memory, strideInBytes == widthByte.
1495     if(strideInBytes & 0xf)
1496     {
1497         CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned.");
1498         return CM_GPUCOPY_INVALID_STRIDE;
1499     }
1500     if((linearAddress & 0xf) || (linearAddress == 0))
1501     {
1502         CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
1503         return CM_GPUCOPY_INVALID_SYSMEM;
1504     }
1505 
1506     //Calculate actual total size of system memory
1507     totalBufferUPSize = strideInBytes * heightStrideInRows;
1508 
1509     //Check thread space width here
1510     if( copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH *4 )
1511     {  // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
1512         CM_ASSERTMESSAGE("Error: Invalid copy size.");
1513         return CM_GPUCOPY_INVALID_SIZE;
1514     }
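    // Worked limit (assuming CM_MAX_THREADSPACE_WIDTH_FOR_MW == 511): each
    // thread covers BLOCK_PIXEL_WIDTH * 4 = 128 bytes of a row, so the widest
    // legal copy is 511 * 128 = 65408 bytes per row.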
1515 
1516     while (totalBufferUPSize > 0)
1517     {
1518         if (sizeof (void *) == 8 ) //64-bit
1519         {
1520             linearAddressAligned        = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
1521         }
1522         else  //32-bit
1523         {
1524             linearAddressAligned        = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
1525         }
1526 
1527         //Calculate left shift offset
1528         addedShiftLeftOffset = (uint32_t)(linearAddress - linearAddressAligned);
1529         totalBufferUPSize   += addedShiftLeftOffset;
1530 
1531         if (totalBufferUPSize > CM_MAX_1D_SURF_WIDTH)
1532         {
1533             blSingleEnqueue = false;
1534             sliceCopyHeightRow = ((CM_MAX_1D_SURF_WIDTH - addedShiftLeftOffset)/(strideInBytes*(BLOCK_HEIGHT * INNER_LOOP))) * (BLOCK_HEIGHT * INNER_LOOP);
1535             sliceCopyBufferUPSize = sliceCopyHeightRow * strideInBytes + addedShiftLeftOffset;
1536             tempHeight = sliceCopyHeightRow;
1537         }
1538         else
1539         {
1540             sliceCopyHeightRow = copyHeightRow;
1541             sliceCopyBufferUPSize = totalBufferUPSize;
1542             if (!blSingleEnqueue)
1543             {
1544                 tempHeight = sliceCopyHeightRow;
1545             }
1546         }
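        // Example of the slicing math above: BLOCK_HEIGHT * INNER_LOOP = 32, so
        // sliceCopyHeightRow is rounded down to a multiple of 32 rows; e.g. if
        // the BufferUP budget allows 1000 rows at the current stride, this pass
        // copies 992 rows and the remaining rows roll over to the next loop
        // iteration (illustrative numbers, not from the source).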
1547 
1548         //Check thread space height here
1549         if(sliceCopyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP )
1550         {  // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
1551             CM_ASSERTMESSAGE("Error: Invalid copy size.");
1552             return CM_GPUCOPY_INVALID_SIZE;
1553         }
1554 
1555         kernel = nullptr;
1556         CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateBufferUP(  sliceCopyBufferUPSize, ( void * )linearAddressAligned, cmbufferUP ));
1557         CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);
1558 
1559         //Configure memory object control for BufferUP to solve the cache-line issue.
1560         if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired())
1561         {
1562             CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
1563         }
1564         CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, sliceCopyHeightRow, format, direction, gpuCopyKernelParam));
1565         CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
1566         kernel = gpuCopyKernelParam->kernel;
1567 
1568         CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);
1569 
1570         CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);
1571         CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->GetIndex( bufferIndexCM ));
1572         CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex( surf2DIndexCM ));
1573 
1574         threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_PIXEL_WIDTH/4 );
1575         threadHeight = ( uint32_t )ceil( ( double )sliceCopyHeightRow/BLOCK_HEIGHT/INNER_LOOP );
1576         threadNum = threadWidth * threadHeight;
1577         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
1578         CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
1579 
1580         if(direction == CM_FASTCOPY_GPU2CPU)
1581         {
1582             surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag
1583         }
1584 
1585         if( direction == CM_FASTCOPY_CPU2GPU)
1586         {
1587             if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
1588             {
1589                 CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED));
1590             }
1591             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM) );
1592             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM ));
1593         }
1594         else
1595         {
1596             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM ));
1597             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM ));
1598         }
1599 
1600 
1601         widthDword = (uint32_t)ceil((double)widthByte / 4);
1602         strideInDwords = (uint32_t)ceil((double)strideInBytes / 4);
1603 
1604         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInDwords ));
1605         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows ));
1606         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &addedShiftLeftOffset ));
1607         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), &threadHeight ));
1608 
1609         if (direction == CM_FASTCOPY_GPU2CPU)  //GPU-->CPU, read
1610         {
1611             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &widthDword ));
1612             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &tempHeight ));
1613             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 8, sizeof(uint32_t), &startX));
1614             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 9, sizeof(uint32_t), &startY));
1615         }
1616         else  //CPU-->GPU, write
1617         {
1618             //this only works for the kernel surfaceCopy_write_32x32
1619             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &startX ));
1620             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &startY ));
1621         }
1622 
1623         CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
1624         CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
1625         if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
1626         {
1627             // disable turbo
1628             CM_TASK_CONFIG taskConfig;
1629             CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
1630             taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
1631             gpuCopyTask->SetProperty(taskConfig);
1632         }
1633         CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, internalEvent,
1634                                            threadSpace));
1635 
1636         GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
1637 
1638         //update for next slice
1639         linearAddress += sliceCopyBufferUPSize - addedShiftLeftOffset;
1640         totalBufferUPSize -= sliceCopyBufferUPSize;
1641         copyHeightRow -= sliceCopyHeightRow;
1642         startX = 0;
1643         startY += sliceCopyHeightRow;
1644 
1645         if(totalBufferUPSize > 0)   //Intermediate event, we don't need it
1646         {
1647             CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
1648         }
1649         else //Last event: keep it or destroy it per the user's request
1650         {
1651             if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent))
1652             {
1653                 CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished());
1654             }
1655 
1656             if(event == CM_NO_EVENT)  //User doesn't need CmEvent for this copy
1657             {
1658                 event = nullptr;
1659                 CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
1660             }
1661             else //User needs this CmEvent
1662             {
1663                 event = internalEvent;
1664             }
1665         }
1666 
1667         CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
1668         CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
1669         CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUP));
1670     }
1671 
1672 finish:
1673 
1674     if(hr != CM_SUCCESS)
1675     {
1676         if(cmbufferUP == nullptr)
1677         {
1678             // The user needs to know whether the failure was caused by running out of BufferUP.
1679             hr = CM_GPUCOPY_OUT_OF_RESOURCE;
1680         }
1681 
1682         if(kernel && gpuCopyKernelParam)        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
1683         if(threadSpace)                                m_device->DestroyThreadSpace(threadSpace);
1684         if(gpuCopyTask)                       m_device->DestroyTask(gpuCopyTask);
1685         if(cmbufferUP)                        m_device->DestroyBufferUP(cmbufferUP);
1686         if(internalEvent)                     DestroyEventFast(internalEvent);
1687 
1688         // CM_FAILURE for all the other errors
1689         // return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens
1690         if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT)
1691         {
1692             hr = CM_FAILURE;
1693         }
1694     }
1695 
1696     return hr;
1697 }
1698 
1699 int32_t CmQueueRT::EnqueueCopyInternal_2Planes(CmSurface2DRT* surface,
1700                                         unsigned char* sysMem,
1701                                         CM_SURFACE_FORMAT format,
1702                                         const uint32_t widthInPixel,
1703                                         const uint32_t widthStride,
1704                                         const uint32_t heightInRow,
1705                                         const uint32_t heightStride,
1706                                         const uint32_t sizePerPixel,
1707                                         CM_GPUCOPY_DIRECTION direction,
1708                                         const uint32_t option,
1709                                         CmEvent* & event)
1710 {
1711     int32_t         hr                      = CM_SUCCESS;
1712     uint32_t        strideInBytes         = widthStride;
1713     uint32_t        strideInDwords        = 0;
1714     uint32_t        heightStrideInRows   = heightStride;
1715     size_t          linearAddressY        = 0;
1716     size_t          linearAddressUV       = 0;
1717     size_t          linearAddressAlignedY = 0;
1718     size_t          linearAddressAlignedUV = 0;
1719     uint32_t        addedShiftLeftOffsetY  = 0;
1720     uint32_t        addedShiftLeftOffsetUV = 0;
1721 
1722     CmKernel        *kernel                = nullptr;
1723     CmBufferUP      *cmbufferUPY          = nullptr;
1724     CmBufferUP      *cmbufferUPUV         = nullptr;
1725     SurfaceIndex    *bufferUPIndexY       = nullptr;
1726     SurfaceIndex    *bufferUPIndexUV      = nullptr;
1727     SurfaceIndex    *surf2DIndexCM         = nullptr;
1728     CmThreadSpace   *threadSpace           = nullptr;
1729     CmTask          *gpuCopyTask           = nullptr;
1730     CmEvent         *internalEvent         = nullptr;
1731 
1732     uint32_t        threadWidth             = 0;
1733     uint32_t        threadHeight            = 0;
1734     uint32_t        threadNum               = 0;
1735     uint32_t        widthDword             = 0;
1736     uint32_t        widthByte              = 0;
1737     uint32_t        copyWidthByte         = 0;
1738     uint32_t        copyHeightRow         = 0;
1739     uint32_t        bufferUPYSize         = 0;
1740     uint32_t        bufferUPUVSize        = 0;
1741 
1742     CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
1743     PCM_HAL_STATE       cmHalState    =      \
1744         ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
1745 
1746     widthByte = widthInPixel * sizePerPixel;
1747 
1748     //If no stride is given, default it to the width in bytes
1749     if (strideInBytes == 0)
1750     {
1751         strideInBytes = widthByte;
1752     }
1753 
1754     if (heightStrideInRows == 0)
1755     {
1756         heightStrideInRows = heightInRow;
1757     }
1758 
1759     // the actual copy region
1760     copyWidthByte = MOS_MIN(strideInBytes, widthByte);
1761     copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow);
1762 
1763     // Make sure the stride and the start address of system memory are 16-byte aligned.
1764     // If there is no padding in system memory, strideInBytes == widthByte.
1765     if (strideInBytes & 0xf)
1766     {
1767         CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned.");
1768         return CM_GPUCOPY_INVALID_STRIDE;
1769     }
1770 
1771     //Check thread space width here
1772     if (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH * 4)
1773     {  // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
1774         CM_ASSERTMESSAGE("Error: Invalid copy size.");
1775         return CM_GPUCOPY_INVALID_SIZE;
1776     }
1777 
1778     linearAddressY = (size_t)sysMem;
1779     linearAddressUV = (size_t)((char*)sysMem + strideInBytes * heightStrideInRows);
1780 
1781     if ((linearAddressY & 0xf) || (linearAddressY == 0) || (linearAddressUV & 0xf))
1782     {
1783         CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
1784         return CM_GPUCOPY_INVALID_SYSMEM;
1785     }
1786 
1787     if (sizeof (void *) == 8) //64-bit
1788     {
1789         linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
1790         linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
1791     }
1792     else  //32-bit
1793     {
1794         linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
1795         linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
1796     }
1797 
1798     //Calculate left shift offsets
1799     addedShiftLeftOffsetY = (uint32_t)(linearAddressY - linearAddressAlignedY);
1800     addedShiftLeftOffsetUV = (uint32_t)(linearAddressUV - linearAddressAlignedUV);
1801 
1802     //Calculate the actual total size of system memory, assuming an NV12/P010/P016 layout
1803     bufferUPYSize = strideInBytes * heightStrideInRows + addedShiftLeftOffsetY;
1804     bufferUPUVSize = strideInBytes * copyHeightRow * 1 / 2 + addedShiftLeftOffsetUV;
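    // Two-plane layout example (illustrative numbers): for a 1920x1080 NV12
    // surface with strideInBytes = 2048 and heightStrideInRows = 1088, the Y
    // BufferUP covers 2048 * 1088 bytes, while the interleaved UV plane has
    // half as many rows, so its BufferUP covers 2048 * 1080 / 2 bytes; both
    // sizes are then padded by their page-alignment shift offsets.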
1805 
1806     //Check thread space height here
1807     if (copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP)
1808     {  // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
1809         CM_ASSERTMESSAGE("Error: Invalid copy size.");
1810         return CM_GPUCOPY_INVALID_SIZE;
1811     }
1812 
1813     kernel = nullptr;
1814     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPYSize, (void *)linearAddressAlignedY, cmbufferUPY));
1815     CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY);
1816     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPUVSize, (void *)linearAddressAlignedUV, cmbufferUPUV));
1817     CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV);
1818 
1819     //Configure memory object control for the two BufferUP to solve the same cache-line coherency issue.
1820     if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired())
1821     {
1822         CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
1823         CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
1824     }
1825     else
1826     {
1827         CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPY)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0));
1828         CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPUV)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0));
1829     }
1830 
1831     CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, copyHeightRow, format, direction, gpuCopyKernelParam));
1832     CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
1833     kernel = gpuCopyKernelParam->kernel;
1834 
1835     CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);
1836 
1837     CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY);
1838     CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV);
1839     CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->GetIndex(bufferUPIndexY));
1840     CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->GetIndex(bufferUPIndexUV));
1841     CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM));
1842 
1843     threadWidth = (uint32_t)ceil((double)copyWidthByte / BLOCK_PIXEL_WIDTH / 4);
1844     threadHeight = (uint32_t)ceil((double)copyHeightRow / BLOCK_HEIGHT / INNER_LOOP);
1845     threadNum = threadWidth * threadHeight;
1846     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadNum));
1847     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
1848 
1849     widthDword = (uint32_t)ceil((double)widthByte / 4);
1850     strideInDwords = (uint32_t)ceil((double)strideInBytes / 4);
1851 
1852     if (direction == CM_FASTCOPY_CPU2GPU) //Write
1853     {
1854         //Input BufferUP_Y and BufferUP_UV
1855         if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
1856         {
1857             CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED));
1858         }
1859         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), bufferUPIndexY));
1860         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexUV));
1861         //Output Surface2D
1862         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), surf2DIndexCM));
1863         //Other parameters
1864         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords));
1865         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows));
1866         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY));
1867         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV));
1868         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight));
1869     }
1870     else  //Read
1871     {
1872         //Input Surface2D
1873         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surf2DIndexCM));
1874         //Output BufferUP_Y and BufferUP_UV
1875         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexY));
1876         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), bufferUPIndexUV));
1877         //Other parameters
1878         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords));
1879         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows));
1880         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY));
1881         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV));
1882         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight));
1883         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(8, sizeof(uint32_t), &widthDword));
1884         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(9, sizeof(uint32_t), &heightInRow));
1885 
1886         surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag
1887     }
1888 
1889     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
1890     CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel(kernel));
1891     if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
1892     {
1893         // disable turbo
1894         CM_TASK_CONFIG taskConfig;
1895         CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
1896         taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
1897         gpuCopyTask->SetProperty(taskConfig);
1898     }
1899     CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, internalEvent,
1900                                        threadSpace));
1901 
1902     GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
1903 
1904     if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent))
1905     {
1906         CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished());
1907     }
1908 
1909     if (event == CM_NO_EVENT)  //User doesn't need CmEvent for this copy
1910     {
1911         event = nullptr;
1912         CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
1913     }
1914     else //User needs this CmEvent
1915     {
1916         event = internalEvent;
1917     }
1918 
1919     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
1920     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
1921     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPY));
1922     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPUV));
1923 
1924 finish:
1925 
1926     if (hr != CM_SUCCESS)
1927     {
1928         if ((cmbufferUPY == nullptr) || (cmbufferUPUV == nullptr))
1929         {
1930             // The user needs to know whether the failure was caused by running out of BufferUP.
1931             hr = CM_GPUCOPY_OUT_OF_RESOURCE;
1932         }
1933 
1934         if (kernel && gpuCopyKernelParam)        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
1935         if (threadSpace)                                m_device->DestroyThreadSpace(threadSpace);
1936         if (gpuCopyTask)                       m_device->DestroyTask(gpuCopyTask);
1937         if (cmbufferUPY)                      m_device->DestroyBufferUP(cmbufferUPY);
1938         if (cmbufferUPUV)                     m_device->DestroyBufferUP(cmbufferUPUV);
1939         if (internalEvent)                     DestroyEventFast(internalEvent);
1940 
1941         // CM_FAILURE for all the other errors
1942         // return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens
1943         if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT)
1944         {
1945             hr = CM_FAILURE;
1946         }
1947     }
1948 
1949     return hr;
1950 }
1951 
1952 //*-----------------------------------------------------------------------------
1953 //! Enqueue a task, which contains one pre-defined kernel to copy from video memory to video memory
1954 //! This is a non-blocking call. i.e. it returns immediately without waiting for
1955 //! GPU to finish the execution of the task.
1956 //! A CmEvent is generated each time a task is enqueued. The CmEvent can
1957 //! be used to check if the task finishes.
1958 //! INPUT:
1959 //!     1) Pointer to the CmSurface2D as copy destination
1960 //!     2) Pointer to the CmSurface2D  as copy source
1961 //!     3) Option passed from user, blocking copy, non-blocking copy or disable turbo boost
1962 //!     4) Reference to the pointer to CmEvent
1963 //! OUTPUT:
1964 //!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
1965 //!     CM_OUT_OF_HOST_MEMORY if out of host memory;
1966 //!     CM_GPUCOPY_INVALID_SURFACES if input/output surfaces' width/format are different or
1967 //!                                 input surface's height is larger than output surface's
1968 //! Restrictions:
1969 //!     1) Surface's width should be 64-byte aligned.
1970 //!     2) The input surface's width/height/format should be the same as output surface's.
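//! Usage sketch (illustrative only; assumes two surfaces with matching
//! width/height/format already exist):
//!     CmEvent *event = nullptr;
//!     queue->EnqueueCopyGPUToGPU(dstSurf, srcSurf, CM_FASTCOPY_OPTION_BLOCKING, event);
//!     // with the blocking option the call only returns once the copy finished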
1971 //*-----------------------------------------------------------------------------
1972 CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToGPU( CmSurface2D* outputSurface, CmSurface2D* inputSurface, uint32_t option, CmEvent* & event )
1973 {
1974     INSERT_API_CALL_LOG(GetHalState());
1975 
1976     if (!m_device->HasGpuCopyKernel())
1977     {
1978         return CM_NOT_IMPLEMENTED;
1979     }
1980 
1981     uint32_t srcSurfaceWidth = 0;
1982     uint32_t srcSurfaceHeight = 0;
1983     uint32_t dstSurfaceWidth = 0;
1984     uint32_t dstSurfaceHeight = 0;
1985 
1986     CM_SURFACE_FORMAT srcSurfaceFormat = CM_SURFACE_FORMAT_INVALID;
1987     CM_SURFACE_FORMAT dstSurfaceFormat = CM_SURFACE_FORMAT_INVALID;
1988 
1989     int32_t             hr = CM_SUCCESS;
1990     uint32_t            srcSizePerPixel = 0;
1991     uint32_t            dstSizePerPixel = 0;
1992     uint32_t            threadWidth = 0;
1993     uint32_t            threadHeight = 0;
1994 
1995     CmKernel            *kernel = nullptr;
1996     SurfaceIndex        *surfaceInputIndex = nullptr;
1997     SurfaceIndex        *surfaceOutputIndex = nullptr;
1998     CmThreadSpace       *threadSpace = nullptr;
1999     CmTask              *task = nullptr;
2000     uint32_t            srcSurfAlignedWidthInBytes = 0;
2001     CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
2002 
2003     if ((outputSurface == nullptr) || (inputSurface == nullptr))
2004     {
2005         CM_ASSERTMESSAGE("Error: Pointer to input surface or output surface is null.");
2006         return CM_FAILURE;
2007     }
2008 
2009     PCM_HAL_STATE   cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
2010     CmSurface2DRT *outputSurfaceRT = static_cast<CmSurface2DRT *>(outputSurface);
2011     CmSurface2DRT *inputSurfaceRT = static_cast<CmSurface2DRT *>(inputSurface);
2012     if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
2013     {
2014         CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->SetCompressionMode(MEMCOMP_DISABLED));
2015     }
2016 
2017     CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->GetSurfaceDesc(dstSurfaceWidth, dstSurfaceHeight, dstSurfaceFormat, dstSizePerPixel));
2018     CM_CHK_CMSTATUS_GOTOFINISH(inputSurfaceRT->GetSurfaceDesc(srcSurfaceWidth, srcSurfaceHeight, srcSurfaceFormat, srcSizePerPixel));
2019 
2020     if ((dstSurfaceWidth != srcSurfaceWidth) ||
2021         (dstSurfaceHeight < srcSurfaceHeight) ||  //relax the restriction
2022         (dstSizePerPixel != srcSizePerPixel))
2023     {
2024         CM_ASSERTMESSAGE("Error: Size of dest surface does not match src surface.");
2025         return CM_GPUCOPY_INVALID_SURFACES;
2026     }
2027 
2028     //To support copy b/w Format_A8R8G8B8 and Format_A8B8G8R8
2029     if (dstSurfaceFormat != srcSurfaceFormat)
2030     {
2031         if (!((dstSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8)) &&
2032             !((dstSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8)))
2033         {
2034             CM_ASSERTMESSAGE("Error: When src and dst formats differ, only copies between Format_A8R8G8B8 and Format_A8B8G8R8 are supported.");
2035             return CM_GPUCOPY_INVALID_SURFACES;
2036         }
2037     }
2038 
2039     // 128-byte aligned
2040     srcSurfAlignedWidthInBytes = (uint32_t)(ceil((double)srcSurfaceWidth*srcSizePerPixel / BLOCK_PIXEL_WIDTH / 4) * (BLOCK_PIXEL_WIDTH * 4));
2041 
2042     if (srcSurfaceHeight > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP)
2043     {
2044         CM_ASSERTMESSAGE("Error: Invalid copy size.");
2045         return CM_GPUCOPY_INVALID_SIZE;
2046     }
2047 
2048     CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(srcSurfaceWidth*srcSizePerPixel, srcSurfaceHeight, srcSurfaceFormat, CM_FASTCOPY_GPU2GPU, gpuCopyKernelParam));
2049     CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
2050 
2051     CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
2052     kernel = gpuCopyKernelParam->kernel;
2053 
2054     CM_CHK_CMSTATUS_GOTOFINISH(inputSurface->GetIndex(surfaceInputIndex));
2055     CM_CHK_CMSTATUS_GOTOFINISH(outputSurface->GetIndex(surfaceOutputIndex));
2056 
2057     threadWidth = srcSurfAlignedWidthInBytes / (BLOCK_PIXEL_WIDTH * 4);
2058     threadHeight = (uint32_t)ceil((double)srcSurfaceHeight / BLOCK_HEIGHT / INNER_LOOP);
2059 
2060     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight));
2061 
2062     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surfaceInputIndex));
2063     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), surfaceOutputIndex));
2064     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(uint32_t), &threadHeight));
2065 
2066     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
2067 
2068     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
2069     CM_CHK_NULL_GOTOFINISH_CMERROR(task);
2070     CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel(kernel));
2071 
2072     if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
2073     {
2074         // disable turbo
2075         CM_TASK_CONFIG taskConfig;
2076         CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
2077         taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
2078         task->SetProperty(taskConfig);
2079     }
2080 
2081     CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace));
2082     if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event))
2083     {
2084         CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished());
2085     }
2086 
2087 finish:
2088 
2089     if (kernel && gpuCopyKernelParam)        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
2090     if (threadSpace)                                m_device->DestroyThreadSpace(threadSpace);
2091     if (task)                              m_device->DestroyTask(task);
2092 
2093     return hr;
2094 }
2095 
2096 //*-----------------------------------------------------------------------------
2097 //! Enqueue a task, which contains one pre-defined kernel to copy from system memory to system memory
2098 //! This is a non-blocking call. i.e. it returns immediately without waiting for
2099 //! GPU to finish the execution of the task.
2100 //! A CmEvent is generated each time a task is enqueued. The CmEvent can be used to check if the task finishes.
2101 //! If the size is less than the amount copied per thread (4KB), the CPU is used to do the copy and event will be set to nullptr.
2102 //!
2103 //! INPUT:
2104 //!     1) Pointer to the system memory as copy destination
2105 //!     2) Pointer to the system memory as copy source
2106 //!     3) The size in bytes of the memory to be copied.
2107 //!     4) Option passed from user, blocking copy, non-blocking copy or disable turbo boost
2108 //!     5) Reference to the pointer to CmEvent
2109 //! OUTPUT:
2110 //!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
2111 //!     CM_OUT_OF_HOST_MEMORY if out of host memory;
2112 //!     CM_GPUCOPY_INVALID_SYSMEM if the sysMem is not 16-byte aligned or is NULL.
2113 //!     CM_GPUCOPY_OUT_OF_RESOURCE if runtime run out of BufferUP.
2114 //!     CM_GPUCOPY_INVALID_SIZE if its size plus the shift-left offset is larger than CM_MAX_1D_SURF_WIDTH.
2115 //! Restrictions:
2116 //!     1) dstSysMem and srcSysMem should be 16-byte aligned.
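//! Usage sketch (illustrative only; assumes both allocations are 16-byte
//! aligned):
//!     CmEvent *event = nullptr;
//!     queue->EnqueueCopyCPUToCPU(dst, src, 16 * 1024 * 1024, 0, event);
//!     if (event)  // copies below 4KB run on the CPU and leave event == nullptr
//!         event->WaitForTaskFinished();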
2117 //*-----------------------------------------------------------------------------
2118 CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToCPU( unsigned char* dstSysMem, unsigned char* srcSysMem, uint32_t size, uint32_t option, CmEvent* & event )
2119 {
2120     INSERT_API_CALL_LOG(GetHalState());
2121 
2122     if (!m_device->HasGpuCopyKernel())
2123     {
2124         return CM_NOT_IMPLEMENTED;
2125     }
2126 
2127     int hr = CM_SUCCESS;
2128     size_t inputLinearAddress  = (size_t )srcSysMem;
2129     size_t outputLinearAddress = (size_t )dstSysMem;
2130 
2131     size_t inputLinearAddressAligned = 0;
2132     size_t outputLinearAddressAligned = 0;
2133 
2134     CmBufferUP      *surfaceInput          = nullptr;
2135     CmBufferUP      *surfaceOutput         = nullptr;
2136     CmKernel        *kernel                = nullptr;
2137     SurfaceIndex    *surfaceInputIndex     = nullptr;
2138     SurfaceIndex    *surfaceOutputIndex    = nullptr;
2139     CmThreadSpace   *threadSpace           = nullptr;
2140     CmTask          *task                  = nullptr;
2141 
2142     int32_t         srcLeftShiftOffset      = 0;
2143     int32_t         dstLeftShiftOffset      = 0;
2144     uint32_t        threadWidth             = 0;
2145     uint32_t        threadHeight            = 0;
2146     uint32_t        threadNum              = 0;
2147     uint32_t        gpuMemcopySize        = 0;
2148     uint32_t        cpuMemcopySize        = 0;
2149     CM_GPUCOPY_KERNEL *gpuCopyKernelParam     = nullptr;
2150 
2151     if((inputLinearAddress & 0xf) || (outputLinearAddress & 0xf) ||
2152         (inputLinearAddress == 0) || (outputLinearAddress == 0))
2153     {
2154         CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
2155         return CM_GPUCOPY_INVALID_SYSMEM;
2156     }
2157 
2158     // Get page aligned address
2159     if (sizeof (void *) == 8 ) //64-bit
2160     {
2161         inputLinearAddressAligned  = inputLinearAddress  & ADDRESS_PAGE_ALIGNMENT_MASK_X64;  // make sure the address page aligned.
2162         outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;  // make sure the address page aligned.
2163     }
2164     else
2165     {
2166         inputLinearAddressAligned  = inputLinearAddress  & ADDRESS_PAGE_ALIGNMENT_MASK_X86;  // make sure the address page aligned.
2167         outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;  // make sure the address page aligned.
2168     }
2169 
2170     srcLeftShiftOffset = (int32_t)(inputLinearAddress  - inputLinearAddressAligned) ;
2171     dstLeftShiftOffset = (int32_t)(outputLinearAddress - outputLinearAddressAligned) ;
2172 
2173     if(((size + srcLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH)||
2174        ((size + dstLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH))
2175     {
2176         CM_ASSERTMESSAGE("Error: Invalid copy size.");
2177         return CM_GPUCOPY_INVALID_SIZE;
2178     }
2179 
2180     threadWidth  = 0;
2181     threadHeight = 0;
2182     threadNum = size / BYTE_COPY_ONE_THREAD; // each thread copies 32 x 4 x 32 bytes = 4K
2183 
2184     if( threadNum == 0)
2185     {
2186         //if the size of the data is less than the amount copied per thread (4K), use the CPU to copy it instead of the GPU.
2187         CmFastMemCopy((void *)(outputLinearAddress),
2188                       (void *)(inputLinearAddress),
2189                       size); //SSE copy used in CMRT.
2190 
2191         event = nullptr;
2192         return CM_SUCCESS;
2193     }
2194 
2195     //Calculate proper thread space's width and height
2196     threadWidth  = 1;
2197     threadHeight = threadNum/threadWidth;
2198     while((threadHeight > CM_MAX_THREADSPACE_HEIGHT_FOR_MW))
2199     {
2200         if(threadWidth > CM_MAX_THREADSPACE_WIDTH_FOR_MW)
2201         {
2202             hr = CM_GPUCOPY_INVALID_SIZE; // thread number exceeds 511*511
2203             goto finish;
2204         }
2205         else if (threadWidth == 1)
2206         {
2207             threadWidth  =  THREAD_SPACE_WIDTH_INCREMENT; // first iteration: start from 8
2208             threadHeight = threadNum/threadWidth;
2209         }
2210         else
2211         {
2212             threadWidth +=  THREAD_SPACE_WIDTH_INCREMENT; // increase by 8 per iteration
2213             threadHeight = threadNum/threadWidth;
2214         }
2215     }
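    // Worked example of the sizing loop (assuming both CM_MAX_THREADSPACE_*
    // limits are 511): a 16MB copy gives threadNum = 16MB / 4KB = 4096; with
    // width 1 the height 4096 exceeds 511, so the width grows 1 -> 8 -> 16,
    // where 4096 / 16 = 256 rows finally fits.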
2216 
2217     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + srcLeftShiftOffset, (void *)inputLinearAddressAligned,surfaceInput));
2218 
2219     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + dstLeftShiftOffset, (void *)outputLinearAddressAligned,surfaceOutput));
2220 
2221     CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(size, 0, CM_SURFACE_FORMAT_INVALID, CM_FASTCOPY_CPU2CPU, gpuCopyKernelParam));
2222     CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
2223     CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
2224     kernel = gpuCopyKernelParam->kernel;
2225 
2226     CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceInput);
2227     CM_CHK_CMSTATUS_GOTOFINISH(surfaceInput->GetIndex(surfaceInputIndex));
2228     CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceOutput);
2229     CM_CHK_CMSTATUS_GOTOFINISH(surfaceOutput->GetIndex(surfaceOutputIndex));
2230 
2231     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight));
2232     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surfaceInputIndex ));
2233     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surfaceOutputIndex ));
2234     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( int ), &threadWidth ));
2235     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( int ), &threadHeight ));
2236     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( int ), &srcLeftShiftOffset ));
2237     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( int ), &dstLeftShiftOffset ));
2238     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( int ), &size ));
2239 
2240     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
2241 
2242     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
2243     CM_CHK_NULL_GOTOFINISH_CMERROR(task);
2244     CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel (kernel));
2245 
2246     if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
2247     {
2248         // disable turbo
2249         CM_TASK_CONFIG taskConfig;
2250         CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
2251         taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
2252         task->SetProperty(taskConfig);
2253     }
2254 
2255     CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace));
2256 
2257     if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event))
2258     {
2259         CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished());
2260     }
2261 
2262     //Copy the unaligned part by using CPU
2263     gpuMemcopySize = threadHeight * threadWidth *BYTE_COPY_ONE_THREAD;
2264     cpuMemcopySize = size - threadHeight * threadWidth *BYTE_COPY_ONE_THREAD;
2265 
2266     CmFastMemCopy((void *)(outputLinearAddress+gpuMemcopySize),
2267                   (void *)(inputLinearAddress+gpuMemcopySize),
2268                           cpuMemcopySize); //SSE copy used in CMRT.
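    // Split example (illustrative numbers): for size = 18000 bytes the GPU ran
    // 4 threads and moved 4 * 4096 = 16384 bytes, so the CmFastMemCopy above
    // finishes the remaining 18000 - 16384 = 1616 bytes on the CPU.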
2269 
2270     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
2271     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(task));
2272     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceOutput));   // ref counting guarantees the task finishes before the BufferUP is actually destroyed.
2273     CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceInput));
2274 
2275     GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
2276 
2277 finish:
2278     if(hr != CM_SUCCESS)
2279     {   //Failed
2280         if( surfaceInput == nullptr || surfaceOutput == nullptr)
2281         {
2282             hr = CM_GPUCOPY_OUT_OF_RESOURCE; // The user needs to know whether the failure was caused by running out of BufferUP.
2283         }
2284         else
2285         {
2286             hr = CM_FAILURE;
2287         }
2288         if(surfaceInput)                      m_device->DestroyBufferUP(surfaceInput);
2289         if(surfaceOutput)                     m_device->DestroyBufferUP(surfaceOutput);
2290         if(kernel && gpuCopyKernelParam)        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
2291         if(threadSpace)                                m_device->DestroyThreadSpace(threadSpace);
2292         if(task)                              m_device->DestroyTask(task);
2293     }
2294 
2295     return hr;
2296 }
2297 
2298 
2299 //Worker thread for copying a video buffer to/from system memory.
2300 //Supports a wait event and provides a notification event.
2301 void BufferCopyThread(void* threadData)
2302 {
2303     int hr = CM_SUCCESS;
2304     CopyThreadData* data = (CopyThreadData*)threadData;
2305 
2306     CmBuffer_RT* buffer = (CmBuffer_RT*)(data->buffer);
2307     unsigned char* sysMem = (unsigned char*)data->sysMem;
2308     CmEvent* wait_event = (CmEvent*)(data->wait_event);
2309     CmEvent* notify_event = (CmEvent*)(data->event);
2310     CmEventRT* eventRT = dynamic_cast<CmEventRT*>(notify_event);
2311     CM_CHK_NULL_RETURN_VOID(eventRT);
2312     CmEventEx* eex = dynamic_cast<CmEventEx*>(notify_event);
2313 
2314     uint32_t offset = data->offset;
2315     uint64_t cpuMemCopySize = data->sysMemSize;
2316     uint64_t ts = 0, te = 0;
2317     MosUtilities::MosQueryPerformanceCounter(&ts);
2318     // CPU buffer copy call with wait event
2319     if(data->dir)
2320         hr = buffer->WriteBuffer(sysMem, wait_event, cpuMemCopySize, offset);
2321     else
2322         hr = buffer->ReadBuffer((unsigned char*)sysMem, wait_event, cpuMemCopySize, offset);
2323     MosUtilities::MosQueryPerformanceCounter(&te);
2324     uint64_t etime = (te - ts)*1000000000 / data->cpuFrrequency;
2325     eventRT->ModifyStatus(CM_STATUS_FINISHED, etime);
2326 
2327     MOS_Delete(data);
2328 }
2329 
2330 int32_t CmQueueRT::EnqueueBufferCopy(CmBuffer* buffer, size_t offset, const unsigned char* sysMem, uint64_t sysMemSize, CM_GPUCOPY_DIRECTION dir, CmEvent* wait_event, CmEvent*& event, unsigned option)
2331 {
2332     INSERT_API_CALL_LOG(GetHalState());
2333     int hr = CM_SUCCESS;
2334     bool bCPUcopy = (option > 0);
2335     if ((offset) || (sysMemSize > 1069551616))
2336         bCPUcopy = true;
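    // Force the CPU worker-thread path for copies with a nonzero offset or for
    // sizes above the threshold hard-coded in the check above.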
2337 
2338     MOS_THREADHANDLE workThread = 0;
2339     CmBufferUP* sysUPbuffer = nullptr;
2340     CmBufferUP* surfaceOutput = nullptr;
2341     CmKernel* kernel = nullptr;
2342     SurfaceIndex* vBufferIndex = nullptr;
2343     SurfaceIndex* sysUPIndex = nullptr;
2344     CmThreadSpace* threadSpace = nullptr;
2345     CmTask* task = nullptr;
2346     CM_GPUCOPY_KERNEL* gpuCopyKernelParam = nullptr;
2347 
2348     int32_t         sysLeftShiftOffset = 0;
2349     int32_t         dstLeftShiftOffset = 0;
2350     uint32_t        threadWidth = 0;
2351     uint32_t        threadHeight = 0;
2352     uint32_t        threadNum = 0;
2353     uint32_t        copySize = (uint32_t)sysMemSize;
2354     uint32_t        cpuMemcopySize = 0;
2355     size_t          systemLinearAddressAligned = 0;
2356 
2357     threadNum = copySize / BYTE_COPY_ONE_THREAD;
2358 
2359     int32_t taskDriverId = -1;
2360     CmEventRT* eventRT = static_cast<CmEventRT*>(event);
2361     hr = CreateEvent((CmTaskInternal *)task, true, taskDriverId, eventRT);
2362     event = static_cast<CmEvent*>(eventRT);
2363 
2364     if (((size_t)sysMem & 0xf) || (sysMem == 0))
2365     {
2366         CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
2367         bCPUcopy = true;
2368     }
2369 
2370     // Get page aligned address
2371     if (sizeof(void*) == 8) //64-bit
2372     {
2373         systemLinearAddressAligned = (size_t)sysMem & ADDRESS_PAGE_ALIGNMENT_MASK_X64;  // make sure the address page aligned.
2374     }
2375     else
2376     {
2377         systemLinearAddressAligned = (size_t)sysMem & ADDRESS_PAGE_ALIGNMENT_MASK_X86;  // make sure the address page aligned.
2378     }
2379 
2380     sysLeftShiftOffset = (int32_t)((size_t)sysMem - systemLinearAddressAligned);
2381 
2382 
2383     if ((sysMemSize + sysLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH)
2384     {
2385         CM_ASSERTMESSAGE("Error: Invalid copy size.");
2386         return CM_GPUCOPY_INVALID_SIZE;
2387     }
2388 
2389     if (!m_device->HasGpuCopyKernel())
2390     {
2391         //return CM_NOT_IMPLEMENTED;
2392         bCPUcopy = true;
2393     }
2394 
2395     if (sysMem == nullptr)
2396     {
2397         CM_ASSERTMESSAGE("Error: Pointer to system memory is null.");
2398         return CM_NULL_POINTER;
2399     }
2400 
2401     threadWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW;
2402     threadHeight = (threadNum + threadWidth - 1) / threadWidth;
2403     while (threadHeight > (CM_MAX_THREADSPACE_HEIGHT_SKLUP_FOR_MW >>1))
2404     {
2405         threadWidth++; // widen the thread space until the height fits
2406         threadHeight = (threadNum + threadWidth - 1) / threadWidth;
2407 
2408         if (threadWidth > (CM_MAX_THREADSPACE_WIDTH_SKLUP_FOR_MW >>1))
2409         {
2410             hr = CM_GPUCOPY_INVALID_SIZE; // thread number exceeds 1023*1023
2411             goto finish;
2412         }
2413     }
2414 
2415     if (bCPUcopy)
2416     {
2417         void* data = MOS_New(CopyThreadData);  // allocated with the MOS utility; freed by BufferCopyThread via MOS_Delete
2418 
2419         ((CopyThreadData*)data)->buffer = dynamic_cast<CmBuffer_RT*>(buffer);
2420         ((CopyThreadData*)data)->offset = offset;
2421         ((CopyThreadData*)data)->sysMem = (unsigned char*)sysMem;
2422         ((CopyThreadData*)data)->sysMemSize = sysMemSize;
2423         ((CopyThreadData*)data)->dir = dir;
2424         ((CopyThreadData*)data)->wait_event = wait_event;
2425         ((CopyThreadData*)data)->event = event;
2426         ((CopyThreadData*)data)->option = option;
2427         ((CopyThreadData*)data)->pCmQueueRT = this;
2428         ((CopyThreadData*)data)->cpuFrrequency = m_CPUperformanceFrequency;
2429 
2430         workThread = MosUtilities::MosCreateThread((void*)BufferCopyThread, data);
2431         if (workThread)
2432             hr = CM_SUCCESS;
2433         else
2434             hr = CM_INVALID_MOS_RESOURCE_HANDLE;
2435     }
2436     else
2437     {
2438         CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP((int)sysMemSize + sysLeftShiftOffset, (void*)systemLinearAddressAligned, sysUPbuffer));
2439 
2440         CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel((int)sysMemSize, 0, CM_SURFACE_FORMAT_INVALID, CM_FASTCOPY_CPU2CPU, gpuCopyKernelParam));
2441         CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
2442         CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
2443         kernel = gpuCopyKernelParam->kernel;
2444 
2445         CM_CHK_NULL_GOTOFINISH_CMERROR(buffer);
2446         CM_CHK_CMSTATUS_GOTOFINISH(buffer->GetIndex(vBufferIndex));
2447         CM_CHK_NULL_GOTOFINISH_CMERROR(sysUPbuffer);
2448         CM_CHK_CMSTATUS_GOTOFINISH(sysUPbuffer->GetIndex(sysUPIndex));
2449         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadNum));
2450 
2451         if (dir)
2452         {
2453             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), sysUPIndex));
2454             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), vBufferIndex));
2455         }
2456         else
2457         {
2458             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), vBufferIndex));
2459             CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), sysUPIndex));
2460         }
2461 
2462         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(int), &threadWidth));
2463         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(int), &threadHeight));
2464         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(int), &offset));
2465         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(int), &sysLeftShiftOffset));
2466         CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(int), &copySize));
2467 
2468         CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
2469 
2470         CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
2471         CM_CHK_NULL_GOTOFINISH_CMERROR(task);
2472         CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel(kernel));
2473 
2474         CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace));
2475     }
2476 
2477 finish:
2478     if (hr != CM_SUCCESS)
2479     {   //Failed
2480         if (sysUPbuffer == nullptr || buffer == nullptr)
2481         {
2482             hr = CM_GPUCOPY_OUT_OF_RESOURCE; // the user needs to know whether the failure was caused by running out of BufferUP resources.
2483         }
2484         else
2485         {
2486             hr = CM_FAILURE;
2487         }
2488     }
2489 
2490     if (sysUPbuffer)                      m_device->DestroyBufferUP(sysUPbuffer);
2491     if (kernel && gpuCopyKernelParam)     GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
2492     if (threadSpace)                      m_device->DestroyThreadSpace(threadSpace);
2493     if (task)                             m_device->DestroyTask(task);
2494 
2495     return hr;
2496 }
2497 
2498 
2499 
2500 
2501 //*----------------------------------------------------------------------------------------
2502 //| Purpose:    Pop a task from the flushed queue, update surface state, and destroy the task
2503 //| Notes:
2504 //*----------------------------------------------------------------------------------------
PopTaskFromFlushedQueue()2505 void CmQueueRT::PopTaskFromFlushedQueue()
2506 {
2507     CmTaskInternal* topTask = (CmTaskInternal*)m_flushedTasks.Pop();
2508 
2509     if ( topTask != nullptr )
2510     {
2511         CmEventRT *event = nullptr;
2512         topTask->GetTaskEvent( event );
2513         if ( event != nullptr )
2514         {
2515             LARGE_INTEGER nTime;
2516             if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nTime.QuadPart )) )
2517             {
2518                 CM_ASSERTMESSAGE("Error: Query performace counter failure.");
2519             }
2520             else
2521             {
2522                 event->SetCompleteTime( nTime );
2523             }
2524         }
2525 
2526         CmTaskInternal::Destroy( topTask );
2527     }
2528     return;
2529 }
2530 
TouchFlushedTasks()2531 int32_t CmQueueRT::TouchFlushedTasks( )
2532 {
2533     int32_t hr = CM_SUCCESS;
2534 
2535     if (m_flushedTasks.IsEmpty())
2536     {
2537         if (!m_enqueuedTasks.IsEmpty())
2538         {
2539             // if FlushedQueue is empty and EnqueuedQueue is not empty
2540             // try flush task to FlushedQueue
2541             hr = FlushTaskWithoutSync();
2542             if (FAILED(hr))
2543             {
2544                 return hr;
2545             }
2546         }
2547         else
2548         {   // no task in flushedQueue and EnqueuedQueue, just skip
2549             return CM_SUCCESS;
2550         }
2551     }
2552 
2553     // Flush FlushedQueue
2554     hr = QueryFlushedTasks();
2555 
2556     return hr;
2557 }
2558 
2559 //*-----------------------------------------------------------------------------
2560 //! Query the status of the tasks in the flushed queue. Tasks that have
2561 //! finished are popped from the queue and destroyed; since the queue is
2562 //! in-order, the query stops at the first unfinished task.
2563 //! This is a non-blocking call, i.e. it returns immediately without
2564 //! waiting for the GPU to finish the execution of tasks.
2565 //! INPUT:
2566 //! OUTPUT:
2567 //!     CM_SUCCESS if the query succeeds.
2568 //!     CM_FAILURE otherwise.
2569 //!
2570 //*-----------------------------------------------------------------------------
QueryFlushedTasks()2571 int32_t CmQueueRT::QueryFlushedTasks()
2572 {
2573     int32_t hr   = CM_SUCCESS;
2574 
2575     m_criticalSectionFlushedTask.Acquire();
2576     while( !m_flushedTasks.IsEmpty() )
2577     {
2578         CmTaskInternal* task = (CmTaskInternal*)m_flushedTasks.Top();
2579         CM_CHK_NULL_GOTOFINISH_CMERROR(task);
2580 
2581         CM_STATUS status = CM_STATUS_FLUSHED;
2582         task->GetTaskStatus(status);
2583         if( status == CM_STATUS_FINISHED )
2584         {
2585             PopTaskFromFlushedQueue();
2586         }
2587         else
2588         {
2589             // media reset
2590             if (status == CM_STATUS_RESET)
2591             {
2592                 PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
2593 
2594                 // Clear task status table in Cm Hal State
2595                 int32_t taskId = 0;
2596                 CmEventRT *pTopTaskEvent = nullptr;
2597                 task->GetTaskEvent(pTopTaskEvent);
2598                 CM_CHK_NULL_GOTOFINISH_CMERROR(pTopTaskEvent);
2599 
2600                 pTopTaskEvent->GetTaskDriverId(taskId);
2601                 cmData->cmHalState->taskStatusTable[taskId] = CM_INVALID_INDEX;
2602 
2603                 //Pop task and Destroy it
2604                 PopTaskFromFlushedQueue();
2605             }
2606 
2607             // It is an in-order queue; if this one hasn't finished,
2608             // the following ones haven't finished either.
2609             break;
2610         }
2611     }
2612 
2613 finish:
2614     m_criticalSectionFlushedTask.Release();
2615 
2616     return hr;
2617 }
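
// In-order behavior of QueryFlushedTasks (illustrative): if the flushed
// queue holds [T0 finished, T1 finished, T2 running, T3 finished], one call
// pops and destroys T0 and T1, then breaks at T2; T3 stays queued even
// though it has finished, because the queue is drained strictly in order.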
2618 
2619 //*-----------------------------------------------------------------------------
2620 //! Destroy an event created by this queue and remove it from the
2621 //! event array.
2622 //! The event may outlive this call if its reference count is not zero,
2623 //! but the application's pointer is always reset to nullptr.
2624 //! INPUT:  event -- Reference to the pointer to the CmEvent to destroy.
2625 //! OUTPUT:
2626 //!     CM_SUCCESS if the event is destroyed or its reference released.
2627 //!     CM_FAILURE otherwise.
2628 //*-----------------------------------------------------------------------------
DestroyEvent(CmEvent * & event)2629 CM_RT_API int32_t CmQueueRT::DestroyEvent( CmEvent* & event )
2630 {
2631 
2632     CLock Lock(m_criticalSectionEvent);
2633 
2634     if (event == nullptr)
2635     {
2636         return CM_FAILURE;
2637     }
2638 
2639     uint32_t index = 0;
2640 
2641     CmEventRT *eventRT = dynamic_cast<CmEventRT *>(event);
2642     if (eventRT == nullptr)
2643     {
2644         return DestroyEventFast(event);
2645     }
2646     eventRT->GetIndex(index);
2647     CM_ASSERT( m_eventArray.GetElement( index ) == eventRT );
2648 
2649     int32_t status = CmEventRT::Destroy( eventRT );
2650     if( status == CM_SUCCESS && eventRT == nullptr)
2651     {
2652         m_eventArray.SetElement(index, nullptr);
2653     }
2654 
2655     // Return nullptr to the application even if the event is not destroyed
2656     // because its reference count is not zero.
2657     event = nullptr;
2658 
2659     return status;
2660 }
2661 
2662 //*-----------------------------------------------------------------------------
2663 //| Purpose:    Clean the Queue if its tasks time out
2664 //| Returns:    Result of the operation.
2665 //*-----------------------------------------------------------------------------
CleanQueue()2666 int32_t CmQueueRT::CleanQueue( )
2667 {
2668 
2669     int32_t status = CM_SUCCESS;
2670 
2671     // Called by ~CmDevice only. Still necessary because it flushes,
2672     // in blocking mode, any tasks that have not yet been sent to
2673     // the driver.
2674     if( !m_enqueuedTasks.IsEmpty() )
2675     {
2676         // If there are tasks not yet flushed (i.e. not sent to the driver),
2677         // wait until all such tasks are flushed
2678         FlushTaskWithoutSync( true );
2679     }
2680     CM_ASSERT( m_enqueuedTasks.IsEmpty() );
2681 
2682     //Used for timeout detection
2683     LARGE_INTEGER freq;
2684     MosUtilities::MosQueryPerformanceFrequency((uint64_t *)&freq.QuadPart);
2685     LARGE_INTEGER start;
2686     MosUtilities::MosQueryPerformanceCounter((uint64_t*)&start.QuadPart);
2687     int64_t timeout = start.QuadPart + (CM_MAX_TIMEOUT * freq.QuadPart * m_flushedTasks.GetCount()); // absolute deadline in performance-counter ticks: CM_MAX_TIMEOUT seconds per flushed task
2688 
2689     while( !m_flushedTasks.IsEmpty() && status != CM_EXCEED_MAX_TIMEOUT )
2690     {
2691         QueryFlushedTasks();
2692 
2693         LARGE_INTEGER current;
2694         MosUtilities::MosQueryPerformanceCounter((uint64_t*)&current.QuadPart);
2695         if( current.QuadPart > timeout )
2696             status = CM_EXCEED_MAX_TIMEOUT;
2697     }
2698 
2699     return status;
2700 }
2701 
GetQueueOption()2702 CM_QUEUE_CREATE_OPTION &CmQueueRT::GetQueueOption()
2703 {
2704     return m_queueOption;
2705 }
2706 
2707 //*-----------------------------------------------------------------------------
2708 //| Purpose:    Get the count of tasks in the queue
2709 //| Returns:    Result of the operation.
2710 //*-----------------------------------------------------------------------------
GetTaskCount(uint32_t & numTasks)2711 int32_t CmQueueRT::GetTaskCount( uint32_t& numTasks )
2712 {
2713     numTasks = m_enqueuedTasks.GetCount() + m_flushedTasks.GetCount();
2714     return CM_SUCCESS;
2715 }
2716 
2717 //*-----------------------------------------------------------------------------
2718 //| Purpose:   Use GPU to init Surface2D
2719 //| Returns:   result of operation
2720 //*-----------------------------------------------------------------------------
EnqueueInitSurface2D(CmSurface2D * surf2D,const uint32_t initValue,CmEvent * & event)2721 CM_RT_API int32_t CmQueueRT::EnqueueInitSurface2D( CmSurface2D* surf2D, const uint32_t initValue, CmEvent* &event)
2722 {
2723     INSERT_API_CALL_LOG(GetHalState());
2724 
2725     if (!m_device->HasGpuInitKernel())
2726     {
2727         return CM_NOT_IMPLEMENTED;
2728     }
2729 
2730     int32_t         hr                      = CM_SUCCESS;
2731     uint32_t        width                   = 0;
2732     uint32_t        height                  = 0;
2733     uint32_t        sizePerPixel            = 0;
2734     CmProgram       *gpuInitKernelProgram  = nullptr;
2735     CmKernel        *kernel                = nullptr;
2736     SurfaceIndex    *outputIndexCM         = nullptr;
2737     CmThreadSpace   *threadSpace           = nullptr;
2738     CmTask          *gpuCopyTask           = nullptr;
2739     uint32_t        threadWidth             = 0;
2740     uint32_t        threadHeight            = 0;
2741     uint32_t        threadNum               = 0;
2742     CmSurfaceManager* surfaceMgr           = nullptr;
2743     CM_SURFACE_FORMAT      format           = CM_SURFACE_FORMAT_INVALID;
2744 
2745     if(!surf2D)
2746     {
2747         CM_ASSERTMESSAGE("Error: Pointer to surface 2d is null.");
2748         return CM_FAILURE;
2749     }
2750     CmSurface2DRT *surf2DRT = static_cast<CmSurface2DRT *>(surf2D);
2751 
2752     CM_CHK_CMSTATUS_GOTOFINISH(m_device->LoadPredefinedInitKernel(gpuInitKernelProgram));
2753 
2754     CM_CHK_CMSTATUS_GOTOFINISH(surf2DRT->GetSurfaceDesc(width, height, format,sizePerPixel));
2755 
2756     m_device->GetSurfaceManager(surfaceMgr);
2757     CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceMgr);
2758 
2759     if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
2760     {
2761         CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set_NV12 ), kernel, "PredefinedGPUCopyKernel"));
2762     }
2763     else
2764     {
2765         CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set ), kernel, "PredefinedGPUCopyKernel" ));
2766     }
2767     CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);
2768     CM_CHK_CMSTATUS_GOTOFINISH(surf2D->GetIndex( outputIndexCM ));
2769 
2770     threadWidth = ( uint32_t )ceil( ( double )width*sizePerPixel/BLOCK_PIXEL_WIDTH/4 );
2771     threadHeight = ( uint32_t )ceil( ( double )height/BLOCK_HEIGHT );
2772     threadNum = threadWidth * threadHeight;
2773     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
2774 
2775     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
2776     CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpace);
2777 
2778     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( uint32_t ), &initValue ));
2779     CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), outputIndexCM ));
2780 
2781     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
2782     CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyTask);
2783 
2784     CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
2785 
2786     CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, event, threadSpace));
2787 
2788 finish:
2789 
2790     if (kernel)        m_device->DestroyKernel( kernel );
2791     if (gpuCopyTask)   m_device->DestroyTask(gpuCopyTask);
2792     if (threadSpace)   m_device->DestroyThreadSpace(threadSpace);
2793 
2794     return hr;
2795 }
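
// Illustrative usage of EnqueueInitSurface2D (sketch only; assumes a
// CmDevice *device and a CmQueue *queue created elsewhere, and omits
// error checking):
//
//     CmSurface2D *surf = nullptr;
//     device->CreateSurface2D(1920, 1080, CM_SURFACE_FORMAT_A8R8G8B8, surf);
//     CmEvent *event = nullptr;
//     queue->EnqueueInitSurface2D(surf, 0x80808080, event); // fill pattern
//     event->WaitForTaskFinished();                         // block until done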
2796 
2797 //*-----------------------------------------------------------------------------
2798 //! Flush a general task to the HAL CM layer for execution.
2799 //! This is a non-blocking call, i.e. it returns immediately without waiting for
2800 //! GPU to finish the execution of tasks.
2801 //! INPUT: task -- Pointer to CmTaskInternal object
2802 //! OUTPUT:
2803 //!     CM_SUCCESS if all tasks in the queue are submitted
2804 //!     CM_FAILURE otherwise.
2805 //*-----------------------------------------------------------------------------
FlushGeneralTask(CmTaskInternal * task)2806 int32_t CmQueueRT::FlushGeneralTask(CmTaskInternal* task)
2807 {
2808     CM_RETURN_CODE          hr              = CM_SUCCESS;
2809     CM_HAL_EXEC_TASK_PARAM  param;
2810     PCM_HAL_KERNEL_PARAM    kernelParam    = nullptr;
2811     CmKernelData*           kernelData     = nullptr;
2812     uint32_t                kernelDataSize  = 0;
2813     PCM_CONTEXT_DATA        cmData         = nullptr;
2814     CmEventRT*              event          = nullptr;
2815     uint32_t                totalThreadCount= 0;
2816     uint32_t                count           = 0;
2817     PCM_HAL_KERNEL_PARAM    tempData       = nullptr;
2818     uint32_t                maxTSWidth      = 0;
2819     bool                    hasThreadArg    = false;
2820 
2821     CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_TASK_PARAM ) );
2822 
2823     //GT-PIN
2824     if(m_device->CheckGTPinEnabled())
2825     {
2826         CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surfEntryInfoArrays));
2827     }
2828 
2829     task->GetKernelCount( count );
2830     param.numKernels = count;
2831 
2832     param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM,count);
2833     param.kernelSizes = MOS_NewArray(uint32_t,count);
2834     param.kernelCurbeOffset = MOS_NewArray(uint32_t,count);
2835     param.queueOption = m_queueOption;
2836 
2837     CM_CHK_NULL_GOTOFINISH(param.kernels, CM_OUT_OF_HOST_MEMORY);
2838     CM_CHK_NULL_GOTOFINISH(param.kernelSizes, CM_OUT_OF_HOST_MEMORY);
2839     CM_CHK_NULL_GOTOFINISH(param.kernelCurbeOffset, CM_OUT_OF_HOST_MEMORY);
2840 
2841     for( uint32_t i = 0; i < count; i ++ )
2842     {
2843         task->GetKernelData( i, kernelData );
2844         CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
2845 
2846         kernelParam = kernelData->GetHalCmKernelData();
2847         CM_CHK_NULL_GOTOFINISH_CMERROR(kernelParam);
2848 
2849         hasThreadArg |= kernelParam->perThreadArgExisted;
2850 
2851         task->GetKernelDataSize( i, kernelDataSize );
2852         if(kernelDataSize == 0)
2853         {
2854             CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
2855             hr = CM_FAILURE;
2856             goto finish;
2857         }
2858 
2859         tempData = kernelData->GetHalCmKernelData();
2860 
2861         param.kernels[ i ]             = tempData;
2862         param.kernelSizes[ i ]        = kernelDataSize;
2863         param.kernelCurbeOffset[ i ]  = task->GetKernelCurbeOffset(i);
2864         param.globalSurfaceUsed       |= tempData->globalSurfaceUsed;
2865         param.kernelDebugEnabled      |= tempData->kernelDebugEnabled;
2866     }
2867 
2868     /*
2869     * Preset the default TS width/height/dependency:
2870     *     TS width   = MOS_MIN(CM_MAX_THREADSPACE_WIDTH, totalThreadCount)
2871     *     TS height  = totalThreadCount/CM_MAX_THREADSPACE_WIDTH + 1
2872     *     dependency = CM_NONE_DEPENDENCY
2873     * If threadSpace is nullptr, these defaults are passed to the driver;
2874     * if it is valid, they are updated according to the user's thread space.
2875     */
2876     task->GetTotalThreadCount(totalThreadCount);
2877 
2878     if (hasThreadArg)
2879     {
2880         maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW + 1; // 512 allowed for media object
2881     }
2882     else
2883     {
2884         maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW; // 511 for media walker
2885     }
2886 
2887     param.threadSpaceWidth = (totalThreadCount > maxTSWidth) ? maxTSWidth : totalThreadCount;
2888     if(totalThreadCount%maxTSWidth)
2889     {
2890         param.threadSpaceHeight = totalThreadCount/maxTSWidth + 1;
2891     }
2892     else
2893     {
2894         param.threadSpaceHeight = totalThreadCount/maxTSWidth;
2895     }
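
    // Worked example of the preset above (assumed numbers): with
    // totalThreadCount = 1000 and maxTSWidth = 511, threadSpaceWidth = 511
    // and, since 1000 % 511 != 0, threadSpaceHeight = 1000 / 511 + 1 = 2.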
2896 
2897     param.dependencyPattern = CM_NONE_DEPENDENCY;
2898 
2899     if (task->IsThreadSpaceCreated()) //scoreboard data preparation
2900     {
2901         if(task->IsThreadCoordinatesExisted())
2902         {
2903             param.threadCoordinates = MOS_NewArray(PCM_HAL_SCOREBOARD, count);
2904             param.dependencyMasks = MOS_NewArray(PCM_HAL_MASK_AND_RESET, count);
2905 
2906             CM_CHK_NULL_GOTOFINISH(param.threadCoordinates, CM_OUT_OF_HOST_MEMORY);
2907             CM_CHK_NULL_GOTOFINISH(param.dependencyMasks, CM_OUT_OF_HOST_MEMORY);
2908             for(uint32_t i=0; i<count; i++)
2909             {
2910                 void *kernelCoordinates = nullptr;
2911                 void *dependencyMasks = nullptr;
2912                 task->GetKernelCoordinates(i, kernelCoordinates);
2913                 task->GetKernelDependencyMasks(i, dependencyMasks);
2914                 param.threadCoordinates[i] = (PCM_HAL_SCOREBOARD)kernelCoordinates;
2915                 param.dependencyMasks[i] = (PCM_HAL_MASK_AND_RESET)dependencyMasks;
2916             }
2917         }
2918         else
2919         {
2920             param.threadCoordinates = nullptr;
2921         }
2922 
2923         task->GetDependencyPattern(param.dependencyPattern);
2924 
2925         task->GetThreadSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight);
2926 
2927         task->GetWalkingPattern(param.walkingPattern);
2928 
2929         if( task->CheckWalkingParametersSet( ) )
2930         {
2931             param.walkingParamsValid = 1;
2932             CM_CHK_CMSTATUS_GOTOFINISH(task->GetWalkingParameters(param.walkingParams));
2933         }
2934         else
2935         {
2936             param.walkingParamsValid = 0;
2937         }
2938 
2939         if( task->CheckDependencyVectorsSet( ) )
2940         {
2941             param.dependencyVectorsValid = 1;
2942             CM_CHK_CMSTATUS_GOTOFINISH(task->GetDependencyVectors(param.dependencyVectors));
2943         }
2944         else
2945         {
2946             param.dependencyVectorsValid = 0;
2947         }
2948     }
2949     if (param.threadSpaceWidth == 0)
2950     {
2951         CM_ASSERTMESSAGE("Error: Invalid thread space.");
2952         hr = CM_INVALID_THREAD_SPACE;
2953         goto finish;
2954     }
2955     task->GetColorCountMinusOne(param.colorCountMinusOne);
2956     task->GetMediaWalkerGroupSelect(param.mediaWalkerGroupSelect);
2957 
2958     param.syncBitmap = task->GetSyncBitmap();
2959     param.conditionalEndBitmap = task->GetConditionalEndBitmap();
2960     param.userDefinedMediaState = task->GetMediaStatePtr();
2961     CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo));
2962 
2963     CM_TASK_CONFIG taskConfig;
2964     task->GetProperty(taskConfig);
2965     CmSafeMemCopy(&param.taskConfig, &taskConfig, sizeof(param.taskConfig));
2966     cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
2967 
2968     CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption()));
2969 
2970     CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
2971         ExecuteGeneralTask(cmData->cmHalState,
2972                            &param,
2973                            static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext)));
2974 
2975     if( param.taskIdOut < 0 )
2976     {
2977         CM_ASSERTMESSAGE("Error: Invalid task ID.");
2978         hr = CM_FAILURE;
2979         goto finish;
2980     }
2981 
2982     TASK_LOG(task);
2983 
2984     task->GetTaskEvent( event );
2985     CM_CHK_NULL_GOTOFINISH_CMERROR(event);
2986     CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
2987     CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
2988     CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());
2989 
2990     //GT-PIN
2991     if(m_device->CheckGTPinEnabled())
2992     {
2993         // No need to clear surfEntryInfoArrays here; it will be destroyed by CmTaskInternal
2994         CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surfEntryInfoArrays));
2995     }
2996 
2997 finish:
2998     MosSafeDeleteArray( param.kernels );
2999     MosSafeDeleteArray( param.kernelSizes );
3000     MosSafeDeleteArray( param.threadCoordinates);
3001     MosSafeDeleteArray( param.dependencyMasks);
3002     MosSafeDeleteArray( param.kernelCurbeOffset);
3003 
3004     return hr;
3005 }
3006 
3007 //*-----------------------------------------------------------------------------
3008 //! Flush a thread group based task to HAL CM layer for execution.
3009 //! This is a non-blocking call, i.e. it returns immediately without waiting for
3010 //! GPU to finish the execution of tasks.
3011 //! INPUT: task -- Pointer to CmTaskInternal object
3012 //! OUTPUT:
3013 //!     CM_SUCCESS if all tasks in the queue are submitted
3014 //!     CM_FAILURE otherwise.
3015 //*-----------------------------------------------------------------------------
FlushGroupTask(CmTaskInternal * task)3016 int32_t CmQueueRT::FlushGroupTask(CmTaskInternal* task)
3017 {
3018     CM_RETURN_CODE  hr          = CM_SUCCESS;
3019 
3020     CM_HAL_EXEC_TASK_GROUP_PARAM param;
3021     CmKernelData* kernelData   = nullptr;
3022     uint32_t kernelDataSize        = 0;
3023     uint32_t count                  = 0;
3024     PCM_CONTEXT_DATA cmData    = nullptr;
3025     CmEventRT * event          = nullptr;
3026     PCM_HAL_KERNEL_PARAM tempData  = nullptr;
3027 
3028     CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_TASK_GROUP_PARAM ) );
3029 
3030     //GT-PIN
3031     if(this->m_device->CheckGTPinEnabled())
3032     {
3033         CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surEntryInfoArrays));
3034     }
3035 
3036     task->GetKernelCount( count );
3037     param.numKernels = count;
3038 
3039     param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
3040     param.kernelSizes = MOS_NewArray(uint32_t, count);
3041     param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
3042     param.queueOption = m_queueOption;
3043     param.mosVeHintParams = (m_usingVirtualEngine)? &m_mosVeHintParams: nullptr;
3044 
3045     CM_TASK_CONFIG taskConfig;
3046     task->GetProperty(taskConfig);
3047     CmSafeMemCopy(&param.taskConfig, &taskConfig, sizeof(param.taskConfig));
3048     CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
3049     CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
3050     CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);
3051 
3052     for( uint32_t i = 0; i < count; i ++ )
3053     {
3054         task->GetKernelData( i, kernelData );
3055         CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
3056 
3057         task->GetKernelDataSize( i, kernelDataSize );
3058         if( kernelDataSize == 0)
3059         {
3060             CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
3061             hr = CM_FAILURE;
3062             goto finish;
3063         }
3064 
3065         tempData = kernelData->GetHalCmKernelData( );
3066 
3067         param.kernels[ i ]             = tempData;
3068         param.kernelSizes[ i ]        = kernelDataSize;
3069         param.kernelCurbeOffset [ i ] = task->GetKernelCurbeOffset(i);
3070         param.globalSurfaceUsed        |= tempData->globalSurfaceUsed;
3071         param.kernelDebugEnabled       |= tempData->kernelDebugEnabled;
3072     }
3073 
3074     task->GetSLMSize(param.slmSize);
3075     if(param.slmSize > MAX_SLM_SIZE_PER_GROUP_IN_1K)
3076     {
3077         CM_ASSERTMESSAGE("Error: SLM size exceeds the maximum per group.");
3078         hr = CM_EXCEED_MAX_SLM_SIZE;
3079         goto finish;
3080     }
3081 
3082     if (task->IsThreadGroupSpaceCreated())//thread group size
3083     {
3084         task->GetThreadGroupSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight,
3085                                       param.threadSpaceDepth, param.groupSpaceWidth,
3086                                       param.groupSpaceHeight, param.groupSpaceDepth);
3087     }
3088 
3089     param.syncBitmap = task->GetSyncBitmap();
3090     param.conditionalEndBitmap = task->GetConditionalEndBitmap();
3091     param.userDefinedMediaState = task->GetMediaStatePtr();
3092     CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo));
3093     CmSafeMemCopy(param.krnExecCfg, task->GetKernelExecuteConfig(), sizeof(param.krnExecCfg));
3094 
3095     // Call HAL layer to execute pfnExecuteGroupTask
3096     cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
3097 
3098     CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnSetPowerOption( cmData->cmHalState, task->GetPowerOption() ) );
3099 
3100     CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
3101         ExecuteGroupTask(cmData->cmHalState,
3102                          &param,
3103                          static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext)));
3104 
3105     if( param.taskIdOut < 0 )
3106     {
3107         CM_ASSERTMESSAGE("Error: Invalid task ID.");
3108         hr = CM_FAILURE;
3109         goto finish;
3110     }
3111     TASK_LOG(task);
3112     task->GetTaskEvent( event );
3113     CM_CHK_NULL_GOTOFINISH_CMERROR( event );
3114     CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
3115     CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
3116     CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());
3117 
3118     //GT-PIN
3119     if(this->m_device->CheckGTPinEnabled())
3120     {
3121         CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surEntryInfoArrays));
3122     }
3123 
3124 finish:
3125     MosSafeDeleteArray( param.kernels );
3126     MosSafeDeleteArray( param.kernelSizes );
3127     MosSafeDeleteArray( param.kernelCurbeOffset);
3128 
3129     return hr;
3130 }
3131 
3132 //*-----------------------------------------------------------------------------
3133 //! Flush a VEBOX task to HAL CM layer for execution.
3134 //! This is a non-blocking call, i.e. it returns immediately without waiting for
3135 //! GPU to finish the execution of tasks.
3136 //! INPUT: task -- Pointer to CmTaskInternal object
3137 //! OUTPUT:
3138 //!     CM_SUCCESS if all tasks in the queue are submitted
3139 //!     CM_FAILURE otherwise.
3140 //*-----------------------------------------------------------------------------
FlushVeboxTask(CmTaskInternal * task)3141 int32_t CmQueueRT::FlushVeboxTask(CmTaskInternal* task)
3142 {
3143     CM_RETURN_CODE  hr          = CM_SUCCESS;
3144 
3145     CM_HAL_EXEC_VEBOX_TASK_PARAM param;
3146     PCM_CONTEXT_DATA cmData    = nullptr;
3147     CmEventRT * event          = nullptr;
3148     uint8_t *stateData           = nullptr;
3149     uint8_t *surfaceData         = nullptr;
3150     CmBuffer_RT * temp          = nullptr;
3151 
3152     uint32_t original_stream_index = 0;
3153 
3154     CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_VEBOX_TASK_PARAM ) );
3155     //Set VEBOX state data pointer and size
3156     //Set VEBOX surface data pointer and size
3157     CM_VEBOX_STATE cmVeboxState;
3158     CmBufferUP *veboxParamBuf = nullptr;
3159     CM_VEBOX_SURFACE_DATA cmVeboxSurfaceData;
3160     task->GetVeboxState(cmVeboxState);
3161     task->GetVeboxParam(veboxParamBuf);
3162     task->GetVeboxSurfaceData(cmVeboxSurfaceData);
3163     CM_CHK_NULL_GOTOFINISH_CMERROR(veboxParamBuf);
3164 
3165     temp = static_cast<CmBuffer_RT*>(veboxParamBuf);
3166     temp->GetHandle(param.veboxParamIndex);
3167 
3168     param.cmVeboxState = cmVeboxState;
3169     param.veboxParam = veboxParamBuf;
3170 
3171     param.veboxSurfaceData = cmVeboxSurfaceData;
3172 
3173     param.queueOption = m_queueOption;
3174 
3175     //Set VEBOX task id to -1
3176     param.taskIdOut = -1;
3177 
3178     cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
3179     original_stream_index = cmData->cmHalState->osInterface->streamIndex;
3180     cmData->cmHalState->pfnSetGpuContext(cmData->cmHalState, MOS_GPU_CONTEXT_VEBOX,
3181                                          original_stream_index, m_gpuContextHandle);
3182     RegisterSyncEvent();
3183 
3184     CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnExecuteVeboxTask( cmData->cmHalState, &param ) );
3185 
3186     if( param.taskIdOut < 0 )
3187     {
3188         CM_ASSERTMESSAGE("Error: Invalid task ID.");
3189         hr = CM_FAILURE;
3190         goto finish;
3191     }
3192 
3193     task->GetTaskEvent( event );
3194     CM_CHK_NULL_GOTOFINISH_CMERROR( event );
3195     CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
3196     CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
3197 
3198 finish:
3199     return hr;
3200 }
3201 
3202 //*-----------------------------------------------------------------------------
3203 //! Flush a task enqueued with hints to the HAL CM layer for execution.
3204 //! This is a non-blocking call, i.e. it returns immediately without
3205 //! waiting for the GPU to finish the execution of tasks.
3206 //! INPUT: task -- Pointer to CmTaskInternal object
3207 //! OUTPUT:
3208 //!     CM_SUCCESS if the task is submitted
3209 //!     CM_FAILURE otherwise.
3210 //!
3211 //*-----------------------------------------------------------------------------
FlushEnqueueWithHintsTask(CmTaskInternal * task)3212 int32_t CmQueueRT::FlushEnqueueWithHintsTask( CmTaskInternal* task )
3213 {
3214     CM_RETURN_CODE               hr             = CM_SUCCESS;
3215     CM_HAL_EXEC_HINTS_TASK_PARAM param;
3216     PCM_CONTEXT_DATA             cmData        = nullptr;
3217     CmKernelData*                kernelData    = nullptr;
3218     uint32_t                     kernelDataSize = 0;
3219     uint32_t                     count          = 0;
3220     CmEventRT                    *event        = nullptr;
3221     PCM_HAL_KERNEL_PARAM         tempData      = nullptr;
3222 
3223     uint32_t original_stream_index = 0;
3224 
3225     CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_HINTS_TASK_PARAM ) );
3226     task->GetKernelCount ( count );
3227     param.numKernels = count;
3228     param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
3229     param.kernelSizes = MOS_NewArray(uint32_t, count);
3230     param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
3231     param.queueOption = m_queueOption;
3232 
3233     CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
3234     CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
3235     CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);
3236 
3237     task->GetHints(param.hints);
3238     task->GetNumTasksGenerated(param.numTasksGenerated);
3239     task->GetLastTask(param.isLastTask);
3240 
3241     for( uint32_t i = 0; i < count; i ++ )
3242     {
3243         task->GetKernelData( i, kernelData );
3244         CM_CHK_NULL_GOTOFINISH_CMERROR( kernelData );
3245 
3246         task->GetKernelDataSize( i, kernelDataSize );
3247         if( kernelDataSize == 0 )
3248         {
3249             CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
3250             hr = CM_FAILURE;
3251             goto finish;
3252         }
3253 
3254         tempData = kernelData->GetHalCmKernelData();
3255 
3256         param.kernels[ i ]             = tempData;
3257         param.kernelSizes[ i ]         = kernelDataSize;
3258         param.kernelCurbeOffset[ i ]   = task->GetKernelCurbeOffset(i);
3259     }
3260 
3261     param.userDefinedMediaState = task->GetMediaStatePtr();
3262     cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
3263     CM_CHK_NULL_GOTOFINISH_CMERROR(cmData);
3264 
3265     CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption()));
3266 
3267     original_stream_index = cmData->cmHalState->osInterface->streamIndex;
3268     cmData->cmHalState->pfnSetGpuContext(
3269         cmData->cmHalState, static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext),
3270         original_stream_index, m_gpuContextHandle);
3271     RegisterSyncEvent();
3272 
3273     CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnExecuteHintsTask(cmData->cmHalState, &param));
3274 
3275     if( param.taskIdOut < 0 )
3276     {
3277         CM_ASSERTMESSAGE("Error: Invalid task ID.");
3278         hr = CM_FAILURE;
3279         goto finish;
3280     }
3281 
3282     TASK_LOG(task);
3283 
3284     task->GetTaskEvent( event );
3285     CM_CHK_NULL_GOTOFINISH_CMERROR( event );
3286     CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
3287     CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
3288     CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());
3289 
3290 finish:
3291 
3292     MosSafeDeleteArray( param.kernels );
3293     MosSafeDeleteArray( param.kernelSizes );
3294     MosSafeDeleteArray( param.kernelCurbeOffset );
3295 
3296     return hr;
3297 }
3298 
3299 //*-----------------------------------------------------------------------------
3300 //! Flush the queue, i.e. submit all tasks in the queue to execute according
3301 //! to their order in the queue. The queue will be empty after flush.
3302 //! This is a non-blocking call, i.e. it returns immediately without waiting for
3303 //! GPU to finish the execution of tasks.
3304 //! INPUT:
3305 //! OUTPUT:
3306 //!     CM_SUCCESS if all tasks in the queue are submitted
3307 //!     CM_FAILURE otherwise.
3308 //*-----------------------------------------------------------------------------
FlushTaskWithoutSync(bool flushBlocked)3309 int32_t CmQueueRT::FlushTaskWithoutSync( bool flushBlocked )
3310 {
3311     int32_t             hr          = CM_SUCCESS;
3312     CmTaskInternal*     task       = nullptr;
3313     uint32_t            taskType  = CM_TASK_TYPE_DEFAULT;
3314     uint32_t            freeSurfNum = 0;
3315     CmSurfaceManager*   surfaceMgr = nullptr;
3316     CSync*              surfaceLock = nullptr;
3317     PCM_CONTEXT_DATA    cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
3318     CmEventRT*          event = nullptr;
3319     int32_t             taskId = 0;
3320 
3321     m_criticalSectionHalExecute.Acquire(); // Enter HalCm Execute Protection
3322 
3323     while( !m_enqueuedTasks.IsEmpty() )
3324     {
3325         uint32_t flushedTaskCount = m_flushedTasks.GetCount();
3326         if ( flushBlocked )
3327         {
3328             while( flushedTaskCount >= m_halMaxValues->maxTasks )
3329             {
3330                 // If the task count in the flushed queue has reached the HW restriction,
3331                 // query the status of the flushed task queue and remove any finished tasks from it.
3332                 QueryFlushedTasks();
3333                 flushedTaskCount = m_flushedTasks.GetCount();
3334             }
3335         }
3336         else
3337         {
3338             if( flushedTaskCount >= m_halMaxValues->maxTasks )
3339             {
3340                 // If the task count in the flushed queue has reached the HW restriction,
3341                 // query the status of the flushed task queue and remove any finished tasks from it.
3342                 QueryFlushedTasks();
3343                 flushedTaskCount = m_flushedTasks.GetCount();
3344                 if( flushedTaskCount >= m_halMaxValues->maxTasks )
3345                 {
3346                     // If none of the flushed tasks has finished, we can't flush more tasks.
3347                     break;
3348                 }
3349             }
3350         }
3351 
3352         task = (CmTaskInternal*)m_enqueuedTasks.Pop();
3353         CM_CHK_NULL_GOTOFINISH_CMERROR( task );
3354 
3355         CmNotifierGroup *notifiers = m_device->GetNotifiers();
3356         if (notifiers != nullptr)
3357         {
3358             notifiers->NotifyTaskFlushed(m_device, task);
3359         }
3360 
3361         task->GetTaskType(taskType);
3362 
3363         switch(taskType)
3364         {
3365             case CM_INTERNAL_TASK_WITH_THREADSPACE:
3366                 hr = FlushGeneralTask(task);
3367                 break;
3368 
3369             case CM_INTERNAL_TASK_WITH_THREADGROUPSPACE:
3370                 hr = FlushGroupTask(task);
3371                 break;
3372 
3373             case CM_INTERNAL_TASK_VEBOX:
3374                 hr = FlushVeboxTask(task);
3375                 break;
3376 
3377             case CM_INTERNAL_TASK_ENQUEUEWITHHINTS:
3378                 hr = FlushEnqueueWithHintsTask(task);
3379                 break;
3380 
3381             default:    // by default, treat the task as a general task: CM_INTERNAL_TASK_WITH_THREADSPACE
3382                 hr = FlushGeneralTask(task);
3383                 break;
3384         }
3385 
3386         if(hr == CM_SUCCESS)
3387         {
3388             m_flushedTasks.Push( task );
3389             task->VtuneSetFlushTime(); // Record Flush Time
3390         }
3391         else
3392         {
3393             // Failed to flush, destroy the task.
3394             CmTaskInternal::Destroy( task );
3395         }
3396 
3397     } // loop for task
3398 
3399 #if MDF_SURFACE_CONTENT_DUMP
3400     if (cmData->cmHalState->dumpSurfaceContent && (task != nullptr))  // guard: task is nullptr if nothing was flushed above
3401     {
3402         task->GetTaskEvent(event);
3403         if (event != nullptr)
3404         {
3405             while (event->GetStatusWithoutFlush() != CM_STATUS_FINISHED)
3406             {
3407                 event->Query();
3408             }
3409             event->GetTaskDriverId(taskId);
3410         }
3411         task->SurfaceDump(taskId);
3412     }
3413 #endif
3414     QueryFlushedTasks();
3415 
3416 finish:
3417     m_criticalSectionHalExecute.Release();//Leave HalCm Execute Protection
3418 
3419     //Delayed destroy for resource
3420     m_device->GetSurfaceManager(surfaceMgr);
3421     if (!surfaceMgr)
3422     {
3423         CM_ASSERTMESSAGE("Error: Pointer to surface manager is null.");
3424         return CM_NULL_POINTER;
3425     }
3426 
3427     surfaceLock = m_device->GetSurfaceCreationLock();
3428     if (surfaceLock == nullptr)
3429     {
3430         CM_ASSERTMESSAGE("Error: Pointer to surface creation lock is null.");
3431         return CM_NULL_POINTER;
3432     }
3433     surfaceLock->Acquire();
3434     surfaceMgr->RefreshDelayDestroySurfaces(freeSurfNum);
3435     surfaceLock->Release();
3436 
3437     return hr;
3438 }
3439 
3440 //*-----------------------------------------------------------------------------
3441 //| Purpose:    Enqueue a Vebox Task
3442 //| Arguments :
3443 //|               vebox          [in]       Pointer to a CmVebox object
3444 //|               event          [in,out]   Reference to the pointer to Event
3445 //|
3446 //| Returns:    Result of the operation.
3447 //*-----------------------------------------------------------------------------
EnqueueVebox(CmVebox * vebox,CmEvent * & event)3448 CM_RT_API int32_t CmQueueRT::EnqueueVebox(CmVebox * vebox, CmEvent* & event)
3449 {
3450     INSERT_API_CALL_LOG(GetHalState());
3451 
3452     int32_t hr                  = CM_SUCCESS;
3453     CmTaskInternal* task   = nullptr;
3454     int32_t taskDriverId        = -1;
3455     bool isEventVisible    = (event != CM_NO_EVENT);
3456     CmEventRT *eventRT = static_cast<CmEventRT *>(event);
3457 
3458     //Check if the input is valid
3459     if ( vebox == nullptr )
3460     {
3461         CM_ASSERTMESSAGE("Error: Pointer to vebox is null.");
3462         return CM_NULL_POINTER;
3463     }
3464     CmVeboxRT *veboxRT = static_cast<CmVeboxRT *>(vebox);
3465     CM_CHK_CMSTATUS_GOTOFINISH(CmTaskInternal::Create(m_device,  veboxRT, task ));
3466 
3467     LARGE_INTEGER nEnqueueTime;
3468     if ( !(MosUtilities::MosQueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) )
3469     {
3470         CM_ASSERTMESSAGE("Error: Query Performance counter failure.");
3471         hr = CM_FAILURE;
3472         goto finish;
3473     }
3474 
3475     CM_CHK_CMSTATUS_GOTOFINISH(CreateEvent(task, isEventVisible, taskDriverId, eventRT));
3476 
3477     if ( eventRT != nullptr )
3478     {
3479         eventRT->SetEnqueueTime( nEnqueueTime );
3480     }
3481     event = eventRT;
3482 
3483     if (!m_enqueuedTasks.Push(task))
3484     {
3485         CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.")
3486         hr = CM_FAILURE;
3487         goto finish;
3488     }
3489 
3490     CM_CHK_CMSTATUS_GOTOFINISH(FlushTaskWithoutSync());
3491 
3492 finish:
3493     if (hr != CM_SUCCESS)
3494     {
3495         CmTaskInternal::Destroy(task);
3496     }
3497     return hr;
3498 }
3499 
3500 //*-----------------------------------------------------------------------------
3501 //| Purpose:   Create Event and Update event in m_eventArray
3502 //| Returns:   result of operation
3503 //*-----------------------------------------------------------------------------
CreateEvent(CmTaskInternal * task,bool isVisible,int32_t & taskDriverId,CmEventRT * & event)3504 int32_t CmQueueRT::CreateEvent(CmTaskInternal *task, bool isVisible, int32_t &taskDriverId, CmEventRT *&event )
3505 {
3506     int32_t hr = CM_SUCCESS;
3507 
3508     m_criticalSectionEvent.Acquire();
3509 
3510     uint32_t freeSlotInEventArray = m_eventArray.GetFirstFreeIndex();
3511 
3512     hr = CmEventRT::Create( freeSlotInEventArray, this, task, taskDriverId, m_device, isVisible, event );
3513 
3514     if (hr == CM_SUCCESS)
3515     {
3516         m_eventArray.SetElement( freeSlotInEventArray, event );
3517         m_eventCount ++;
3518 
3519         if (task)
3520             task->SetTaskEvent( event );
3521 
3522         if (!isVisible)
3523         {
3524             event = nullptr;
3525         }
3526     }
3527     else
3528     {
3529         CM_ASSERTMESSAGE("Error: Create Event failure.")
3530     }
3531 
3532     m_criticalSectionEvent.Release();
3533 
3534     return hr;
3535 }
3536 
3537 //*---------------------------------------------------------------------------------------------------------
3538 //| Name:       EnqueueCopyCPUToGPUFullStride()
3539 //| Purpose:    Copy data from system memory to video memory (surface)
3540 //| Arguments:
3541 //|             surface      [in]  Pointer to a CmSurface2D object as copy destination
3542 //|             sysMem       [in]  Pointer to a system memory as copy source
3543 //|             widthStride   [in]  Width stride in bytes for system memory (to calculate start of next line)
3544 //|             heightStride  [in]  Height stride in rows for system memory (to calculate start of next plane)
3545 //|             option        [in]  Option passed from user, blocking copy, non-blocking copy or disable turbo boost
3546 //|             event        [in,out]  Reference to the pointer to Event
3547 //| Returns:    Result of the operation.
3548 //|
3549 //| Restrictions & Notes:
3550 //|             1) sysMem must be 16-byte aligned.
3551 //|             2) Surface's width should be 16-byte aligned for best performance.
3552 //|             3) widthStride and heightStride are used to indicate the padding information in system memory
3553 //|                 widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes
3554 //|                 heightStride = height + padding_in_row
3555 //*---------------------------------------------------------------------------------------------------------
EnqueueCopyCPUToGPUFullStride(CmSurface2D * surface,const unsigned char * sysMem,const uint32_t widthStride,const uint32_t heightStride,const uint32_t option,CmEvent * & event)3556 CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPUFullStride( CmSurface2D* surface,
3557                                                      const unsigned char* sysMem,
3558                                                      const uint32_t widthStride,
3559                                                      const uint32_t heightStride,
3560                                                      const uint32_t option,
3561                                                      CmEvent* & event )
3562 {
3563     INSERT_API_CALL_LOG(GetHalState());
3564 
3565     if (!m_device->HasGpuCopyKernel())
3566     {
3567         return CM_NOT_IMPLEMENTED;
3568     }
3569 
3570     CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
3571     return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, widthStride, heightStride, CM_FASTCOPY_CPU2GPU, option, event);
3572 }
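
// Worked stride example for the restrictions above (illustrative numbers):
// a 100-pixel-wide ARGB surface (4 bytes per pixel) whose rows are padded
// out to a 128-byte pitch has
//     widthStride  = 100 * 4 + 28 = 128 bytes,
// and a 1080-row plane padded to 1088 rows has
//     heightStride = 1080 + 8 = 1088 rows.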
3573 
3574 //*---------------------------------------------------------------------------------------------------------
3575 //| Name:       EnqueueCopyGPUToCPUFullStride()
3576 //| Purpose:    Copy data from tiled video memory (surface) to linear system memory
3577 //| Arguments:
3578 //|             surface      [in]  Pointer to a CmSurface2D object as copy source
3579 //|             sysMem       [in]  Pointer to a system memory as copy destination
3580 //|             widthStride   [in]  Width stride in bytes for system memory (to calculate start of next line)
3581 //|             heightStride  [in]  Height stride in rows for system memory (to calculate start of next plane)
3582 //|             option        [in]  Option passed from user, blocking copy,non-blocking copy or disable turbo boost
3583 //|             event        [in,out]  Reference to the pointer to Event
3584 //| Returns:    Result of the operation.
3585 //|
3586 //| Restrictions & Notes:
3587 //|             1) sysMem must be 16-byte aligned.
3588 //|             2) Surface's width should be 16-byte aligned for best performance.
3589 //|             3) widthStride and heightStride are used to indicate the padding information in system memory
3590 //|                 widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes
3591 //|                 heightStride = height + padding_in_row
3592 //*---------------------------------------------------------------------------------------------------------
EnqueueCopyGPUToCPUFullStride(CmSurface2D * surface,unsigned char * sysMem,const uint32_t widthStride,const uint32_t heightStride,const uint32_t option,CmEvent * & event)3593 CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPUFullStride( CmSurface2D* surface,
3594                                                      unsigned char* sysMem,
3595                                                      const uint32_t widthStride,
3596                                                      const uint32_t heightStride,
3597                                                      const uint32_t option,
3598                                                      CmEvent* & event )
3599 {
3600     INSERT_API_CALL_LOG(GetHalState());
3601 
3602     if (!m_device->HasGpuCopyKernel())
3603     {
3604         return CM_NOT_IMPLEMENTED;
3605     }
3606 
3607     CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
3608     return EnqueueCopyInternal(surfaceRT, sysMem, widthStride, heightStride, CM_FASTCOPY_GPU2CPU, option, event);
3609 }
3610 
3611 //*---------------------------------------------------------------------------------------------------------
3612 //| Name:       CreateGPUCopyKernel()
3613 //| Purpose:    Create GPUCopy kernel; reuse the kernel if it has been created and is reusable
3614 //| Arguments:
3615 //|             widthInByte      [in]  surface's width in bytes
3616 //|             height           [in]  surface's height
3617 //|             format           [in]  surface's format
3618 //|             copyDirection    [in]  copy direction, cpu -> gpu or gpu -> cpu
3619 //|             gpuCopyKernelParam [out] kernel param
3620 //|
3621 //| Returns:    Result of the operation.
3622 //|
3623 //*---------------------------------------------------------------------------------------------------------
CreateGPUCopyKernel(uint32_t widthInByte,uint32_t height,CM_SURFACE_FORMAT format,CM_GPUCOPY_DIRECTION copyDirection,CM_GPUCOPY_KERNEL * & gpuCopyKernelParam)3624 int32_t CmQueueRT::CreateGPUCopyKernel(uint32_t widthInByte,
3625                                        uint32_t height,
3626                                        CM_SURFACE_FORMAT format,
3627                                        CM_GPUCOPY_DIRECTION copyDirection,
3628                                        CM_GPUCOPY_KERNEL* &gpuCopyKernelParam)
3629 {
3630     int32_t     hr                 = CM_SUCCESS;
3631 
3632     //Search existing kernel
3633     CM_CHK_CMSTATUS_GOTOFINISH(SearchGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam));
3634 
3635     if(gpuCopyKernelParam != nullptr)
3636     { // reuse
3637         GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);
3638     }
3639     else
3640     {
3641         gpuCopyKernelParam   = new (std::nothrow) CM_GPUCOPY_KERNEL ;
3642         CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
3643         CmSafeMemSet(gpuCopyKernelParam, 0, sizeof(CM_GPUCOPY_KERNEL));
3644 
3645         CM_CHK_CMSTATUS_GOTOFINISH(AllocateGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernel));
3646         CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernelID));
3647         GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);
3648 
3649         CM_CHK_CMSTATUS_GOTOFINISH(AddGPUCopyKernel(gpuCopyKernelParam));
3650     }
3651 
3652 finish:
3653     if( hr != CM_SUCCESS)
3654     {
3655         CmSafeDelete(gpuCopyKernelParam);
3656     }
3657 
3658     return hr;
3659 }
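
// Rough lifecycle of a cached GPUCopy kernel, as used by the copy paths in
// this file (sketch only):
//
//     CM_GPUCOPY_KERNEL *param = nullptr;
//     CreateGPUCopyKernel(widthInByte, height, format, direction, param);
//     // param->kernel is locked here; build and enqueue the copy task ...
//     GPUCOPY_KERNEL_UNLOCK(param);  // mark it reusable for the next search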
3660 
3661 //*---------------------------------------------------------------------------------------------------------
3662 //| Name:       SearchGPUCopyKernel()
3663 //| Purpose:    Search if the required kernel exists
3664 //| Arguments:
3665 //|             widthInByte      [in]  surface's width in bytes
3666 //|             height           [in]  surface's height
3667 //|             format           [in]  surface's format
3668 //|             copyDirection    [in]  copy direction, cpu -> gpu or gpu -> cpu
3669 //|             gpuCopyKernelParam [out] kernel param
3670 //|
3671 //| Returns:    Result of the operation.
3672 //|
3673 //*---------------------------------------------------------------------------------------------------------
SearchGPUCopyKernel(uint32_t widthInByte,uint32_t height,CM_SURFACE_FORMAT format,CM_GPUCOPY_DIRECTION copyDirection,CM_GPUCOPY_KERNEL * & kernelParam)3674 int32_t CmQueueRT::SearchGPUCopyKernel(uint32_t widthInByte,
3675                                        uint32_t height,
3676                                        CM_SURFACE_FORMAT format,
3677                                        CM_GPUCOPY_DIRECTION copyDirection,
3678                                        CM_GPUCOPY_KERNEL* &kernelParam)
3679 {
3680     int32_t     hr = CM_SUCCESS;
3681     CM_GPUCOPY_KERNEL *gpucopyKernel = nullptr;
3682     CM_GPUCOPY_KERNEL_ID kernelTypeID = GPU_COPY_KERNEL_UNKNOWN;
3683 
3684     kernelParam = nullptr;
3685     CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, kernelTypeID));
3686 
3687     for(uint32_t index =0 ;  index< m_copyKernelParamArrayCount; index++)
3688     {
3689         gpucopyKernel = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement(index);
3690         if(gpucopyKernel != nullptr)
3691         {
3692             if(!gpucopyKernel->locked &&
3693                gpucopyKernel->kernelID == kernelTypeID)
3694             {
3695                 kernelParam = gpucopyKernel;
3696                 break;
3697             }
3698         }
3699     }
3700 
3701 finish:
3702     return hr;
3703 }
3704 
3705 //*---------------------------------------------------------------------------------------------------------
3706 //| Name:       AddGPUCopyKernel()
3707 //| Purpose:    Add new kernel into m_copyKernelParamArray
3708 //| Arguments:
3709 //|             kernelParam      [in]  kernel param to add to the cache;
3710 //|                                    the kernel must already be locked
3711 //|                                    (newly created kernels are locked
3712 //|                                    by CreateGPUCopyKernel)
3713 //|
3714 //|
3715 //| Returns:    Result of the operation.
3716 //|
3717 //*---------------------------------------------------------------------------------------------------------
AddGPUCopyKernel(CM_GPUCOPY_KERNEL * & kernelParam)3718 int32_t CmQueueRT::AddGPUCopyKernel(CM_GPUCOPY_KERNEL* &kernelParam)
3719 {
3720     int32_t hr = CM_SUCCESS;
3721     // critical section protection
3722     CLock locker(m_criticalSectionGPUCopyKrn);
3723 
3724     CM_CHK_NULL_GOTOFINISH(kernelParam, CM_INVALID_GPUCOPY_KERNEL);
3725 
3726     // the newly created kernel must be locked
3727     if(!kernelParam->locked)
3728     {
3729         CM_ASSERTMESSAGE("Error: The newly created kernel must be locked.")
3730         hr = CM_INVALID_GPUCOPY_KERNEL;
3731         goto finish;
3732     }
3733 
3734     m_copyKernelParamArray.SetElement(m_copyKernelParamArrayCount, kernelParam);
3735     m_copyKernelParamArrayCount ++;
3736 
3737 finish:
3738     return hr;
3739 }
3740 
3741 //*---------------------------------------------------------------------------------------------------------
3742 //| Name:       GetGPUCopyKrnID()
3743 //| Purpose:    Calculate the kernel ID according to the surface's width, height, format and copy direction
3744 //| Arguments:
3745 //|             widthInByte      [in]  surface's width in bytes
3746 //|             height           [in]  surface's height
3747 //|             format           [in]  surface's format
3748 //|             copyDirection    [in]  copy direction, cpu -> gpu or gpu -> cpu
3749 //|             kernelID         [out] kernel id
3750 //|
3751 //| Returns:    Result of the operation.
3752 //|
3753 //*---------------------------------------------------------------------------------------------------------
GetGPUCopyKrnID(uint32_t widthInByte,uint32_t height,CM_SURFACE_FORMAT format,CM_GPUCOPY_DIRECTION copyDirection,CM_GPUCOPY_KERNEL_ID & kernelID)3754 int32_t CmQueueRT::GetGPUCopyKrnID( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format,
3755             CM_GPUCOPY_DIRECTION copyDirection, CM_GPUCOPY_KERNEL_ID &kernelID )
3756 {
3757     int32_t hr = CM_SUCCESS;
3758 
3759     kernelID = GPU_COPY_KERNEL_UNKNOWN;
3760 
3761     if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
3762     {
3763         switch(copyDirection)
3764         {
3765             case CM_FASTCOPY_GPU2CPU:
3766                 if ((height & 0x7) || (widthInByte & 0x7f))
3767                 {
3768                     kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_NV12_ID;
3769                 }
3770                 else
3771                 {   // height 8-row aligned, widthInByte a 128-byte multiple
3772                     kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_NV12_ID;
3773                 }
3774                 break;
3775 
3776             case CM_FASTCOPY_CPU2GPU:
3777                 kernelID = GPU_COPY_KERNEL_CPU2GPU_NV12_ID;
3778                 break;
3779 
3780             case CM_FASTCOPY_GPU2GPU:
3781                 kernelID = GPU_COPY_KERNEL_GPU2GPU_NV12_ID;
3782                 break;
3783 
3784             case CM_FASTCOPY_CPU2CPU:
3785                 kernelID = GPU_COPY_KERNEL_CPU2CPU_ID;
3786                 break;
3787 
3788             default:
3789                 CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
3790                 hr = CM_FAILURE;
3791                 break;
3792         }
3793     }
3794     else
3795     {
3796         switch(copyDirection)
3797         {
3798             case CM_FASTCOPY_GPU2CPU:
3799                 if ((height & 0x7) || (widthInByte & 0x7f))
3800                 {
3801                     kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_ID;
3802                 }
3803                 else
3804                 {   // height 8-row aligned, widthInByte a 128-byte multiple
3805                     kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_ID;
3806                 }
3807                 break;
3808 
3809             case CM_FASTCOPY_CPU2GPU:
3810                 kernelID = GPU_COPY_KERNEL_CPU2GPU_ID;
3811                 break;
3812 
3813             case CM_FASTCOPY_GPU2GPU:
3814                 kernelID = GPU_COPY_KERNEL_GPU2GPU_ID;
3815                 break;
3816 
3817             case CM_FASTCOPY_CPU2CPU:
3818                 kernelID = GPU_COPY_KERNEL_CPU2CPU_ID;
3819                 break;
3820 
3821             default:
3822                 CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
3823                 hr = CM_FAILURE;
3824                 break;
3825         }
3826     }
3827 
3828     return hr;
3829 }
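
// Worked example of the masks above (values illustrative): height & 0x7
// tests 8-row alignment and widthInByte & 0x7f tests for a 128-byte
// multiple, so a 256-byte-wide, 16-row surface takes the aligned GPU->CPU
// kernel while a 100-byte-wide one falls back to the unaligned variant.
#if 0  // sketch only, excluded from the build
CM_GPUCOPY_KERNEL_ID id = GPU_COPY_KERNEL_UNKNOWN;
GetGPUCopyKrnID(256, 16, CM_SURFACE_FORMAT_A8R8G8B8, CM_FASTCOPY_GPU2CPU, id);
// id == GPU_COPY_KERNEL_GPU2CPU_ALIGNED_ID
GetGPUCopyKrnID(100, 16, CM_SURFACE_FORMAT_A8R8G8B8, CM_FASTCOPY_GPU2CPU, id);
// id == GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_ID
#endif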
3830 
3831 //*---------------------------------------------------------------------------------------------------------
3832 //| Name:       AllocateGPUCopyKernel()
3833 //| Purpose:    Allocate GPUCopy Kernel
3834 //| Arguments:
3835 //|             widthInByte      [in]  surface's width in bytes
3836 //|             height           [in]  surface's height
3837 //|             format           [in]  surface's format
3838 //|             copyDirection    [in]  copy direction (CPU -> GPU, GPU -> CPU, GPU -> GPU or CPU -> CPU)
3839 //|             kernel           [out] pointer to the created kernel
3840 //|
3841 //| Returns:    Result of the operation.
3842 //|
3843 //*---------------------------------------------------------------------------------------------------------
3844 int32_t CmQueueRT::AllocateGPUCopyKernel( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format,
3845             CM_GPUCOPY_DIRECTION copyDirection, CmKernel *&kernel )
3846 {
3847     int32_t          hr                 = CM_SUCCESS;
3848     CmProgram       *gpuCopyProgram    = nullptr;
3849 
3850     CM_CHK_CMSTATUS_GOTOFINISH(m_device->LoadPredefinedCopyKernel(gpuCopyProgram));
3851     CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram);
3852 
3853     if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
3854     {
3855         switch(copyDirection)
3856         {
3857             case CM_FASTCOPY_GPU2CPU:
3858                 if ((height & 0x7) || (widthInByte & 0x7f))
3859                 {
3860                     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_NV12_32x32), kernel, "PredefinedGPUCopyKernel"));
3861                 }
3862                 else
3863                 {   // height 8-row aligned, widthInByte a 128-byte multiple
3864                     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_NV12_aligned_32x32), kernel, "PredefinedGPUCopyKernel"));
3865                 }
3866                 break;
3867 
3868             case CM_FASTCOPY_CPU2GPU:
3869                 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_NV12_32x32), kernel, "PredefinedGPUCopyKernel"));
3870                 break;
3871 
3872             case CM_FASTCOPY_GPU2GPU:
3873                 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_NV12_32x32), kernel, "PredefinedGPUCopyKernel"));
3874                 break;
3875 
3876             case CM_FASTCOPY_CPU2CPU:
3877                 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
3878                 break;
3879 
3880             default:
3881                 CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
3882                 hr = CM_FAILURE;
3883                 break;
3884         }
3885     }
3886     else
3887     {
3888         switch(copyDirection)
3889         {
3890             case CM_FASTCOPY_GPU2CPU:
3891                 if ((height & 0x7) || (widthInByte & 0x7f))
3892                 {
3893                     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_32x32), kernel, "PredefinedGPUCopyKernel"));
3894                 }
3895                 else
3896                 {   // height 8-row aligned, widthInByte a 128-byte multiple
3897                     CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_aligned_32x32), kernel, "PredefinedGPUCopyKernel"));
3898                 }
3899                 break;
3900 
3901             case CM_FASTCOPY_CPU2GPU:
3902                 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_32x32), kernel, "PredefinedGPUCopyKernel"));
3903                 break;
3904 
3905             case CM_FASTCOPY_GPU2GPU:
3906                 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_32x32), kernel, "PredefinedGPUCopyKernel"));
3907                 break;
3908 
3909             case CM_FASTCOPY_CPU2CPU:
3910                 CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
3911                 break;
3912 
3913             default:
3914                 CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
3915                 hr = CM_FAILURE;
3916                 break;
3917         }
3918     }
3919 
3920 finish:
3921     return hr;
3922 }
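
// Note that both format branches above bind SurfaceCopy_BufferToBuffer_4k
// for CM_FASTCOPY_CPU2CPU: a buffer-to-buffer copy is linear, so the 2D
// surface format does not affect the kernel choice.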
3923 
3924 CM_RT_API int32_t CmQueueRT::EnqueueFast(CmTask *task,
3925                                          CmEvent* &event,
3926                                          const CmThreadSpace *threadSpace)
3927 {
3928     CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
3929     int32_t result = CM_SUCCESS;
3930     if (state == nullptr)
3931     {
3932         result = CM_NULL_POINTER;
3933     }
3934     else if (state->advExecutor == nullptr ||
3935              state->advExecutor->SwitchToFastPath(task) == false)
3936     {
3937         return Enqueue(task, event, threadSpace);
3938     }
3939     else
3940     {
3941         auto gpu_context_name
3942                 = static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext);
3943         // Selects the correct GPU context.
3944         uint32_t old_stream_idx = state->pfnSetGpuContext(state, gpu_context_name,
3945                                                           m_streamIndex,
3946                                                           m_gpuContextHandle);
3947         const CmThreadSpaceRT *threadSpaceRTConst
3948                 = static_cast<const CmThreadSpaceRT*>(threadSpace);
3949         if (state->cmHalInterface->CheckMediaModeAvailability() == false)
3950         {
3951             if (threadSpaceRTConst != nullptr)
3952             {
3953                 result = state->advExecutor->SubmitComputeTask(
3954                     this, task, event, threadSpaceRTConst->GetThreadGroupSpace(),
3955                     gpu_context_name);
3956             }
3957             else
3958             {
3959                 result = state->advExecutor->SubmitComputeTask(this, task, event,
3960                                                                nullptr,
3961                                                                gpu_context_name);
3962             }
3963         }
3964         else
3965         {
3966             result = state->advExecutor->SubmitTask(this, task, event, threadSpace,
3967                                                     gpu_context_name);
3968         }
3969         state->osInterface->streamIndex = old_stream_idx;
3970     }
3971     return result;
3972 }
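
// Illustrative caller-side usage (sketch; error handling elided and
// WaitForTaskFinished's default timeout assumed): because EnqueueFast()
// falls back to Enqueue() whenever the fast path is unavailable, callers
// can treat it as a drop-in replacement.
#if 0  // sketch only, excluded from the build
CmEvent *event = nullptr;
queue->EnqueueFast(task, event, threadSpace);
event->WaitForTaskFinished();
queue->DestroyEventFast(event);
#endif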
3973 
3974 CM_RT_API int32_t CmQueueRT::DestroyEventFast(CmEvent *&event)
3975 {
3976     CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
3977 
3978     if (state == nullptr)
3979     {
3980         return CM_NULL_POINTER;
3981     }
3982     else if (state->advExecutor == nullptr)
3983     {
3984         return DestroyEvent(event);
3985     }
3986     else
3987     {
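        // The advanced executor's interface declares this method as
        // "DestoryEvent" (sic); the call below matches that declaration.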
3988         return state->advExecutor->DestoryEvent(this, event);
3989     }
3990 }
3991 
3992 CM_RT_API int32_t
3993 CmQueueRT::EnqueueWithGroupFast(CmTask *task,
3994                                 CmEvent* &event,
3995                                 const CmThreadGroupSpace *threadGroupSpace)
3996 {
3997     CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
3998     int32_t result = CM_SUCCESS;
3999     if (state == nullptr)
4000     {
4001         return CM_NULL_POINTER;
4002     }
4003     else if (state->advExecutor == nullptr ||
4004              state->advExecutor->SwitchToFastPath(task) == false)
4005     {
4006         return EnqueueWithGroup(task, event, threadGroupSpace);
4007     }
4008 
4009     auto gpu_context_name = static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext);
4010     // Selects the correct GPU context.
4011     uint32_t old_stream_idx = state->pfnSetGpuContext(state, gpu_context_name,
4012                                                       m_streamIndex,
4013                                                       m_gpuContextHandle);
4014     if (state->cmHalInterface->CheckMediaModeAvailability())
4015     {
4016         result = state->advExecutor->SubmitGpgpuTask(this, task, event,
4017                                                      threadGroupSpace,
4018                                                      gpu_context_name);
4019     }
4020     else
4021     {
4022         result = state->advExecutor->SubmitComputeTask(this, task, event,
4023                                                        threadGroupSpace,
4024                                                        gpu_context_name);
4025     }
4026     state->osInterface->streamIndex = old_stream_idx;
4027     return result;
4028 }
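
// Illustrative GPGPU-style usage (sketch; the 8x8 thread and 4x4 group
// dimensions are arbitrary): a thread group space built through the
// CmDevice API is submitted with the same fallback behavior as EnqueueFast().
#if 0  // sketch only, excluded from the build
CmThreadGroupSpace *tgs = nullptr;
device->CreateThreadGroupSpace(8, 8, 4, 4, tgs);
CmEvent *event = nullptr;
queue->EnqueueWithGroupFast(task, event, tgs);
#endif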
4029 
4030 int32_t CmQueueRT::GetOSSyncEventHandle(void *& hOSSyncEvent)
4031 {
4032     hOSSyncEvent = m_osSyncEvent;
4033     return CM_SUCCESS;
4034 }
4035 
4036 
4037 int32_t CmQueueRT::RegisterSyncEvent()
4038 {
4039     CM_RETURN_CODE  hr = CM_SUCCESS;
4040 
4041     CM_HAL_OSSYNC_PARAM syncParam;
4042     void *syncEventHandle = nullptr;
4043     syncParam.osSyncEvent = syncEventHandle;
4044 
4045     PCM_CONTEXT_DATA  cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
4046     PCM_HAL_STATE  cmHalState = cmData->cmHalState;
4047     // Call the HAL layer to register an OS event handle used for event-driven waits on task completion
4048     CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->pfnRegisterUMDNotifyEventHandle(cmHalState, &syncParam));
4049 
4050     m_osSyncEvent = syncParam.osSyncEvent;
4051 
4052 finish:
4053     return hr;
4054 }
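
// Sketch of the intended pairing (illustrative): RegisterSyncEvent() asks
// the HAL for an OS notification handle, after which GetOSSyncEventHandle()
// exposes the cached handle for event-driven waits.
#if 0  // sketch only, excluded from the build
RegisterSyncEvent();
void *osEvent = nullptr;
GetOSSyncEventHandle(osEvent);
#endif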
4055 
4056 MOS_STATUS CmQueueRT::CreateGpuContext(CM_HAL_STATE *halState,
4057                                        MOS_GPU_CONTEXT gpuContextName,
4058                                        MOS_GPU_NODE gpuNode,
4059                                        MOS_GPUCTX_CREATOPTIONS *createOptions)
4060 {
4061     uint32_t old_stream_idx = 0;
4062     MOS_STATUS status = MOS_STATUS_UNKNOWN;
4063     if (MOS_GPU_CONTEXT_CM_COMPUTE == gpuContextName)
4064     {
4065         m_streamIndex = halState->pfnRegisterStream(halState);
4066         old_stream_idx = halState->osInterface->streamIndex;
4067         halState->osInterface->streamIndex = m_streamIndex;
4068         m_gpuContextHandle = halState->pfnCreateGpuComputeContext(halState,
4069                                                                   createOptions);
4070         if (MOS_GPU_CONTEXT_INVALID_HANDLE != m_gpuContextHandle)
4071         {
4072             status = MOS_STATUS_SUCCESS;
4073             CreateSyncBuffer(halState);
4074         }
4075     }
4076     else
4077     {  // As there is only one render context, the original stream index will be used.
4078         old_stream_idx = m_streamIndex = halState->osInterface->streamIndex;
4079         status = halState->pfnCreateGPUContext(halState, gpuContextName, gpuNode,
4080                                                createOptions);
4081     }
4082     halState->osInterface->streamIndex = old_stream_idx;
4083     return status;
4084 }
4085 
4086 MOS_STATUS CmQueueRT::DestroyComputeGpuContext()
4087 {
4088     MOS_STATUS          status      = MOS_STATUS_SUCCESS;
4089     PCM_CONTEXT_DATA    cmCtxData   = nullptr;
4090     PCM_HAL_STATE       cmHalState  = nullptr;
4091 
4092     if (MOS_GPU_CONTEXT_INVALID_HANDLE == m_gpuContextHandle)
4093     {
4094         return MOS_STATUS_SUCCESS;
4095     }
4096 
4097     cmCtxData   = (PCM_CONTEXT_DATA)m_device->GetAccelData();
4098     if(!cmCtxData || !cmCtxData->cmHalState || !cmCtxData->cmHalState->osInterface)
4099     {
4100         return MOS_STATUS_INVALID_PARAMETER;
4101     }
4102 
4103     cmHalState = cmCtxData->cmHalState;
4104 
4105     status =  cmHalState->osInterface->pfnDestroyGpuComputeContext(cmHalState->osInterface, m_gpuContextHandle);
4106 
4107     return status;
4108 }
4109 
4110 MOS_STATUS CmQueueRT::ExecuteGroupTask(CM_HAL_STATE *halState,
4111                                        CM_HAL_EXEC_TASK_GROUP_PARAM *taskParam,
4112                                        MOS_GPU_CONTEXT gpuContextName)
4113 {
4114     uint32_t old_stream_idx = halState->pfnSetGpuContext(halState, gpuContextName,
4115                                                          m_streamIndex,
4116                                                          m_gpuContextHandle);
4117     if (INVALID_STREAM_INDEX == old_stream_idx)
4118     {
4119         return MOS_STATUS_UNKNOWN;
4120     }
4121     RegisterSyncEvent();
4122     CM_CHK_MOSSTATUS_RETURN(SelectSyncBuffer(halState));
4123     MOS_STATUS result = halState->pfnExecuteGroupTask(halState, taskParam);
4124     halState->osInterface->streamIndex = old_stream_idx;
4125     return result;
4126 }
4127 
4128 MOS_STATUS CmQueueRT::ExecuteGeneralTask(CM_HAL_STATE *halState,
4129                                          CM_HAL_EXEC_TASK_PARAM *taskParam,
4130                                          MOS_GPU_CONTEXT gpuContextName)
4131 {
4132     uint32_t old_stream_idx = halState->pfnSetGpuContext(halState, gpuContextName,
4133                                                          m_streamIndex,
4134                                                          m_gpuContextHandle);
4135     if (INVALID_STREAM_INDEX == old_stream_idx)
4136     {
4137         return MOS_STATUS_UNKNOWN;
4138     }
4139     RegisterSyncEvent();
4140     MOS_STATUS result = halState->pfnExecuteTask(halState, taskParam);
4141     halState->osInterface->streamIndex = old_stream_idx;
4142     return result;
4143 }
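
// CreateGpuContext() and the two Execute*Task() methods above share one
// save/switch/restore discipline for the per-queue stream index; condensed
// sketch (illustrative only):
#if 0  // sketch only, excluded from the build
uint32_t oldStreamIdx = halState->osInterface->streamIndex;  // save
halState->osInterface->streamIndex = m_streamIndex;          // switch to this queue's stream
// ... submit work on this queue's GPU context ...
halState->osInterface->streamIndex = oldStreamIdx;           // restore
#endif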
4144 
4145 #if CM_LOG_ON
4146 CM_HAL_STATE* CmQueueRT::GetHalState() { return m_device->GetHalState(); }
4147 #endif  // #if CM_LOG_ON
4148 }  // namespace CMRT_UMD
4149